In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('lr_example').getOrCreate()

In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import corr
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [5]:
# Use Spark to read in the Ecommerce Customers csv file.
data = spark.read.csv('flights.csv', inferSchema = True, header = True)

In [6]:
# Import the required function
from pyspark.sql.functions import round

In [7]:
# Convert 'mile' to 'km'
data = data.withColumn('km', round(data.mile * 1.60934, 0))

In [8]:
# Create 'label' column indicating whether flight delayed (1) or not (0)
data = data.withColumn('label', (data.delay >= 15).cast('integer'))

In [9]:
data.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|    km|label|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|3465.0| null|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8| 542.0|    0|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
only showing top 3 rows



In [10]:
from pyspark.ml.feature import StringIndexer

In [11]:
# Create an indexer
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(data)

# Indexer creates a new column with a numeric index values
data_indexed = indexer_model.transform(data)

# Repeat the process for the other categorical feature
data_indexed = StringIndexer(inputCol='org', outputCol='org_idx').fit(data_indexed).transform(data_indexed)

In [12]:
data_indexed.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+------+-----+-----------+-------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|    km|label|carrier_idx|org_idx|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+-----------+-------+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|3465.0| null|        6.0|    2.0|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30| 509.0|    1|        0.0|    0.0|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8| 542.0|    0|        0.0|    1.0|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+-----------+-------+
only showing top 3 rows



In [13]:
# Create an assembler object
assembler = VectorAssembler(
    inputCols = ['mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart', 'duration'],
    outputCol = 'features') # input

In [14]:
data_pre = assembler.transform(data_indexed)

In [15]:
data_pre.show(3, False)

+---+---+---+-------+------+---+----+------+--------+-----+------+-----+-----------+-------+-----------------------------------------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|km    |label|carrier_idx|org_idx|features                                 |
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+-----------+-------+-----------------------------------------+
|11 |20 |6  |US     |19    |JFK|2153|9.48  |351     |NA   |3465.0|null |6.0        |2.0    |[11.0,20.0,6.0,6.0,2.0,3465.0,9.48,351.0]|
|0  |22 |2  |UA     |1107  |ORD|316 |16.33 |82      |30   |509.0 |1    |0.0        |0.0    |[0.0,22.0,2.0,0.0,0.0,509.0,16.33,82.0]  |
|2  |20 |4  |UA     |226   |SFO|337 |6.17  |82      |-8   |542.0 |0    |0.0        |1.0    |[2.0,20.0,4.0,0.0,1.0,542.0,6.17,82.0]   |
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+-----------+-------+-----------------------------------------+
only showing top 3 rows



In [16]:
final_data = data_pre.select('features', 'label')
final_data.count()

50000

In [17]:
final_data = final_data.na.drop()
final_data.count()

47022

In [18]:
final_data.show(5, False)

+-----------------------------------------+-----+
|features                                 |label|
+-----------------------------------------+-----+
|[0.0,22.0,2.0,0.0,0.0,509.0,16.33,82.0]  |1    |
|[2.0,20.0,4.0,0.0,1.0,542.0,6.17,82.0]   |0    |
|[9.0,13.0,1.0,1.0,0.0,1989.0,10.33,195.0]|0    |
|[5.0,2.0,1.0,0.0,1.0,885.0,7.98,102.0]   |0    |
|[7.0,2.0,6.0,1.0,0.0,1180.0,10.83,135.0] |1    |
+-----------------------------------------+-----+
only showing top 5 rows



In [19]:
train_data, test_data = final_data.randomSplit([0.8, 0.2])

In [20]:
# Create a Logistic Regression Model object
logistic = LogisticRegression(featuresCol='features',
                      labelCol='label',
                      predictionCol='prediction')

In [21]:
# Fit the model to the data and call this model logisticModel
logisticModel = logistic.fit(train_data,)

In [22]:
# Create a predictions for the testing data and show confusion matrix
test_model = logisticModel.transform(test_data)
test_model.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1698|
|    0|       0.0| 2625|
|    1|       1.0| 3141|
|    0|       1.0| 1980|
+-----+----------+-----+



In [23]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [24]:
# Calculate the elements of the confusion matrix
TN = test_model.filter('prediction = 0 and label = prediction').count()
TP = test_model.filter('prediction = 1 and label = prediction').count()
FN = test_model.filter('prediction = 0 and label != prediction').count()
FP = test_model.filter('prediction = 1 and label != prediction').count()

In [25]:
# Calculate precision and recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('precision = {:.2f}\nrecall   = {:.2}'.format(precision, recall))

precision = 0.61
recall   = 0.65


In [26]:
print('Acc: ', (TN + TP) / (TN+TP+FN+FP))

Acc:  0.6105463786531131


In [27]:
# Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision = multi_evaluator.evaluate(test_model,
                                              {multi_evaluator.metricName: "weightedPrecision"})

In [28]:
weighted_precision

0.6103630501071541

In [29]:
# Find AUC
binary_evaluator = BinaryClassificationEvaluator()
auc = binary_evaluator.evaluate(test_model,
                                {binary_evaluator.metricName: 'areaUnderROC'})

In [30]:
auc

0.6475066523153014

In [31]:
# Save model
logisticModel.save('logisticModel_Flights_50k')

In [32]:
from pyspark.ml.classification import LogisticRegressionModel
# Load model from
logisticModel2 = LogisticRegressionModel.load('logisticModel_Flights_50k')

In [33]:
# Predict new values (Assuming select test_data)
unlabeled_data = test_data.select('features')

In [34]:
predictions = logisticModel2.transform(unlabeled_data)

In [35]:
predictions.show(2, False)

+-------------------------------------+----------------------------------------+----------------------------------------+----------+
|features                             |rawPrediction                           |probability                             |prediction|
+-------------------------------------+----------------------------------------+----------------------------------------+----------+
|(8,[1,5,6,7],[6.0,378.0,21.33,69.0]) |[-1.0700505098348914,1.0700505098348914]|[0.25539347871726,0.7446065212827401]   |1.0       |
|(8,[1,5,6,7],[6.0,1291.0,20.0,148.0])|[-1.1526114042830131,1.1526114042830131]|[0.24001242269150957,0.7599875773084904]|1.0       |
+-------------------------------------+----------------------------------------+----------------------------------------+----------+
only showing top 2 rows

