In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import round
# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import corr

In [3]:
sc = SparkContext()
spark = SparkSession(sc)

### Chuẩn bị và chuẩn hóa dữ liệu, xác định inputs, output

In [4]:
# Use Spark to read in the flights csv file
data = spark.read.csv(["../../Data/flights.csv"], header=True, inferSchema=True)

In [5]:
# Print the Schema of the DataFrame
data.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: string (nullable = true)



In [6]:
data.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



In [7]:
# Convert 'mile' to 'km'
data = data.withColumn('km', round(data.mile * 1.60934, 0))

In [8]:
# Create 'label' column indicating whether flight delayed (1) or not (0)
data = data.withColumn('label', (data.delay >=15).cast('integer'))

In [9]:
data.columns

['mon',
 'dom',
 'dow',
 'carrier',
 'flight',
 'org',
 'mile',
 'depart',
 'duration',
 'delay',
 'km',
 'label']

In [10]:
data.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|    km|label|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|3465.0| null|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8| 542.0|    0|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
only showing top 3 rows



In [11]:
from pyspark.ml.feature import StringIndexer

In [12]:
# Create an indexer
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')
# Indexer identifies categories in the data
indexer_model = indexer.fit(data)
# Indexer creates a new column with numeric index values
data_indexed = indexer_model.transform(data)
# Repeat the process for the other categorical feature
data_indexed = StringIndexer(inputCol='org', outputCol='org_idx').fit(data_indexed).transform(data_indexed)

In [13]:
# Onehot for carrier and org

In [14]:
data_indexed.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+------+-----+-----------+-------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|    km|label|carrier_idx|org_idx|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+-----------+-------+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|3465.0| null|        6.0|    2.0|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30| 509.0|    1|        0.0|    0.0|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8| 542.0|    0|        0.0|    1.0|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+-----------+-------+
only showing top 3 rows



In [15]:
assembler = VectorAssembler(inputCols=["mon", "dom", "dow", "carrier_idx", "km", "depart", "duration"],
                            outputCol="features")

In [16]:
data_pre = assembler.transform(data_indexed)

In [17]:
data_pre.show(2, False)

+---+---+---+-------+------+---+----+------+--------+-----+------+-----+-----------+-------+-------------------------------------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|km    |label|carrier_idx|org_idx|features                             |
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+-----------+-------+-------------------------------------+
|11 |20 |6  |US     |19    |JFK|2153|9.48  |351     |NA   |3465.0|null |6.0        |2.0    |[11.0,20.0,6.0,6.0,3465.0,9.48,351.0]|
|0  |22 |2  |UA     |1107  |ORD|316 |16.33 |82      |30   |509.0 |1    |0.0        |0.0    |[0.0,22.0,2.0,0.0,509.0,16.33,82.0]  |
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+-----------+-------+-------------------------------------+
only showing top 2 rows



In [18]:
data_unknown_delay = data_pre[data_pre['delay']=='NA']

In [19]:
data_unknown_delay.count()

2978

In [20]:
data_known_delay = data_pre[data_pre['delay']!='NA']

In [21]:
data_known_delay.count()

47022

In [22]:
final_data = data_known_delay.select('features', 'label')
final_data.count()

47022

In [23]:
final_data.na.drop()
final_data.count()

47022

In [24]:
final_data.show(3, truncate=False)

+-------------------------------------+-----+
|features                             |label|
+-------------------------------------+-----+
|[0.0,22.0,2.0,0.0,509.0,16.33,82.0]  |1    |
|[2.0,20.0,4.0,0.0,542.0,6.17,82.0]   |0    |
|[9.0,13.0,1.0,1.0,1989.0,10.33,195.0]|0    |
+-------------------------------------+-----+
only showing top 3 rows



### Chuẩn bị train/test dataset

In [25]:
train_data, test_data = final_data.randomSplit([0.8, 0.2])

In [26]:
train_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|              37626|
|   mean| 0.5100993993515123|
| stddev|0.49990463484559083|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



In [27]:
test_data.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|              9396|
|   mean|0.5161770966368667|
| stddev|0.4997648282824691|
|    min|                 0|
|    max|                 1|
+-------+------------------+



### Xây dựng model với train dataset

In [28]:
# Create a Logistic Regression Model object
logistic = LogisticRegression(featuresCol='features',
                      labelCol='label',
                      predictionCol='prediction')

In [29]:
# Fit the model to the data and call this model lrModel
logisticModel = logistic.fit(train_data)

In [30]:
# Create predictions for the testing data and show confutsion matrix
test_model = logisticModel.transform(test_data)
test_model.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1844|
|    0|       0.0| 2534|
|    1|       1.0| 3006|
|    0|       1.0| 2012|
+-----+----------+-----+



### Đánh giá model với test dataset

In [31]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [32]:
# Calculate the elements of the confusion matrix
TN = test_model.filter('prediction = 0 AND label = prediction').count()
TP = test_model.filter('prediction = 1 AND label = prediction').count()
FN = test_model.filter('prediction = 0 AND label != prediction').count()
FP = test_model.filter('prediction = 1 AND label != prediction').count()

In [33]:
acc = (TP + TN) / (TP + TN + FP + FN)
# Calculate precision and recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('precision = {:.2f}\nrecall = {:.2f}\nacc={:.2f}'.format(precision, recall, acc))

precision = 0.60
recall = 0.62
acc=0.59


In [34]:
# Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision = multi_evaluator.evaluate(test_model,
                                              {multi_evaluator.metricName:'weightedPrecision'})


In [35]:
weighted_precision

0.5892507049123914

In [36]:
# Find AUC
binary_evaluator = BinaryClassificationEvaluator()
auc = binary_evaluator.evaluate(test_model, {binary_evaluator.metricName: "areaUnderROC"})

In [37]:
auc

0.6271550383026208

### Lưu trữ và tải model

In [38]:
# Save Model
logisticModel.save('logisticModel_Flights_50k')

In [39]:
from pyspark.ml.classification import LogisticRegressionModel
# Load model from
logisticModel2 = LogisticRegressionModel.load('logisticModel_Flights_50k')

### Dự đoán dữ liệu mới

In [40]:
# Predict new values (Assuming select test_data)
unlabeled_data = test_data.select('features')

In [41]:
predictions = logisticModel2.transform(unlabeled_data)

In [42]:
predictions.show(5)

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|[0.0,1.0,2.0,0.0,...|[-0.3600446722001...|[0.41094875210439...|       1.0|
|[0.0,1.0,2.0,0.0,...|[-0.3696260500817...|[0.40863138414061...|       1.0|
|[0.0,1.0,2.0,0.0,...|[-0.4674880060336...|[0.38521097095670...|       1.0|
|[0.0,1.0,2.0,0.0,...|[-0.9672704855702...|[0.27542488650185...|       1.0|
|[0.0,1.0,2.0,0.0,...|[-0.3421460501203...|[0.41528827020268...|       1.0|
+--------------------+--------------------+--------------------+----------+
only showing top 5 rows

