# Demo Pipeline Linear Regression

### Dataset: flights.csv
- You'll build a regression model to predict flight duration 
- With dow, org, mile as a predictor

First thing to do is start a Spark Session

In [1]:
import findspark
findspark.init()

In [2]:
import pyspark

In [3]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [4]:
spark = SparkSession.builder.appName('lr_demo').getOrCreate()

In [5]:
# Use Spark to read in the Ecommerce Customers csv file.
data = spark.read.csv("../../Data/flights.csv",inferSchema=True,header=True)

In [6]:
# Print the Schema of the DataFrame
data.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: string (nullable = true)



In [7]:
data.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



In [8]:
data.head()

Row(mon=11, dom=20, dow=6, carrier='US', flight=19, org='JFK', mile=2153, depart=9.48, duration=351, delay='NA')

In [9]:
# for item in data.head():
#     print(item)

In [10]:
data.count()

50000

In [11]:
# Remove the 'flight' column
data = data.drop('flight')

In [12]:
# Number of records with missing 'delay' values
data.filter('delay IS NULL').count()

0

In [13]:
# Remove records with missing 'delay' values
data = data.filter('delay IS NOT NULL')

In [14]:
# Remove records with missing values in any column and get the number of remaining rows
data = data.na.drop()
data.count()

50000

In [15]:
# Import the required function
from pyspark.sql.functions import round

In [16]:
# Convert 'mile' to 'km' and drop 'mile' column
data = data.withColumn('km', round(data.mile * 1.60934, 0))

In [17]:
# Create 'label' column indicating whether flight delayed (1) or not (0)
data = data.withColumn('label', (data.delay >= 15).cast('integer'))
# Check first five records
data.show(5)

+---+---+---+-------+---+----+------+--------+-----+------+-----+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|    km|label|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
| 11| 20|  6|     US|JFK|2153|  9.48|     351|   NA|3465.0| null|
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8| 542.0|    0|
|  9| 13|  1|     AA|ORD|1236| 10.33|     195|   -5|1989.0|    0|
|  4|  2|  5|     AA|ORD| 258|  8.92|      65|   NA| 415.0| null|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
only showing top 5 rows



In [18]:
final_data = data
final_data.count()

final_data = final_data.na.drop()
final_data.count()

final_data.show(5)

+---+---+---+-------+---+----+------+--------+-----+------+-----+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|    km|label|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8| 542.0|    0|
|  9| 13|  1|     AA|ORD|1236| 10.33|     195|   -5|1989.0|    0|
|  5|  2|  1|     UA|SFO| 550|  7.98|     102|    2| 885.0|    0|
|  7|  2|  6|     AA|ORD| 733| 10.83|     135|   54|1180.0|    1|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
only showing top 5 rows



# Thực hiện Pipeline

In [19]:
train_data, test_data = final_data.randomSplit([0.8, 0.2])

In [20]:
# Import class for creating a pipeline
from pyspark.ml import Pipeline

In [21]:
# Convert categorical strings to index values
indexer = StringIndexer(inputCol='org', outputCol='org_idx')
# One-hot encode index values
onehot = OneHotEncoder(inputCols=['org_idx', 'dow'], outputCols=['org_dummy', 'dow_dummy'])
# Assemble predictors into a single column
assembler = VectorAssembler(inputCols=['km', 'org_dummy', 'dow_dummy'], outputCol='features')
# A linear regression object
regression = LinearRegression(featuresCol='features', labelCol='duration', predictionCol='prediction')

In [22]:
# Construct a pipeLine
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])
# Train the pipeline on the training data
pipeline = pipeline.fit(test_data)

### Đánh giá kết quả

In [23]:
# Make predictions on the testing data
predictions = pipeline.transform(test_data)

In [24]:
# Inspect results
predictions.select("prediction", "duration").show(5)

+------------------+--------+
|        prediction|duration|
+------------------+--------+
| 92.06544884577296|      85|
|131.49858178771746|     135|
|131.49858178771746|     130|
|164.47091893691396|     170|
| 76.54487590652394|      80|
+------------------+--------+
only showing top 5 rows



In [25]:
from pyspark.ml.evaluation import RegressionEvaluator

In [26]:
RegressionEvaluator(labelCol='duration').evaluate(predictions)

11.102217054757931

### Lưu và load model

In [27]:
# Save pipiline model
pipeline.save("Pipeline_flight_50k")

In [29]:
# Load pipeline model
from pyspark.ml import PipelineModel

In [30]:
pipeline2 = PipelineModel.load('Pipeline_flight_50k')

### Dự đoán mẫu mới

In [31]:
# Predict new values (Assuming select test_data)
unlabeled_data = test_data.drop('label')

In [35]:
# Make predictions on the new data
predictions2 = pipeline2.transform(unlabeled_data)

In [37]:
# Inspect results
predictions2.select('features', 'prediction').show(5)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|(14,[0,1,10],[649...| 92.06544884577296|
|(14,[0,1,10],[118...|131.49858178771746|
|(14,[0,1,10],[118...|131.49858178771746|
|(14,[0,1,10],[162...|164.47091893691396|
|(14,[0,2,10],[542...| 76.54487590652394|
+--------------------+------------------+
only showing top 5 rows

