In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Linear Regression")
    .getOrCreate()
)

In [3]:

# Explicit schema for the flights CSV: six integer columns plus the carrier code.
# Every field is declared with nullable=False (third StructField argument).
flightschema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in [
        ("DayofMonth", IntegerType()),
        ("DayOfWeek", IntegerType()),
        ("Carrier", StringType()),
        ("OriginAirportID", IntegerType()),
        ("DestAirportID", IntegerType()),
        ("DepDelay", IntegerType()),
        ("ArrDelay", IntegerType()),
    ]
])

In [23]:
# Location of the flights CSV. Set the FLIGHTS_CSV environment variable to point
# at the file on another machine; without it the original local path is used.
flights_csv_path = os.environ.get(
    "FLIGHTS_CSV",
    "C:/Users/User/Desktop/SparkFolder/Data/flights.csv",
)
# Read with the explicit schema; header=True treats the first line as a header, not data.
df = spark.read.csv(flights_csv_path, schema=flightschema, header=True)

In [24]:
# Preview the first three rows of the loaded data.
df.show(n=3)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 3 rows



# Select Important Features

In [25]:
# List the column names of the loaded DataFrame.
df.columns

['DayofMonth',
 'DayOfWeek',
 'Carrier',
 'OriginAirportID',
 'DestAirportID',
 'DepDelay',
 'ArrDelay']

In [26]:
# Columns kept for modelling: every column except the string-typed Carrier.
important_cols = [
    "DayofMonth",
    "DayOfWeek",
    "OriginAirportID",
    "DestAirportID",
    "DepDelay",
    "ArrDelay",
]


In [27]:
# Keep only the selected columns.
data = df.select(*important_cols)

In [28]:
# Preview the reduced DataFrame.
data.show()

+----------+---------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+---------------+-------------+--------+--------+
|        19|        5|          11433|        13303|      -3|       1|
|        19|        5|          14869|        12478|       0|      -8|
|        19|        5|          14057|        14869|      -4|     -15|
|        19|        5|          15016|        11433|      28|      24|
|        19|        5|          11193|        12892|      -6|     -11|
|        19|        5|          10397|        15016|      -1|     -19|
|        19|        5|          15016|        10397|       0|      -1|
|        19|        5|          10397|        14869|      15|      24|
|        19|        5|          10397|        10423|      33|      34|
|        19|        5|          11278|        10397|     323|     322|
|        19|        5|          14107|        13487|      -7|     -13|
|     

# Preparing Training Data

In [29]:
# Define the assembler: pack the five predictor columns into a single
# vector column named "features". ArrDelay is left out because it is the target.
assembler = VectorAssembler(
    inputCols=[
        "DayofMonth",
        "DayOfWeek",
        "OriginAirportID",
        "DestAirportID",
        "DepDelay",
    ],
    outputCol="features",
)

In [30]:
# Transform the data: apply the assembler, which adds the "features" vector column.
traindata_tran=assembler.transform(data)

In [31]:
# Preview the assembled data; the vector column is abbreviated in this default view.
traindata_tran.show()

+----------+---------+---------------+-------------+--------+--------+--------------------+
|DayofMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|ArrDelay|            features|
+----------+---------+---------------+-------------+--------+--------+--------------------+
|        19|        5|          11433|        13303|      -3|       1|[19.0,5.0,11433.0...|
|        19|        5|          14869|        12478|       0|      -8|[19.0,5.0,14869.0...|
|        19|        5|          14057|        14869|      -4|     -15|[19.0,5.0,14057.0...|
|        19|        5|          15016|        11433|      28|      24|[19.0,5.0,15016.0...|
|        19|        5|          11193|        12892|      -6|     -11|[19.0,5.0,11193.0...|
|        19|        5|          10397|        15016|      -1|     -19|[19.0,5.0,10397.0...|
|        19|        5|          15016|        10397|       0|      -1|[19.0,5.0,15016.0...|
|        19|        5|          10397|        14869|      15|      24|[19.0,5.0,

In [32]:
# Inspect the assembled vector of the first row: five features in one vector.
traindata_tran.head(1)[0]["features"]

DenseVector([19.0, 5.0, 11433.0, 13303.0, -3.0])

In [33]:
# Two-column frame: ArrDelay cast to integer and renamed "label", plus "features".
final_train_data = traindata_tran.select(
    traindata_tran.ArrDelay.cast("Int").alias("label"),
    "features",
)

In [34]:
# truncate=False prints each feature vector in full instead of abbreviating it.
final_train_data.show(n=3, truncate=False)

+-----+-------------------------------+
|label|features                       |
+-----+-------------------------------+
|1    |[19.0,5.0,11433.0,13303.0,-3.0]|
|-8   |[19.0,5.0,14869.0,12478.0,0.0] |
|-15  |[19.0,5.0,14057.0,14869.0,-4.0]|
+-----+-------------------------------+
only showing top 3 rows



In [35]:
# Total number of rows available for modelling.
final_train_data.count()

2702218

# Divide Data into Training and Testing Data

In [36]:
# Keep the target (ArrDelay) and the assembled feature vector for the split.
df1 = traindata_tran.select("ArrDelay", "features")

In [37]:
# Preview the first three rows of the target/features frame.
df1.show(n=3)

+--------+--------------------+
|ArrDelay|            features|
+--------+--------------------+
|       1|[19.0,5.0,11433.0...|
|      -8|[19.0,5.0,14869.0...|
|     -15|[19.0,5.0,14057.0...|
+--------+--------------------+
only showing top 3 rows



In [38]:
# 70/30 train/test split. The fixed seed makes the split, and every metric
# computed from it, reproducible across kernel restarts and re-runs.
train_data, test_data = df1.randomSplit([0.7, 0.3], seed=42)

In [40]:
# Preview three training rows with the feature vectors printed in full.
train_data.show(n=3, truncate=False)

+--------+--------------------------------+
|ArrDelay|features                        |
+--------+--------------------------------+
|-94     |[2.0,4.0,14307.0,12264.0,-5.0]  |
|-86     |[7.0,2.0,11618.0,14771.0,-14.0] |
|-79     |[20.0,1.0,12478.0,14771.0,-16.0]|
+--------+--------------------------------+
only showing top 3 rows



In [41]:
# Preview three test rows with the feature vectors printed in full.
test_data.show(n=3, truncate=False)

+--------+-------------------------------+
|ArrDelay|features                       |
+--------+-------------------------------+
|-69     |[12.0,7.0,11618.0,14771.0,-9.0]|
|-68     |[6.0,1.0,12478.0,13198.0,-13.0]|
|-65     |[7.0,2.0,12478.0,14747.0,-1.0] |
+--------+-------------------------------+
only showing top 3 rows



# Train the Model

In [45]:
# Configure the linear regression estimator: ArrDelay is the target, the assembled
# vector is the input, and predictions are written to a "prediction" column.
lr = LinearRegression(
    featuresCol="features",
    labelCol="ArrDelay",
    predictionCol="prediction",
    maxIter=10,
    regParam=3,
)

In [46]:
# Fit the regression model on the training split.
lrmodel=lr.fit(train_data)

In [48]:
# Evaluate on the training split itself: these are in-sample metrics.
results=lrmodel.evaluate(train_data)

In [49]:
# In-sample R^2 (computed on the training split).
results.r2

0.8783185590070417

In [50]:
# In-sample adjusted R^2.
results.r2adj

0.8783182372920073

In [51]:
# In-sample mean absolute error, in the same units as ArrDelay.
results.meanAbsoluteError

9.17507388022935

In [53]:
# Show training rows next to their predictions, with feature vectors printed in full.
lrmodel.transform(train_data).show(truncate=False)

+--------+--------------------------------+-------------------+
|ArrDelay|features                        |prediction         |
+--------+--------------------------------+-------------------+
|-94     |[2.0,4.0,14307.0,12264.0,-5.0]  |-7.637654806704127 |
|-86     |[7.0,2.0,11618.0,14771.0,-14.0] |-16.64629460363917 |
|-79     |[20.0,1.0,12478.0,14771.0,-16.0]|-18.104971859076535|
|-75     |[6.0,1.0,12478.0,14747.0,-4.0]  |-7.054792541805659 |
|-74     |[4.0,6.0,11618.0,14747.0,-4.0]  |-7.883220238621742 |
|-71     |[6.0,1.0,12478.0,14057.0,-2.0]  |-5.047210636752241 |
|-71     |[13.0,1.0,12478.0,12892.0,-3.0] |-5.666052945478228 |
|-70     |[13.0,1.0,12478.0,12954.0,-4.0] |-6.611639157492311 |
|-69     |[2.0,2.0,14492.0,12953.0,-60.0] |-58.78722058800809 |
|-69     |[13.0,1.0,12478.0,12892.0,-5.0] |-7.531722232320651 |
|-68     |[6.0,1.0,12478.0,14747.0,-1.0]  |-4.256288611542024 |
|-68     |[30.0,2.0,12478.0,12892.0,-5.0] |-7.485652002127751 |
|-67     |[4.0,6.0,11618.0,14771.0,-15.0

# Test Data Prediction

In [54]:
# Score the held-out test split; adds a "prediction" column.
prediction=lrmodel.transform(test_data)

In [55]:
# Preview the first five test-set predictions.
prediction.show(n=5)

+--------+--------------------+-------------------+
|ArrDelay|            features|         prediction|
+--------+--------------------+-------------------+
|     -69|[12.0,7.0,11618.0...|-12.601898777579311|
|     -68|[6.0,1.0,12478.0,...|-15.131720788235938|
|     -65|[7.0,2.0,12478.0,...|-4.3802440897512644|
|     -64|[12.0,7.0,10721.0...| -5.825642012355034|
|     -64|[27.0,6.0,12478.0...| -5.643811028675506|
+--------+--------------------+-------------------+
only showing top 5 rows



In [57]:
# Evaluate on the held-out test split: these are out-of-sample metrics.
test_eval=lrmodel.evaluate(test_data)

In [58]:
# Out-of-sample R^2; compare with the training R^2 to check for overfitting.
test_eval.r2

0.8767483259110911

In [59]:
# Out-of-sample root mean squared error, in the same units as ArrDelay.
test_eval.rootMeanSquaredError

13.529223369663539