In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
spark=SparkSession.builder.appName("Linear Regression").getOrCreate()

In [3]:

flightschema=StructType([

                            StructField("DayofMonth",IntegerType(), False),
                            StructField("DayOfWeek",IntegerType(), False),
                            StructField("Carrier",StringType(), False),
                            StructField("OriginAirportID",IntegerType(), False),
                            StructField("DestAirportID",IntegerType(), False),
                            StructField("DepDelay",IntegerType(), False),
                            StructField("ArrDelay",IntegerType(), False)
])

In [4]:
df=spark.read.csv("C:/Users/User/Desktop/SparkFolder/Data/raw-flight-data.csv",schema=flightschema,header=True)

In [5]:
df.show(3)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 3 rows



# Select Important Features

In [6]:
df.columns

['DayofMonth',
 'DayOfWeek',
 'Carrier',
 'OriginAirportID',
 'DestAirportID',
 'DepDelay',
 'ArrDelay']

In [7]:
important_cols=['DayofMonth','DayOfWeek','OriginAirportID','DestAirportID','DepDelay','ArrDelay']


In [8]:
data=df.select(important_cols)

In [9]:
data.show()

+----------+---------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+---------------+-------------+--------+--------+
|        19|        5|          11433|        13303|      -3|       1|
|        19|        5|          14869|        12478|       0|      -8|
|        19|        5|          14057|        14869|      -4|     -15|
|        19|        5|          15016|        11433|      28|      24|
|        19|        5|          11193|        12892|      -6|     -11|
|        19|        5|          10397|        15016|      -1|     -19|
|        19|        5|          15016|        10397|       0|      -1|
|        19|        5|          10397|        14869|      15|      24|
|        19|        5|          10397|        10423|      33|      34|
|        19|        5|          11278|        10397|     323|     322|
|        19|        5|          14107|        13487|      -7|     -13|
|     

# Preparing training Data

In [10]:
#definr the assembler
assembler=VectorAssembler(inputCols=['DayofMonth','DayOfWeek','OriginAirportID','DestAirportID','DepDelay']\
                          ,outputCol="features")

In [11]:
#Transform Data
traindata_tran=assembler.transform(data)

In [12]:
traindata_tran.show()

+----------+---------+---------------+-------------+--------+--------+--------------------+
|DayofMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|ArrDelay|            features|
+----------+---------+---------------+-------------+--------+--------+--------------------+
|        19|        5|          11433|        13303|      -3|       1|[19.0,5.0,11433.0...|
|        19|        5|          14869|        12478|       0|      -8|[19.0,5.0,14869.0...|
|        19|        5|          14057|        14869|      -4|     -15|[19.0,5.0,14057.0...|
|        19|        5|          15016|        11433|      28|      24|[19.0,5.0,15016.0...|
|        19|        5|          11193|        12892|      -6|     -11|[19.0,5.0,11193.0...|
|        19|        5|          10397|        15016|      -1|     -19|[19.0,5.0,10397.0...|
|        19|        5|          15016|        10397|       0|      -1|[19.0,5.0,15016.0...|
|        19|        5|          10397|        14869|      15|      24|[19.0,5.0,

In [13]:
traindata_tran.head(1)[0][-1]  # Shows 5 features being vectorised

DenseVector([19.0, 5.0, 11433.0, 13303.0, -3.0])

In [14]:
final_train_data=traindata_tran.select(traindata_tran["ArrDelay"].cast("Int").alias("label"),traindata_tran["features"])

In [15]:
final_train_data.show(truncate=False,n=3)  # Nice trick, reaon to  dirty your hands

+-----+-------------------------------+
|label|features                       |
+-----+-------------------------------+
|1    |[19.0,5.0,11433.0,13303.0,-3.0]|
|-8   |[19.0,5.0,14869.0,12478.0,0.0] |
|-15  |[19.0,5.0,14057.0,14869.0,-4.0]|
+-----+-------------------------------+
only showing top 3 rows



In [16]:
final_train_data.count()

2719418

# Divide Data into Training and Testing Data

In [17]:
df1=traindata_tran.select(["ArrDelay","features"])

In [18]:
df1.show(3)

+--------+--------------------+
|ArrDelay|            features|
+--------+--------------------+
|       1|[19.0,5.0,11433.0...|
|      -8|[19.0,5.0,14869.0...|
|     -15|[19.0,5.0,14057.0...|
+--------+--------------------+
only showing top 3 rows



In [19]:
df2=df1.na.drop().show()

+--------+--------------------+
|ArrDelay|            features|
+--------+--------------------+
|       1|[19.0,5.0,11433.0...|
|      -8|[19.0,5.0,14869.0...|
|     -15|[19.0,5.0,14057.0...|
|      24|[19.0,5.0,15016.0...|
|     -11|[19.0,5.0,11193.0...|
|     -19|[19.0,5.0,10397.0...|
|      -1|[19.0,5.0,15016.0...|
|      24|[19.0,5.0,10397.0...|
|      34|[19.0,5.0,10397.0...|
|     322|[19.0,5.0,11278.0...|
|     -13|[19.0,5.0,14107.0...|
|      41|[19.0,5.0,11433.0...|
|      20|[19.0,5.0,11298.0...|
|      -7|[19.0,5.0,11433.0...|
|      75|[19.0,5.0,10397.0...|
|      57|[19.0,5.0,12451.0...|
|      10|[19.0,5.0,12953.0...|
|     -10|[19.0,5.0,11433.0...|
|      38|[19.0,5.0,10397.0...|
|      25|[19.0,5.0,13204.0...|
+--------+--------------------+
only showing top 20 rows



In [20]:
train_data, test_data=df.randomSplit([0.7,0.3])

AttributeError: 'NoneType' object has no attribute 'randomSplit'

In [None]:
train_data.show(3)