# Import Libraries

In [27]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import round,col,desc,count
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Session Creation
<br>
https://github.com/SuperJohn/spark-and-python-for-big-data-with-pyspark/blob/master/Spark_for_Machine_Learning/Linear_Regression/Linear_Regression_Code_Along.ipynb

In [28]:
spark = SparkSession.builder.master("local[*]").config("spark.executor.memory", "70g")\
     .config("spark.driver.memory", "50g").config("spark.memory.offHeap.enabled",True)\
     .config("spark.memory.offHeap.size","16g").appName("Flight_delay").getOrCreate()


# spark = SparkSession.builder.appName('Flight_delay').getOrCreate()

# Read Data

In [29]:
Flight_Spark_frame  = spark.read.option('recursiveFileLookup','True').option("header", "true")\
                        .csv("dataverse_files/*")
# Flight_Spark_frame.printSchema()

                                                                                

In [30]:
print(Flight_Spark_frame.dtypes)

[('Year', 'string'), ('Month', 'string'), ('DayofMonth', 'string'), ('DayOfWeek', 'string'), ('DepTime', 'string'), ('CRSDepTime', 'string'), ('ArrTime', 'string'), ('CRSArrTime', 'string'), ('UniqueCarrier', 'string'), ('FlightNum', 'string'), ('TailNum', 'string'), ('ActualElapsedTime', 'string'), ('CRSElapsedTime', 'string'), ('AirTime', 'string'), ('ArrDelay', 'string'), ('DepDelay', 'string'), ('Origin', 'string'), ('Dest', 'string'), ('Distance', 'string'), ('TaxiIn', 'string'), ('TaxiOut', 'string'), ('Cancelled', 'string'), ('CancellationCode', 'string'), ('Diverted', 'string'), ('CarrierDelay', 'string'), ('WeatherDelay', 'string'), ('NASDelay', 'string'), ('SecurityDelay', 'string'), ('LateAircraftDelay', 'string')]


In [31]:
carrier_Spark_frame  = spark.read.option("header", "true").csv("carriers.csv")
plane_Spark_frame  = spark.read.option("header", "true").csv("plane-data.csv")
airport_Spark_frame  = spark.read.option("header", "true").csv("airports.csv")

In [32]:
print(carrier_Spark_frame.dtypes)
plane_Spark_frame = plane_Spark_frame.withColumnRenamed("year","Pyear")
print(plane_Spark_frame.dtypes)
print(airport_Spark_frame.dtypes)

[('Code', 'string'), ('Description', 'string')]
[('tailnum', 'string'), ('type', 'string'), ('manufacturer', 'string'), ('issue_date', 'string'), ('model', 'string'), ('status', 'string'), ('aircraft_type', 'string'), ('engine_type', 'string'), ('Pyear', 'string')]
[('iata', 'string'), ('airport', 'string'), ('city', 'string'), ('state', 'string'), ('country', 'string'), ('lat', 'string'), ('long', 'string')]


In [33]:
plane_Spark_frame

DataFrame[tailnum: string, type: string, manufacturer: string, issue_date: string, model: string, status: string, aircraft_type: string, engine_type: string, Pyear: string]

# String Transfromation to Integer For Prediction

In [34]:
Flight_Spark_frame = Flight_Spark_frame.withColumn("WeatherDelay",col("WeatherDelay").cast(IntegerType())) \
    .withColumn("NASDelay",col("NASDelay").cast(IntegerType())) \
    .withColumn("SecurityDelay",col("SecurityDelay").cast(IntegerType()))\
    .withColumn("LateAircraftDelay",col("LateAircraftDelay").cast(IntegerType()))\
    .withColumn("CarrierDelay",col("CarrierDelay").cast(IntegerType()))\
    .withColumn("ArrDelay",col("ArrDelay").cast(IntegerType()))\
    .withColumn("DepDelay",col("DepDelay").cast(IntegerType()))\
    .withColumn("Month",col("Month").cast(IntegerType()))\
    .withColumn("Year",col("Year").cast(IntegerType()))\
    .withColumn("DayOfWeek",col("DayOfWeek").cast(IntegerType()))\
    .withColumn("DayofMonth",col("DayofMonth").cast(IntegerType()))

# MERGE flight data with plane and airport 

In [35]:
plane_flight_data = Flight_Spark_frame.join(plane_Spark_frame,"TailNum","inner")

In [36]:
# Departure Merge takes origin information 
plane_flight_dep_airport_data = plane_flight_data.withColumn("Origin", col("Origin")).\
                                join(airport_Spark_frame.withColumn("Origin", col("iata")), on="Origin")
# Arrival Merge takes destination information 
plane_flight_arr_airport_data = plane_flight_data.withColumn("Dest", col("Dest")).\
                                join(airport_Spark_frame.withColumn("Dest", col("iata")), on="Dest")

In [1]:
# plane_flight_arr_airport_data.columns

In [38]:
plane_flight_arr_airport_data.head()

Row(Dest='PDX', TailNum='N611SW', Year=2007, Month=1, DayofMonth=1, DayOfWeek=1, DepTime='1430', CRSDepTime='1420', ArrTime='1553', CRSArrTime='1550', UniqueCarrier='WN', FlightNum='2386', ActualElapsedTime='83', CRSElapsedTime='90', AirTime='74', ArrDelay=3, DepDelay=10, Origin='SMF', Distance='479', TaxiIn='2', TaxiOut='7', Cancelled='0', CancellationCode=None, Diverted='0', CarrierDelay=0, WeatherDelay=0, NASDelay=0, SecurityDelay=0, LateAircraftDelay=0, type='Corporation', manufacturer='BOEING', issue_date='10/20/1995', model='737-3H4', status='Valid', aircraft_type='Fixed Wing Multi-Engine', engine_type='Turbo-Fan', Pyear='1995', iata='PDX', airport='Portland Intl', city='Portland', state='OR', country='USA', lat='45.58872222', long='-122.5975')

In [39]:
# str_index_flight_data = plane_flight_arr_airport_data.withColumn("Distance",col("Distance").cast(IntegerType()))
# str_index_flight_data = str_index_flight_data.withColumn("Pyear",col("Pyear").cast(IntegerType()))

# Create learnable Column by transforming string values to a index value

In [40]:
str_index_flight_data = StringIndexer(inputCol='UniqueCarrier', outputCol='UniqueCarrier_idx',handleInvalid="skip").\
fit(plane_flight_arr_airport_data).transform(plane_flight_arr_airport_data)
str_index_flight_data = StringIndexer(inputCol='Origin', outputCol='Origin_idx',handleInvalid="skip").\
fit(str_index_flight_data).transform(str_index_flight_data)
str_index_flight_data = StringIndexer(inputCol='issue_date', outputCol='issue_date_idx',handleInvalid="skip").\
fit(str_index_flight_data).transform(str_index_flight_data)
str_index_flight_data = StringIndexer(inputCol='type', outputCol='type_idx',handleInvalid="skip").\
fit(str_index_flight_data).transform(str_index_flight_data)
str_index_flight_data = StringIndexer(inputCol='manufacturer', outputCol='manufacturer_idx',handleInvalid="skip").\
fit(str_index_flight_data).transform(str_index_flight_data)
str_index_flight_data = StringIndexer(inputCol='model', outputCol='model_idx',handleInvalid="skip").\
fit(str_index_flight_data).transform(str_index_flight_data)
str_index_flight_data = StringIndexer(inputCol='status', outputCol='status_idx',handleInvalid="skip").\
fit(str_index_flight_data).transform(str_index_flight_data)


                                                                                

In [41]:
str_index_flight_data = StringIndexer(inputCol='aircraft_type', outputCol='aircraft_type_idx',handleInvalid="skip").\
fit(str_index_flight_data).transform(str_index_flight_data)
str_index_flight_data = StringIndexer(inputCol='engine_type', outputCol='engine_type_idx',handleInvalid="skip").\
fit(str_index_flight_data).transform(str_index_flight_data)

                                                                                

In [42]:
# str_index_flight_data = StringIndexer(inputCol='airport_date', outputCol='airport_idx').\
# fit(str_index_flight_data).transform(str_index_flight_data)
str_index_flight_data = StringIndexer(inputCol='city', outputCol='city_idx',handleInvalid="skip").\
fit(str_index_flight_data).transform(str_index_flight_data)
str_index_flight_data = StringIndexer(inputCol='state', outputCol='state_idx',handleInvalid="skip").\
fit(str_index_flight_data).transform(str_index_flight_data)
str_index_flight_data = StringIndexer(inputCol='country', outputCol='country_idx',handleInvalid="skip").\
fit(str_index_flight_data).transform(str_index_flight_data)
str_index_flight_data = StringIndexer(inputCol='lat', outputCol='lat_idx',handleInvalid="skip").\
fit(str_index_flight_data).transform(str_index_flight_data)
str_index_flight_data = StringIndexer(inputCol='long', outputCol='long_idx',handleInvalid="skip").\
fit(str_index_flight_data).transform(str_index_flight_data)

                                                                                

In [2]:
# str_index_flight_data.printSchema()

In [44]:
str_index_flight_data = str_index_flight_data.withColumn("Distance",col("Distance").cast(IntegerType()))
str_index_flight_data = str_index_flight_data.withColumn("Pyear",col("Pyear").cast(IntegerType()))

# An assembler to transform the big data frame to a vector 

In [45]:
assembler = VectorAssembler(
    inputCols=['Year','Month','DayofMonth','DayOfWeek','UniqueCarrier_idx','Origin_idx','Distance','type_idx',
               'manufacturer_idx','model_idx','status_idx','aircraft_type_idx','engine_type_idx','Pyear','city_idx',
               'state_idx','country_idx','lat_idx','long_idx'],
    outputCol="features")

# Drop all rows with null values 
<br>
This would affect the performance though

In [57]:
str_index_flight_data = str_index_flight_data.dropna(how="any")


In [58]:
str_index_flight_data_assembled = assembler.setHandleInvalid("skip").transform(str_index_flight_data)

# Split the data into train and test

In [59]:
final_data = str_index_flight_data_assembled.select("features",'DepDelay')
train_data,test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [60]:
# train_data.show()

# Define the regression model

In [61]:
flight_delay_lr = LinearRegression(labelCol='DepDelay')


# Fit the regression model

In [62]:
flight_delay_lr_lrModel = flight_delay_lr.fit(train_data,)
print("Coefficients: {} Intercept: {}".format(flight_delay_lr_lrModel.coefficients,flight_delay_lr_lrModel.intercept))


                                                                                

22/12/02 00:24:50 WARN Instrumentation: [40a7be29] regParam is zero, which might cause numerical instability and overfitting.




22/12/02 00:26:44 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/12/02 00:26:44 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


                                                                                

22/12/02 00:28:51 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
22/12/02 00:28:51 WARN Instrumentation: [40a7be29] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/12/02 00:28:51 ERROR LBFGS: Failure! Resetting history: breeze.optimize.FirstOrderException: Line search zoom failed
22/12/02 00:28:51 ERROR LBFGS: Failure again! Giving up and returning. Maybe the objective is just poorly behaved?




Coefficients: [2.208669679294616,0.3695456450661353,0.0375841226401842,0.5777605268031156,1.4070570865156462,0.4466512185069203,-0.002083262988756436,0.0,0.094795473130844,-0.016540208201425374,0.0,0.0,-0.14463216273928717,0.20975926678361076,-0.00010812223659155567,0.030489327444966692,0.0,-0.0016130840020186024,-0.0016130840020186024] Intercept: -4865.393467547159


                                                                                

# Test the learned Model

In [63]:
test_results = flight_delay_lr_lrModel.evaluate(test_data)
test_results.residuals.show()




+-------------------+
|          residuals|
+-------------------+
|-7.3925034753810905|
| -4.609339949501191|
| 1.9555024423689247|
+-------------------+





# RMSE for the learned regression model

In [64]:
print("RMSE: {}".format(test_results.rootMeanSquaredError))
print("MSE: {}".format(test_results.meanSquaredError))

RMSE: 5.154904208799616
MSE: 26.57303740189999


# Show the prediction result for the model

In [65]:
unlabeled_data = test_data.select('features')
predictions = flight_delay_lr_lrModel.transform(unlabeled_data)


In [66]:
predictions.show()



+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|[2005.0,1.0,30.0,...| 7.3925034753810905|
|[2004.0,1.0,17.0,...|  4.609339949501191|
|[2004.0,6.0,28.0,...|-1.9555024423689247|
+--------------------+-------------------+



                                                                                