# Airline delays 
## Bureau of Transportation Statistics
https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236   
https://www.bts.gov/topics/airlines-and-airports/understanding-reporting-causes-flight-delays-and-cancellations

2015 - 2019

### Additioinal sources
This might be useful in matching station codes to airports:
1. http://dss.ucar.edu/datasets/ds353.4/inventories/station-list.html
2. https://www.world-airport-codes.com/

In [3]:
# imports
import re
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
sqlContext = SQLContext(sc)

from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
display(dbutils.fs.ls("dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data"))

path,name,size
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2015.parquet/,2015.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2016.parquet/,2016.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2017.parquet/,2017.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2018.parquet/,2018.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2019.parquet/,2019.parquet/,0


In [5]:
airlines = spark.read.option("header", "true").parquet(f"dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/201*.parquet")
#display(airlines.sample(False, 0.00001))

In [6]:
  airlines.printSchema()

In [7]:
f'{airlines.count():,}'

In [8]:
#display(airlines.describe())

In [9]:
airlines.where('MONTH == "MONTH"').count()

In [10]:
airlines2 = airlines.filter(airlines.CANCELLED == False)
airlines2 = airlines2.filter(airlines2.DIVERTED == False)



In [11]:
airlines2.filter(airlines2['ARR_DELAY'].isNull()).count()

In [12]:
airlines3 = airlines2.filter(airlines2['ARR_DELAY'].isNull() )
airlines3.filter(airlines3['OP_UNIQUE_CARRIER'] == '9E').count()

In [13]:
airlines3.filter(airlines3['OP_UNIQUE_CARRIER'].isNull()).count()

In [14]:
display(airlines2.filter(airlines2['ARR_DELAY'].isNull() ))

YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,ARR_TIME_BLK,CANCELLED,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,FLIGHTS,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
2018,2,5,31,4,2018-05-31,9E,14492,1449202,34492,RDU,"Raleigh/Durham, NC",NC,37,North Carolina,36,14100,1410005,34100,PHL,"Philadelphia, PA",PA,42,Pennsylvania,23,1230,1223,-7.0,0.0,0.0,-1.0,1200-1259,38.0,1301,1356.0,4.0,1400,1400.0,,,,,1400-1459,False,False,90.0,97.0,55.0,1.0,337.0,2,,,,,
2018,2,5,7,1,2018-05-07,9E,10397,1039707,30397,ATL,"Atlanta, GA",GA,13,Georgia,34,11641,1164102,31641,FAY,"Fayetteville, NC",NC,37,North Carolina,36,1349,1345,-4.0,0.0,0.0,-1.0,1300-1359,28.0,1413,1504.0,4.0,1508,1508.0,,,,,1500-1559,False,False,79.0,83.0,51.0,1.0,331.0,2,,,,,
2018,2,5,14,1,2018-05-14,9E,10397,1039707,30397,ATL,"Atlanta, GA",GA,13,Georgia,34,11641,1164102,31641,FAY,"Fayetteville, NC",NC,37,North Carolina,36,1349,1345,-4.0,0.0,0.0,-1.0,1300-1359,23.0,1408,1501.0,7.0,1508,1508.0,,,,,1500-1559,False,False,79.0,83.0,53.0,1.0,331.0,2,,,,,
2018,2,5,21,1,2018-05-21,9E,10397,1039707,30397,ATL,"Atlanta, GA",GA,13,Georgia,34,11641,1164102,31641,FAY,"Fayetteville, NC",NC,37,North Carolina,36,1349,1343,-6.0,0.0,0.0,-1.0,1300-1359,20.0,1403,1457.0,11.0,1508,1508.0,,,,,1500-1559,False,False,79.0,85.0,54.0,1.0,331.0,2,,,,,
2018,2,5,21,1,2018-05-21,9E,11641,1164102,31641,FAY,"Fayetteville, NC",NC,37,North Carolina,36,10397,1039707,30397,ATL,"Atlanta, GA",GA,13,Georgia,34,1533,1530,-3.0,0.0,0.0,-1.0,1500-1559,27.0,1557,1654.0,7.0,1701,1701.0,,,,,1700-1759,False,False,88.0,91.0,57.0,1.0,331.0,2,,,,,
2018,2,5,31,4,2018-05-31,9E,11641,1164102,31641,FAY,"Fayetteville, NC",NC,37,North Carolina,36,10397,1039707,30397,ATL,"Atlanta, GA",GA,13,Georgia,34,1533,1530,-3.0,0.0,0.0,-1.0,1500-1559,23.0,1553,1655.0,6.0,1701,1701.0,,,,,1700-1759,False,False,88.0,91.0,62.0,1.0,331.0,2,,,,,
2018,2,5,22,2,2018-05-22,9E,12323,1232305,32323,ILM,"Wilmington, NC",NC,37,North Carolina,36,10397,1039707,30397,ATL,"Atlanta, GA",GA,13,Georgia,34,727,719,-8.0,0.0,0.0,-1.0,0700-0759,20.0,739,844.0,13.0,857,857.0,,,,,0800-0859,False,False,90.0,98.0,65.0,1.0,377.0,2,,,,,
2018,2,5,20,7,2018-05-20,9E,10792,1079206,30792,BUF,"Buffalo, NY",NY,36,New York,22,12478,1247805,31703,JFK,"New York, NY",NY,36,New York,22,1836,1849,13.0,13.0,0.0,0.0,1800-1859,13.0,1902,2000.0,15.0,2015,2015.0,,,,,2000-2059,False,False,99.0,86.0,58.0,1.0,301.0,2,,,,,
2018,2,5,8,2,2018-05-08,9E,10397,1039707,30397,ATL,"Atlanta, GA",GA,13,Georgia,34,11612,1161206,31612,EVV,"Evansville, IN",IN,18,Indiana,42,1540,1535,-5.0,0.0,0.0,-1.0,1500-1559,28.0,1603,1557.0,6.0,1603,1603.0,,,,,1600-1659,False,False,83.0,88.0,54.0,1.0,350.0,2,,,,,
2018,2,5,9,3,2018-05-09,9E,12953,1295304,31703,LGA,"New York, NY",NY,36,New York,22,14576,1457606,34576,ROC,"Rochester, NY",NY,36,New York,22,2156,2208,12.0,12.0,0.0,0.0,2100-2159,18.0,2226,2307.0,7.0,2314,2314.0,,,,,2300-2359,False,False,78.0,66.0,41.0,1.0,254.0,2,,,,,


In [15]:
#airlines.where(airlines.ARR_DELAY == null).count()

In [16]:
for year in range(2015, 2020):
  print(year , airlines.select('MONTH').where(f'YEAR == {year}').distinct().collect())

In [17]:
def is_Weekend(x):
  """
  Function to determine if a given day of the week is a weekend_day(Friday, Saturday, Sunday)
  """
  if   x < 5: 
    return 0
  else: 
    return 1

def is_RushHour(x):
  """
  Function to determine if a given time of the day is rush hour (1600-2100)
  """
  if (x != None) and (x >= 1600) and (x <= 2100): 
    return 1
  else: 
    return 0
 
def preprocessAirlines(df):
  cols_to_keep = ['MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST', 'DEP_DELAY', 'DEP_TIME_BLK', 'ARR_DELAY', 'ARR_TIME_BLK', 'CRS_ELAPSED_TIME', 'DISTANCE',  'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'IS_WEEKEND', 'DEP_RUSH_HOUR', 'ARR_RUSH_HOUR']
  cols_to_remove = [x for x in df.columns if x not in cols_to_keep]
  df = df.orderBy("FL_DATE") 
  df = df.filter(df.CANCELLED == False)
  df = df.filter(df.DIVERTED == False)
  df = df.withColumn('CARRIER_DELAY', f.when(df.CARRIER_DELAY.isNotNull(), 1).otherwise(0))
  df = df.withColumn('WEATHER_DELAY', f.when(df.WEATHER_DELAY.isNotNull(), 1).otherwise(0))
  df = df.withColumn('NAS_DELAY', f.when(df.NAS_DELAY.isNotNull(), 1).otherwise(0))
  df = df.withColumn('SECURITY_DELAY', f.when(df.SECURITY_DELAY.isNotNull(), 1).otherwise(0))
  df = df.withColumn('LATE_AIRCRAFT_DELAY', f.when(df.LATE_AIRCRAFT_DELAY.isNotNull(), 1).otherwise(0))
  df = df.withColumn("IS_WEEKEND", f.udf(is_Weekend, IntegerType())("DAY_OF_WEEK"))
  df = df.withColumn("DEP_RUSH_HOUR", f.udf(is_RushHour, IntegerType())("DEP_TIME"))
  df = df.withColumn("ARR_RUSH_HOUR", f.udf(is_RushHour, IntegerType())("CRS_ARR_TIME"))
  preprocessAirlines_df = df.drop(*cols_to_remove)
  return preprocessAirlines_df

In [18]:
airlines_df =  preprocessAirlines(airlines)
airlines_df.printSchema()

In [19]:
display(airlines_df.sample(False, 0.0000001))

MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,DEST,DEP_DELAY,DEP_TIME_BLK,ARR_DELAY,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,IS_WEEKEND,DEP_RUSH_HOUR,ARR_RUSH_HOUR


In [20]:
def nullDataFrame(df):
  null_feature_list = []
  count = df.count()
  for column in df.columns:
    nulls = df.filter(df[column].isNull()).count()
    nulls_perct = np.round((nulls/count)*100, 2)
    null_feature_list.append([column, nulls, nulls_perct])
  nullCounts_df = pd.DataFrame(np.array(null_feature_list), columns=['Feature_Name', 'Null_Counts', 'Percentage_Null_Counts'])
  return nullCounts_df

In [21]:
nullCounts_df = nullDataFrame(airlines_df)
nullCounts_df

In [22]:
airlines_df.filter(airlines_df['ARR_DELAY'].isNull()).count()

In [23]:
display(airlines_df.filter(airlines_df['ARR_DELAY'].isNull() ))

In [24]:
airlines_preprocessed_filtered = airlines_df.fillna(0, subset=['ARR_DELAY', 'DEP_DELAY'])

In [25]:
f'{airlines_preprocessed_filtered.count():,}'

In [26]:
nullCounts_df2 = nullDataFrame(airlines_preprocessed_filtered)
nullCounts_df2

In [27]:
display(airlines_preprocessed_filtered.describe())

In [28]:
display(airlines_preprocessed_filtered.take(10))

In [29]:
(airlines_preprocessed_filtered.select('ORIGIN').distinct().count())


In [30]:
(airlines_preprocessed_filtered.select('DEST').distinct().count())

In [31]:
(airlines_preprocessed_filtered.select('OP_UNIQUE_CARRIER').distinct().count())

## Feautre Engineering & Feature Selection:

In [33]:
numeric_features = [x[0] for x in airlines_preprocessed_filtered.dtypes if x[1] == 'int' or x[1] == 'double']
numeric_features.remove('ARR_DELAY')
numeric_features

In [34]:
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols = numeric_features, outputCol = 'features')
vector_airlines_preprocessed_filtered = vectorAssembler.transform(airlines_preprocessed_filtered)
# display(vector_airlines_preprocessed_filtered)
vector_airlines_preprocessed_filtered = vector_airlines_preprocessed_filtered.select(['features', 'ARR_DELAY'])
# display(vector_airlines_preprocessed_filtered.show())
display(vector_airlines_preprocessed_filtered)

features,ARR_DELAY
"List(0, 13, List(0, 1, 2, 3, 4, 11), List(1.0, 4.0, -6.0, 160.0, 733.0, 1.0))",-22.0
"List(0, 13, List(0, 1, 2, 3, 4, 12), List(1.0, 4.0, -4.0, 120.0, 733.0, 1.0))",-11.0
"List(1, 13, List(), List(1.0, 4.0, 24.0, 210.0, 1192.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0))",37.0
"List(0, 13, List(0, 1, 3, 4, 12), List(1.0, 4.0, 120.0, 733.0, 1.0))",1.0
"List(0, 13, List(0, 1, 2, 3, 4, 12), List(1.0, 4.0, -3.0, 120.0, 733.0, 1.0))",-11.0
"List(1, 13, List(), List(1.0, 4.0, 12.0, 185.0, 1171.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0))",20.0
"List(0, 13, List(0, 1, 2, 3, 4, 12), List(1.0, 4.0, 8.0, 175.0, 1171.0, 1.0))",-1.0
"List(0, 13, List(0, 1, 2, 3, 4), List(1.0, 4.0, -8.0, 338.0, 2724.0))",1.0
"List(1, 13, List(), List(1.0, 4.0, 78.0, 65.0, 247.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0))",82.0
"List(1, 13, List(), List(1.0, 4.0, 70.0, 70.0, 247.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0))",85.0


In [35]:
train_df, val_df, test_df = vector_airlines_preprocessed_filtered.randomSplit([0.8,0.1,0.1], seed = 2020)
display(train_df.show(5))

In [36]:
train_cnt = train_df.count()
val_cnt = val_df.count()
test_cnt = test_df.count()
total_cnt = train_cnt + val_cnt + test_cnt

In [37]:
print('train_df records: {}\n val_df records: {}\n test_df records: {}\n total records: {}'.format(train_cnt, val_cnt, test_cnt, total_cnt) )

## Linear Regression:

In [39]:
lr = LinearRegression(featuresCol = 'features', labelCol='ARR_DELAY')
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("\nIntercept: " + str(lr_model.intercept))

In [40]:
print(lr_model.explainParams())

In [41]:
regression_evaluator_r2 = RegressionEvaluator(predictionCol="prediction", labelCol="ARR_DELAY",metricName="r2")
regression_evaluator_rmse = RegressionEvaluator(predictionCol="prediction", labelCol="ARR_DELAY",metricName="rmse")
regression_evaluator_mae = RegressionEvaluator(predictionCol="prediction", labelCol="ARR_DELAY",metricName="mae")

regression_metrics_list = []

In [42]:
# train_df evaluation metrics
lr_predictions_train = lr_model.transform(train_df)  # lr_predictions_train.select("prediction","ARR_DELAY","features").show(5)
lr_train_r2 = regression_evaluator_r2.evaluate(lr_predictions_train)
lr_train_rmse = regression_evaluator_rmse.evaluate(lr_predictions_train)
lr_train_mae = regression_evaluator_mae.evaluate(lr_predictions_train)
regression_metrics_list.append(["LinearRegression_TrainData", lr_train_r2, lr_train_rmse, lr_train_mae ])


# val_df evaluation metrics
lr_predictions_val = lr_model.transform(val_df)
lr_val_r2 = regression_evaluator_r2.evaluate(lr_predictions_val)
lr_val_rmse = regression_evaluator_rmse.evaluate(lr_predictions_val)
lr_val_mae = regression_evaluator_mae.evaluate(lr_predictions_val)
regression_metrics_list.append(["LinearRegression_ValData", lr_val_r2, lr_val_rmse, lr_val_mae ])

In [43]:
# from  pyspark.sql.functions import abs
# residuals = trainingSummary.residuals
# abs_residuals = residuals.withColumn('abs_residuals',abs(residuals.residuals))
# display(abs_residuals.show(5))
# display(abs_residuals.describe().show())
# abs_residuals_summary = abs_residuals.describe()
# display(abs_residuals_summary)
# MEA_train_df = abs_residuals_summary.select('abs_residuals').toPandas().iloc[1]
# print('Mean Absolute Error of train_df: ', MEA_train_df)

In [44]:
lr_predictions_test = lr_model.transform(test_df)
lr_predictions_test.select("prediction","ARR_DELAY","features").show(5)

from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="ARR_DELAY",metricName="r2")
print("R Squared (R2) on val_data = %g" % lr_evaluator.evaluate(lr_predictions_test))

## Decision Tree Regressor

In [46]:
dt = DecisionTreeRegressor(featuresCol="features", labelCol='ARR_DELAY')  # Train a DecisionTree model.
dt_model = dt.fit(train_df)

In [47]:
print(dt_model.explainParams())

In [48]:
# train_df evaluation metrics
dt_predictions_train = dt_model.transform(train_df)  
dt_train_r2 = regression_evaluator_r2.evaluate(dt_predictions_train)
dt_train_rmse = regression_evaluator_rmse.evaluate(dt_predictions_train)
dt_train_mae = regression_evaluator_mae.evaluate(dt_predictions_train)
regression_metrics_list.append(["DecisionTreeRegressor_TrainData", dt_train_r2, dt_train_rmse, dt_train_mae ])


# val_df evaluation metrics
dt_predictions_val = dt_model.transform(val_df)
dt_val_r2 = regression_evaluator_r2.evaluate(dt_predictions_val)
dt_val_rmse = regression_evaluator_rmse.evaluate(dt_predictions_val)
dt_val_mae = regression_evaluator_mae.evaluate(dt_predictions_val)
regression_metrics_list.append(["DecisionTreeRegressor_ValData", dt_val_r2, dt_val_rmse, dt_val_mae ])

In [49]:
display(dt_model) 

treeNode
"{""index"":31,""featureType"":""continuous"",""prediction"":null,""threshold"":99.5,""categories"":null,""feature"":2,""overflow"":false}"
"{""index"":15,""featureType"":""continuous"",""prediction"":null,""threshold"":0.5,""categories"":null,""feature"":5,""overflow"":false}"
"{""index"":7,""featureType"":""continuous"",""prediction"":null,""threshold"":1.5,""categories"":null,""feature"":2,""overflow"":false}"
"{""index"":3,""featureType"":""continuous"",""prediction"":null,""threshold"":-4.5,""categories"":null,""feature"":2,""overflow"":false}"
"{""index"":1,""featureType"":""continuous"",""prediction"":null,""threshold"":-9.5,""categories"":null,""feature"":2,""overflow"":false}"
"{""index"":0,""featureType"":null,""prediction"":-16.769909173369324,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"
"{""index"":2,""featureType"":null,""prediction"":-12.736101423037061,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"
"{""index"":5,""featureType"":""continuous"",""prediction"":null,""threshold"":164.5,""categories"":null,""feature"":3,""overflow"":false}"
"{""index"":4,""featureType"":null,""prediction"":-8.04484245295857,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"
"{""index"":6,""featureType"":null,""prediction"":-11.540018738712366,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"


## Random Forest Regressor

In [51]:
rf = RandomForestRegressor(featuresCol="features", labelCol='ARR_DELAY')
rf_model = rf.fit(train_df)

In [52]:
print(rf_model.explainParams())

In [53]:
# train_df evaluation metrics
rf_predictions_train = rf_model.transform(train_df)  
rf_train_r2 = regression_evaluator_r2.evaluate(rf_predictions_train)
rf_train_rmse = regression_evaluator_rmse.evaluate(rf_predictions_train)
rf_train_mae = regression_evaluator_mae.evaluate(rf_predictions_train)
regression_metrics_list.append(["RandomForestRegressor_TrainData", rf_train_r2, rf_train_rmse, rf_train_mae ])


# val_df evaluation metrics
rf_predictions_val = rf_model.transform(val_df)
rf_val_r2 = regression_evaluator_r2.evaluate(rf_predictions_val)
rf_val_rmse = regression_evaluator_rmse.evaluate(rf_predictions_val)
rf_val_mae = regression_evaluator_mae.evaluate(rf_predictions_val)
regression_metrics_list.append(["RandomForestRegressor_ValData", rf_val_r2, rf_val_rmse, rf_val_mae ])

## Gradient-Boosted Trees

In [55]:
gbt = GBTRegressor(featuresCol="features", labelCol='ARR_DELAY')
gbt_model = gbt.fit(train_df)

In [56]:
print(gbt_model.explainParams())

In [57]:
# train_df evaluation metrics
gbt_predictions_train = gbt_model.transform(train_df)  
gbt_train_r2 = regression_evaluator_r2.evaluate(gbt_predictions_train)
gbt_train_rmse = regression_evaluator_rmse.evaluate(gbt_predictions_train)
gbt_train_mae = regression_evaluator_mae.evaluate(gbt_predictions_train)
regression_metrics_list.append(["GradientBoostedTreeRegressor_TrainData", gbt_train_r2, gbt_train_rmse, gbt_train_mae ])


# val_df evaluation metrics
gbt_predictions_val = gbt_model.transform(val_df)
gbt_val_r2 = regression_evaluator_r2.evaluate(gbt_predictions_val)
gbt_val_rmse = regression_evaluator_rmse.evaluate(gbt_predictions_val)
gbt_val_mae = regression_evaluator_mae.evaluate(gbt_predictions_val)
regression_metrics_list.append(["GradientBoostedTreeRegressor_ValData", gbt_val_r2, gbt_val_rmse, gbt_val_mae ])

In [58]:
regression_metrics_df = pd.DataFrame(regression_metrics_list, columns = ['Model_Data' , 'R^2', 'RMSE', 'MAE']) 
display(regression_metrics_df)

Model_Data,R^2,RMSE,MAE
LinearRegression_TrainData,0.930281693529758,12.046667345682192,8.704472347471443
LinearRegression_ValData,0.9302085366558466,12.044174050518752,8.705095292699935
DecisionTreeRegressor_TrainData,0.6954692529758277,25.156908274749146,10.239642000803014
DecisionTreeRegressor_ValData,0.6946710052228073,24.9086103135382,10.249951229174805
RandomForestRegressor_TrainData,0.6832914150909308,25.68249205388697,10.615702083109014
RandomForestRegressor_ValData,0.6819623540130102,25.59427556756637,10.5989877321516
GradientBoostedTreeRegressor_TrainData,0.7038058596726879,24.80515605748653,9.977951188491126
GradientBoostedTreeRegressor_ValData,0.7065805344668528,24.57578220065924,9.969779922672815


### Ignore Below

In [60]:
sample_airlines_df = airlines_preprocessed_filtered.sample(False, 0.0001, 2020)
pandas_airlines_df = sample_airlines_df.toPandas()

In [61]:
#pandas_airlines_df[['ARR_DELAY', 'ARR_TIME', 'DEP_DELAY', 'DEP_TIME', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']][pandas_airlines_df.DEP_DELAY < 0 ].head(20)

In [62]:
nullCounts_df2 = nullDataFrame(airlines_preprocessed_filtered)
nullCounts_df2

Unnamed: 0,Feature_Name,Null_Counts,Percentage_Null_Counts
0,MONTH,0,0.0
1,DAY_OF_WEEK,0,0.0
2,OP_UNIQUE_CARRIER,0,0.0
3,ORIGIN,0,0.0
4,DEST,0,0.0
5,DEP_DELAY,0,0.0
6,DEP_TIME_BLK,0,0.0
7,ARR_DELAY,0,0.0
8,ARR_TIME_BLK,0,0.0
9,CRS_ELAPSED_TIME,0,0.0


In [63]:
features = ['ARR_DELAY', 'DEP_DELAY', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'IS_WEEKEND', 'DEP_RUSH_HOUR', 'ARR_RUSH_HOUR']
# airlines_filtered[Delay_List].describe().show()

In [64]:
sample_airlines_df = airlines_preprocessed_filtered.select(features).sample(False, 0.001, 2020)
pandas_df = sample_airlines_df.toPandas()

In [65]:
pandas_df.corr()

Unnamed: 0,ARR_DELAY,DEP_DELAY,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,IS_WEEKEND,DEP_RUSH_HOUR,ARR_RUSH_HOUR
ARR_DELAY,1.0,0.957012,0.623497,0.623497,0.623497,0.623497,0.623497,-0.011584,0.065292,0.057953
DEP_DELAY,0.957012,1.0,0.551323,0.551323,0.551323,0.551323,0.551323,-0.004922,0.064986,0.054647
CARRIER_DELAY,0.623497,0.551323,1.0,1.0,1.0,1.0,1.0,-0.010433,0.09992,0.080117
WEATHER_DELAY,0.623497,0.551323,1.0,1.0,1.0,1.0,1.0,-0.010433,0.09992,0.080117
NAS_DELAY,0.623497,0.551323,1.0,1.0,1.0,1.0,1.0,-0.010433,0.09992,0.080117
SECURITY_DELAY,0.623497,0.551323,1.0,1.0,1.0,1.0,1.0,-0.010433,0.09992,0.080117
LATE_AIRCRAFT_DELAY,0.623497,0.551323,1.0,1.0,1.0,1.0,1.0,-0.010433,0.09992,0.080117
IS_WEEKEND,-0.011584,-0.004922,-0.010433,-0.010433,-0.010433,-0.010433,-0.010433,1.0,0.006678,0.008769
DEP_RUSH_HOUR,0.065292,0.064986,0.09992,0.09992,0.09992,0.09992,0.09992,0.006678,1.0,0.429058
ARR_RUSH_HOUR,0.057953,0.054647,0.080117,0.080117,0.080117,0.080117,0.080117,0.008769,0.429058,1.0


In [66]:
fig = plt.figure(figsize = (15,20))
ax = fig.gca()
pandas_df.hist(ax=ax, bins=30, figsize=(3,15),)
#ax.set_yscale('log')
plt.yscale('log')
display(plt.show())

# Weather
https://data.nodc.noaa.gov/cgi-bin/iso?id=gov.noaa.ncdc:C00532

In [68]:
dbutils.fs.ls("dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_weather_data")

In [69]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType
schema = StructType([StructField('STATION', StringType(), True), 
                      StructField('DATE', StringType(), True),
                      StructField('SOURCE', StringType(), True),
                      StructField('LATITUDE', StringType(), True),
                      StructField('LONGITUDE', StringType(), True),
                      StructField('ELEVATION', StringType(), True),
                      StructField('NAME', StringType(), True),
                      StructField('REPORT_TYPE', StringType(), True),
                      StructField('CALL_SIGN', StringType(), True),
                      StructField('QUALITY_CONTROL', StringType(), True),
                      StructField('WND', StringType(), True),
                      StructField('CIG', StringType(), True),
                      StructField('VIS', StringType(), True),
                      StructField('TMP', StringType(), True),
                      StructField('DEW', StringType(), True),
                      StructField('SLP', StringType(), True),
                      StructField('AA1', StringType(), True),
                      StructField('AA2', StringType(), True),
                      StructField('AJ1', StringType(), True),
                      StructField('AY1', StringType(), True),
                      StructField('AY2', StringType(), True),
                      StructField('GA1', StringType(), True),
                      StructField('GA2', StringType(), True),
                      StructField('GA3', StringType(), True),
                      StructField('GE1', StringType(), True),
                      StructField('GF1', StringType(), True),
                      StructField('IA1', StringType(), True),
                      StructField('KA1', StringType(), True),
                      StructField('KA2', StringType(), True),
                      StructField('MA1', StringType(), True),
                      StructField('MD1', StringType(), True),
                      StructField('MW1', StringType(), True),
                      StructField('OC1', StringType(), True),
                      StructField('OD1', StringType(), True),
                      StructField('SA1', StringType(), True),
                      StructField('UA1', StringType(), True),
                      StructField('REM', StringType(), True),
                      StructField('EQD', StringType(), True)
                    ])



In [70]:
weather = spark.read.option("header", "true")\
                      .schema(schema)\
                      .parquet(f"dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_weather_data/201*a.parquet")
weather.count()


In [71]:
display(weather.where('DATE =="DATE"'))

In [72]:
#display(weather.describe())

In [73]:
nullCounts_weather_df = nullDataFrame(weather)
nullCounts_weather_df

In [74]:
#display(weather.sample(False, 0.0000001))

In [75]:
weather[["DATE"]].describe().show()

In [76]:
weather_df = weather.withColumn("DATE_IN_DATEFORMAT",weather['DATE'].cast(DateType()))
weather_df.select('DATE_IN_DATEFORMAT','DATE').show(10,False)

In [77]:
display(weather_df.sample(False, 0.0000001))

In [78]:
# Join Airlines data and Weather data by DATE and AIRPORT

# Stations

In [80]:
stations = spark.read.option("header", "true").csv("dbfs:/mnt/mids-w261/data/DEMO8/gsod/stations.csv.gz")

In [81]:
display(stations)

In [82]:
from pyspark.sql import functions as f
stations.where(f.col('name').contains('JAN MAYEN NOR NAVY'))

In [83]:
stations.select('name').distinct().count()

In [84]:
display(stations.select('name').distinct())

In [85]:
weather.select('NAME').distinct().count()

In [86]:
#display(weather.select('name').distinct())