# Airline delays 
## Bureau of Transportation Statistics
https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236   
https://www.bts.gov/topics/airlines-and-airports/understanding-reporting-causes-flight-delays-and-cancellations

2015 - 2019

### Additional sources
This might be useful in matching station codes to airports:
1. http://dss.ucar.edu/datasets/ds353.4/inventories/station-list.html
2. https://www.world-airport-codes.com/

In [3]:
# imports
import re
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType
from pyspark.sql import SQLContext
from pyspark.sql.functions import col
from pyspark.sql import DataFrameNaFunctions
sqlContext = SQLContext(sc)

from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator

In [4]:
# List the airline Parquet directory to see which yearly extracts are available.
display(dbutils.fs.ls("dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data"))

path,name,size
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2015.parquet/,2015.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2016.parquet/,2016.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2017.parquet/,2017.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2018.parquet/,2018.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2019.parquet/,2019.parquet/,0


In [5]:
# Load every yearly extract matching 201*.parquet into a single DataFrame.
# Parquet files carry their own schema, so the CSV-only "header" option is not
# needed, and the path has no placeholders, so it is a plain string literal.
airlines = spark.read.parquet("dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/201*.parquet")

In [6]:
  # Print the schema (column names and types) of the loaded airlines DataFrame.
  airlines.printSchema()

In [7]:
# Total number of rows in the airlines DataFrame, formatted with thousands separators.
f'{airlines.count():,}'

In [8]:
# Random split into train / validation / test with weights 0.8 / 0.1 / 0.1.
# A fixed seed (2020) is passed so the split can be reproduced.
airlines_train, airlines_val, airlines_test = airlines.randomSplit([0.8,0.1,0.1], seed = 2020)

In [9]:
# Count each split (one Spark job per split, in train / val / test order) and
# report the sizes together with their sum as a sanity check on the split.
train_cnt, val_cnt, test_cnt = (
    split.count() for split in (airlines_train, airlines_val, airlines_test)
)
total_cnt = sum((train_cnt, val_cnt, test_cnt))
print('airlines_train records: {}\n airlines_val records: {}\n  airlines_test records: {}\n total records: {}'.format(train_cnt, val_cnt, test_cnt, total_cnt) )

In [10]:
#display(airlines.describe())
#airlines.where('MONTH == "MONTH"').count() 
# airlines2 = airlines_train.filter(airlines_train.CANCELLED == False)
# airlines2 = airlines2.filter(airlines2.DIVERTED == False)
# airlines2.filter(airlines2['ARR_DELAY'].isNull()).count()
# airlines3 = airlines2.filter(airlines2['ARR_DELAY'].isNull() )
# airlines3.filter(airlines3['OP_UNIQUE_CARRIER'] == '9E').count()
# airlines3.filter(airlines3['OP_UNIQUE_CARRIER'].isNull()).count()
# display(airlines2.filter(airlines2['ARR_DELAY'].isNull() ))
#airlines.where(airlines.ARR_DELAY == null).count()
# for year in range(2015, 2020):
#   print(year , airlines.select('MONTH').where(f'YEAR == {year}').distinct().collect())

In [11]:
def is_Weekend(x):
  """
  Function to determine if a given day of the week is a weekend_day(Friday, Saturday, Sunday)

  Args:
    x: Numeric day-of-week code. Codes below 5 are treated as weekdays and
      codes of 5 or more as weekend days (presumably 1 = Monday ... 7 = Sunday,
      which makes 5-7 Friday-Sunday -- TODO confirm against the data dictionary).
      May be None, e.g. when used as a Spark UDF on a null value.

  Returns:
    1 for a weekend day, 0 otherwise. A missing value (None) yields 0, the
    same way is_RushHour treats a missing time, instead of raising TypeError.
  """
  if x is None:
    return 0
  return 0 if x < 5 else 1

def is_RushHour(x):
  """
  Function to determine if a given time of the day is rush hour (1600-2100)

  Args:
    x: Time of day as a number in hhmm form (e.g. 1745), or None when the
      time is missing.

  Returns:
    1 when x lies in the inclusive range 1600-2100, otherwise 0.
    A missing value (None) yields 0.
  """
  # Identity test is the idiomatic None check and cannot be fooled by a
  # custom __ne__ implementation.
  if x is None:
    return 0
  return 1 if 1600 <= x <= 2100 else 0
 
def preprocessAirlines(df):
  """
  Reduce a raw airlines DataFrame to the columns used for modelling.

  Processing steps, in order:
    1. Keep only flights where CANCELLED and DIVERTED are both False.
    2. Turn each delay-cause column (CARRIER_DELAY, WEATHER_DELAY, NAS_DELAY,
       SECURITY_DELAY, LATE_AIRCRAFT_DELAY) into a 0/1 flag: 1 when the raw
       value is non-null, 0 when it is null.
    3. Add IS_WEEKEND (from DAY_OF_WEEK), DEP_RUSH_HOUR (from DEP_TIME) and
       ARR_RUSH_HOUR (from CRS_ARR_TIME) using the is_Weekend / is_RushHour UDFs.
    4. Replace null ARR_DELAY / DEP_DELAY values with 0.
    5. Drop every input column that is not in the keep list.

  Args:
    df: Spark DataFrame containing at least the columns referenced above.

  Returns:
    A new Spark DataFrame restricted to the keep-list columns that exist in
    the input plus the three derived flag columns. Row order is unspecified.
  """
  cols_to_keep = ['MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST', 'DEP_DELAY', 'DEP_TIME_BLK', 'ARR_DELAY', 'ARR_TIME_BLK', 'CRS_ELAPSED_TIME', 'DISTANCE',  'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'IS_WEEKEND', 'DEP_RUSH_HOUR', 'ARR_RUSH_HOUR']
  delay_cause_cols = ['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']
  # Computed from the raw schema; the derived columns added below are all in
  # cols_to_keep, so they are never part of this list.
  cols_to_remove = [x for x in df.columns if x not in cols_to_keep]

  # No global sort by FL_DATE: every step below is row-wise and FL_DATE is
  # dropped at the end, so sorting only added a full shuffle without changing
  # the set of rows returned.
  df = df.filter(df.CANCELLED == False)
  df = df.filter(df.DIVERTED == False)

  # Delay-cause columns become presence flags (non-null -> 1, null -> 0).
  for cause in delay_cause_cols:
    df = df.withColumn(cause, f.when(f.col(cause).isNotNull(), 1).otherwise(0))

  # Wrap the Python helpers once and reuse the wrappers.
  weekend_udf = f.udf(is_Weekend, IntegerType())
  rush_hour_udf = f.udf(is_RushHour, IntegerType())
  df = df.withColumn("IS_WEEKEND", weekend_udf("DAY_OF_WEEK"))
  # NOTE(review): departure uses the actual DEP_TIME while arrival uses the
  # scheduled CRS_ARR_TIME -- confirm this asymmetry is intended.
  df = df.withColumn("DEP_RUSH_HOUR", rush_hour_udf("DEP_TIME"))
  df = df.withColumn("ARR_RUSH_HOUR", rush_hour_udf("CRS_ARR_TIME"))

  df = df.fillna(0, subset=['ARR_DELAY', 'DEP_DELAY'])
  preprocessAirlines_df = df.drop(*cols_to_remove)
  return preprocessAirlines_df

In [12]:
# Apply the preprocessing to the training split only and inspect the resulting schema.
airlines_train_df =  preprocessAirlines(airlines_train)
airlines_train_df.printSchema()

In [13]:
# Peek at a tiny random sample (without replacement, fraction 1e-7).
# No seed is passed, so the rows shown can differ between runs.
display(airlines_train_df.sample(False, 0.0000001))

MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,DEST,DEP_DELAY,DEP_TIME_BLK,ARR_DELAY,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,IS_WEEKEND,DEP_RUSH_HOUR,ARR_RUSH_HOUR
8,7,UA,PHL,IAH,2.0,0001-0559,-17.0,0700-0759,202.0,1325.0,0,0,0,0,0,1,0,0
8,6,AS,FAI,SEA,-4.0,0600-0659,26.0,1100-1159,212.0,1533.0,1,1,1,1,1,1,0,0
12,4,WN,HOU,MCO,11.0,0600-0659,-2.0,0900-0959,130.0,849.0,0,0,0,0,0,0,0,0


In [14]:
def nullDataFrame(df):
  """
  Summarise how many nulls each column of a Spark DataFrame contains.

  One filter-and-count is issued per column, plus one count for the total.

  Args:
    df: Spark DataFrame to inspect.

  Returns:
    pandas DataFrame with one row per input column and the columns
    'Feature_Name' (str), 'Null_Counts' (int) and 'Percentage_Null_Counts'
    (float, rounded to two decimals). For an empty input the percentage is
    reported as 0.0.
  """
  total_rows = df.count()
  null_feature_list = []
  for column in df.columns:
    nulls = df.filter(df[column].isNull()).count()
    # Guard against an empty DataFrame, which would otherwise divide by zero.
    nulls_perct = np.round((nulls / total_rows) * 100, 2) if total_rows else 0.0
    null_feature_list.append([column, nulls, nulls_perct])
  # Build the frame straight from the row list: routing it through np.array
  # would coerce the mixed str/int/float rows to strings and leave the count
  # and percentage columns non-numeric.
  nullCounts_df = pd.DataFrame(null_feature_list, columns=['Feature_Name', 'Null_Counts', 'Percentage_Null_Counts'])
  return nullCounts_df


# Null summary of the preprocessed training data; the bare last line renders the table.
nullCounts_df = nullDataFrame(airlines_train_df)
nullCounts_df

Unnamed: 0,Feature_Name,Null_Counts,Percentage_Null_Counts
0,MONTH,0,0.0
1,DAY_OF_WEEK,0,0.0
2,OP_UNIQUE_CARRIER,0,0.0
3,ORIGIN,0,0.0
4,DEST,0,0.0
5,DEP_DELAY,0,0.0
6,DEP_TIME_BLK,0,0.0
7,ARR_DELAY,0,0.0
8,ARR_TIME_BLK,0,0.0
9,CRS_ELAPSED_TIME,0,0.0


In [15]:
# Row count of the preprocessed training data, formatted with thousands separators.
f'{airlines_train_df.count():,}'

In [16]:
# (airlines_train_df.select('ORIGIN').distinct().count())
# (airlines_train_df.select('DEST').distinct().count())
# (airlines_train_df.select('OP_UNIQUE_CARRIER').distinct().count())

## Feature Engineering & Feature Selection:

In [18]:
# Columns whose Spark dtype string is exactly 'int' or 'double' are treated as
# numeric predictors; ARR_DELAY is the regression target, so it is taken out.
numeric_features = [
    name
    for name, dtype in airlines_train_df.dtypes
    if dtype in ('int', 'double')
]
numeric_features.remove('ARR_DELAY')
numeric_features

In [19]:
# String-typed columns are the categorical predictors to be indexed and one-hot encoded.
categorical_features = [
    name
    for name, dtype in airlines_train_df.dtypes
    if dtype == 'string'
]
categorical_features

In [20]:
# Build the feature pipeline: for each categorical column a StringIndexer
# followed by a one-hot encoder, then a VectorAssembler that concatenates the
# encoded vectors with the numeric columns into a single "features" vector.
stages = []
for categoricalCol in categorical_features:
    stringIndexer = StringIndexer(
        inputCol=categoricalCol,
        outputCol=f"{categoricalCol}Index",
    )
    encoder = OneHotEncoderEstimator(
        inputCols=[stringIndexer.getOutputCol()],
        outputCols=[f"{categoricalCol}classVec"],
    )
    stages.extend([stringIndexer, encoder])

assemblerInputs = [f"{c}classVec" for c in categorical_features] + numeric_features
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages.append(assembler)

In [21]:
# Fit the indexing / encoding / assembling stages on the training data, then
# apply the fitted pipeline to the same data to obtain the "features" column.
pipeline = Pipeline(stages=stages)
feature_selection_pipeline = pipeline.fit(airlines_train_df)
vector_airlines_train_df = feature_selection_pipeline.transform(airlines_train_df)
vector_airlines_train_df.printSchema()

In [22]:
# from pyspark.ml.feature import VectorAssembler
# vectorAssembler = VectorAssembler(inputCols = numeric_features, outputCol = 'features')
# vector_airlines_preprocessed_filtered = vectorAssembler.transform(airlines_preprocessed_filtered)
# vector_airlines_preprocessed_filtered = vector_airlines_preprocessed_filtered.select(['features', 'ARR_DELAY'])
# display(vector_airlines_preprocessed_filtered)

## Linear Regression:

In [24]:
# lr = LinearRegression(featuresCol = 'features', labelCol='ARR_DELAY')
# lr_model = lr.fit(train_df)
#print("Coefficients: " + str(lr_model.coefficients))
#print("\nIntercept: " + str(lr_model.intercept))

In [25]:
# print(lr_model.explainParams())

In [26]:
# regression_evaluator_r2 = RegressionEvaluator(predictionCol="prediction", labelCol="ARR_DELAY",metricName="r2")
# regression_evaluator_rmse = RegressionEvaluator(predictionCol="prediction", labelCol="ARR_DELAY",metricName="rmse")
# regression_evaluator_mae = RegressionEvaluator(predictionCol="prediction", labelCol="ARR_DELAY",metricName="mae")

# regression_metrics_list = []

In [27]:
# # train_df evaluation metrics
# lr_predictions_train = lr_model.transform(train_df)  # lr_predictions_train.select("prediction","ARR_DELAY","features").show(5)
# lr_train_r2 = regression_evaluator_r2.evaluate(lr_predictions_train)
# lr_train_rmse = regression_evaluator_rmse.evaluate(lr_predictions_train)
# lr_train_mae = regression_evaluator_mae.evaluate(lr_predictions_train)
# regression_metrics_list.append(["LinearRegression_TrainData", lr_train_r2, lr_train_rmse, lr_train_mae ])


# # val_df evaluation metrics
# lr_predictions_val = lr_model.transform(val_df)
# lr_val_r2 = regression_evaluator_r2.evaluate(lr_predictions_val)
# lr_val_rmse = regression_evaluator_rmse.evaluate(lr_predictions_val)
# lr_val_mae = regression_evaluator_mae.evaluate(lr_predictions_val)
# regression_metrics_list.append(["LinearRegression_ValData", lr_val_r2, lr_val_rmse, lr_val_mae ])

In [28]:
# from  pyspark.sql.functions import abs
# residuals = trainingSummary.residuals
# abs_residuals = residuals.withColumn('abs_residuals',abs(residuals.residuals))
# display(abs_residuals.show(5))
# display(abs_residuals.describe().show())
# abs_residuals_summary = abs_residuals.describe()
# display(abs_residuals_summary)
# MEA_train_df = abs_residuals_summary.select('abs_residuals').toPandas().iloc[1]
# print('Mean Absolute Error of train_df: ', MEA_train_df)

In [29]:
# lr_predictions_test = lr_model.transform(test_df)
# lr_predictions_test.select("prediction","ARR_DELAY","features").show(5)

# from pyspark.ml.evaluation import RegressionEvaluator
# lr_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="ARR_DELAY",metricName="r2")
# print("R Squared (R2) on val_data = %g" % lr_evaluator.evaluate(lr_predictions_test))

## Cross Validation:

In [31]:
# Keep only what the estimators need: the target renamed to "label" (the
# default label column of the evaluators) and the assembled feature vector.
label_and_features = [col("ARR_DELAY").alias("label"), col("features")]
train_df2 = vector_airlines_train_df.select(*label_and_features)
train_df2.show(2)

# val_df2 = val_df.select(col("ARR_DELAY").alias("label"), col("features"))
# val_df2.show(2)

In [32]:
lr = LinearRegression(featuresCol = 'features', labelCol='label')

# Only regParam is tuned for now. Further candidates that were tried:
#   .addGrid(lr.maxIter, [3, 10])
#   .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

# 5-fold cross validation scored with a default RegressionEvaluator.
# The seed fixes the fold assignment so the selected model can be reproduced;
# 2020 matches the seed used for the train/val/test split.
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(),
                          numFolds=5,
                          seed=2020)

cvModel = crossval.fit(train_df2)

#prediction = cvModel.transform(val_df2)


In [33]:
# One evaluator per reported metric, all reading the "prediction" and "label" columns.
regression_evaluator_r2, regression_evaluator_rmse, regression_evaluator_mae = (
    RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName=metric)
    for metric in ("r2", "rmse", "mae")
)

# Accumulates one [name, R^2, RMSE, MAE] row per evaluated model.
regression_metrics_list = []

In [34]:
# train_df evaluation metrics
lr_predictions_train = cvModel.transform(train_df2)
lr_train_r2 = regression_evaluator_r2.evaluate(lr_predictions_train)
lr_train_rmse = regression_evaluator_rmse.evaluate(lr_predictions_train)
lr_train_mae = regression_evaluator_mae.evaluate(lr_predictions_train)
regression_metrics_list.append(["LinearRegression_TrainData_CV", lr_train_r2, lr_train_rmse, lr_train_mae ])


In [35]:
# # val_df evaluation metrics
# lr_val_r2 = regression_evaluator_r2.evaluate(prediction)
# lr_val_rmse = regression_evaluator_rmse.evaluate(prediction)
# lr_val_mae = regression_evaluator_mae.evaluate(prediction)
# regression_metrics_list.append(["LinearRegression_ValData_CV", lr_val_r2, lr_val_rmse, lr_val_mae ])

In [36]:
# Tabulate the metric rows collected so far (one row per evaluated model).
regression_metrics_df = pd.DataFrame(regression_metrics_list, columns = ['Model_Data' , 'R^2', 'RMSE', 'MAE']) 
display(regression_metrics_df)

In [37]:
# Retrieve the model selected by cross validation and show its parameter map.
bestLRModel = cvModel.bestModel
bestParams = bestLRModel.extractParamMap()
bestParams

https://runawayhorse001.github.io/LearningApacheSpark/reg.html

In [39]:
# extractParamMap() returns a dict keyed by Param objects. Iterating the dict
# itself yields keys only, so `.items()` is required to unpack (param, value)
# pairs. The parameter name is printed next to its value for readability.
for k, v in bestParams.items():
  print(f'{k.name}: {v}')

## Decision Tree Regressor

In [41]:
# dt = DecisionTreeRegressor(featuresCol="features", labelCol='ARR_DELAY')  # Train a DecisionTree model.
# dt_model = dt.fit(train_df)

In [42]:
# print(dt_model.explainParams())

In [43]:
# # train_df evaluation metrics
# dt_predictions_train = dt_model.transform(train_df)  
# dt_train_r2 = regression_evaluator_r2.evaluate(dt_predictions_train)
# dt_train_rmse = regression_evaluator_rmse.evaluate(dt_predictions_train)
# dt_train_mae = regression_evaluator_mae.evaluate(dt_predictions_train)
# regression_metrics_list.append(["DecisionTreeRegressor_TrainData", dt_train_r2, dt_train_rmse, dt_train_mae ])


# # val_df evaluation metrics
# dt_predictions_val = dt_model.transform(val_df)
# dt_val_r2 = regression_evaluator_r2.evaluate(dt_predictions_val)
# dt_val_rmse = regression_evaluator_rmse.evaluate(dt_predictions_val)
# dt_val_mae = regression_evaluator_mae.evaluate(dt_predictions_val)
# regression_metrics_list.append(["DecisionTreeRegressor_ValData", dt_val_r2, dt_val_rmse, dt_val_mae ])

In [44]:
# display(dt_model) 

In [45]:
dt = DecisionTreeRegressor(featuresCol="features", labelCol='label') 

# Only maxBins is tuned.
paramGrid_dt = ParamGridBuilder()\
    .addGrid(dt.maxBins, [10, 32]) \
    .build()  

# 5-fold cross validation scored with a default RegressionEvaluator.
# The seed fixes the fold assignment so the selected model can be reproduced;
# 2020 matches the seed used for the train/val/test split.
crossval_dt = CrossValidator(estimator=dt,
                          estimatorParamMaps=paramGrid_dt,
                          evaluator=RegressionEvaluator(),
                          numFolds=5,
                          seed=2020) 

cvModel_dt = crossval_dt.fit(train_df2)

In [46]:
# Score the cross-validated decision tree on the training data (R^2, RMSE,
# MAE, in that order) and record the row for the comparison table.
dt_predictions_train = cvModel_dt.transform(train_df2)
dt_train_r2, dt_train_rmse, dt_train_mae = (
    evaluator.evaluate(dt_predictions_train)
    for evaluator in (regression_evaluator_r2, regression_evaluator_rmse, regression_evaluator_mae)
)
regression_metrics_list.append(
    ["DecisionTreeRegressor_TrainData_CV", dt_train_r2, dt_train_rmse, dt_train_mae]
)

In [47]:
# Retrieve the decision tree selected by cross validation and show its parameter map.
bestDTModel = cvModel_dt.bestModel
bestParams_dt = bestDTModel.extractParamMap()
bestParams_dt

## Random Forest Regressor

In [49]:
# rf = RandomForestRegressor(featuresCol="features", labelCol='ARR_DELAY')
# rf_model = rf.fit(train_df)

In [50]:
# print(rf_model.explainParams())

In [51]:
# # train_df evaluation metrics
# rf_predictions_train = rf_model.transform(train_df)  
# rf_train_r2 = regression_evaluator_r2.evaluate(rf_predictions_train)
# rf_train_rmse = regression_evaluator_rmse.evaluate(rf_predictions_train)
# rf_train_mae = regression_evaluator_mae.evaluate(rf_predictions_train)
# regression_metrics_list.append(["RandomForestRegressor_TrainData", rf_train_r2, rf_train_rmse, rf_train_mae ])


# # val_df evaluation metrics
# rf_predictions_val = rf_model.transform(val_df)
# rf_val_r2 = regression_evaluator_r2.evaluate(rf_predictions_val)
# rf_val_rmse = regression_evaluator_rmse.evaluate(rf_predictions_val)
# rf_val_mae = regression_evaluator_mae.evaluate(rf_predictions_val)
# regression_metrics_list.append(["RandomForestRegressor_ValData", rf_val_r2, rf_val_rmse, rf_val_mae ])

In [52]:
# The forest samples rows and features at random, so its seed is pinned as well.
rf = RandomForestRegressor(featuresCol="features", labelCol='label', seed=2020)

# Only maxBins is tuned.
paramGrid_rf = ParamGridBuilder()\
    .addGrid(rf.maxBins, [10, 32]) \
    .build()  

# 5-fold cross validation scored with a default RegressionEvaluator.
# The seed fixes the fold assignment so the selected model can be reproduced;
# 2020 matches the seed used for the train/val/test split.
crossval_rf = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid_rf,
                          evaluator=RegressionEvaluator(),
                          numFolds=5,
                          seed=2020) 

cvModel_rf = crossval_rf.fit(train_df2)

In [53]:
# Score the cross-validated random forest on the training data (R^2, RMSE,
# MAE, in that order) and record the row for the comparison table.
rf_predictions_train = cvModel_rf.transform(train_df2)
rf_train_r2, rf_train_rmse, rf_train_mae = (
    evaluator.evaluate(rf_predictions_train)
    for evaluator in (regression_evaluator_r2, regression_evaluator_rmse, regression_evaluator_mae)
)
regression_metrics_list.append(
    ["RandomForestRegressor_TrainData_CV", rf_train_r2, rf_train_rmse, rf_train_mae]
)

In [54]:
# Retrieve the random forest selected by cross validation and show its parameter map.
bestRFModel = cvModel_rf.bestModel
bestParams_rf = bestRFModel.extractParamMap()
bestParams_rf

## Gradient-Boosted Trees

In [56]:
# gbt = GBTRegressor(featuresCol="features", labelCol='ARR_DELAY')
# gbt_model = gbt.fit(train_df)

In [57]:
# Disabled: `gbt_model` is only created by the commented-out fit cell above, so
# running this on a fresh kernel raises NameError. Kept commented out like the
# matching cells in the Decision Tree and Random Forest sections.
# print(gbt_model.explainParams())

In [58]:
# # train_df evaluation metrics
# gbt_predictions_train = gbt_model.transform(train_df)  
# gbt_train_r2 = regression_evaluator_r2.evaluate(gbt_predictions_train)
# gbt_train_rmse = regression_evaluator_rmse.evaluate(gbt_predictions_train)
# gbt_train_mae = regression_evaluator_mae.evaluate(gbt_predictions_train)
# regression_metrics_list.append(["GradientBoostedTreeRegressor_TrainData", gbt_train_r2, gbt_train_rmse, gbt_train_mae ])


# # val_df evaluation metrics
# gbt_predictions_val = gbt_model.transform(val_df)
# gbt_val_r2 = regression_evaluator_r2.evaluate(gbt_predictions_val)
# gbt_val_rmse = regression_evaluator_rmse.evaluate(gbt_predictions_val)
# gbt_val_mae = regression_evaluator_mae.evaluate(gbt_predictions_val)
# regression_metrics_list.append(["GradientBoostedTreeRegressor_ValData", gbt_val_r2, gbt_val_rmse, gbt_val_mae ])

In [59]:
# Seed pinned so any randomness inside the boosted trees is reproducible.
gbt = GBTRegressor(featuresCol="features", labelCol='label', seed=2020)

# Only maxBins is tuned.
paramGrid_gbt = ParamGridBuilder()\
    .addGrid(gbt.maxBins, [10, 32]) \
    .build()  

# 5-fold cross validation scored with a default RegressionEvaluator.
# The seed fixes the fold assignment so the selected model can be reproduced;
# 2020 matches the seed used for the train/val/test split.
crossval_gbt = CrossValidator(estimator=gbt,
                          estimatorParamMaps=paramGrid_gbt,
                          evaluator=RegressionEvaluator(),
                          numFolds=5,
                          seed=2020) 

cvModel_gbt = crossval_gbt.fit(train_df2)

In [60]:
# Score the cross-validated gradient-boosted trees on the training data
# (R^2, RMSE, MAE, in that order) and record the row for the comparison table.
gbt_predictions_train = cvModel_gbt.transform(train_df2)
gbt_train_r2, gbt_train_rmse, gbt_train_mae = (
    evaluator.evaluate(gbt_predictions_train)
    for evaluator in (regression_evaluator_r2, regression_evaluator_rmse, regression_evaluator_mae)
)
regression_metrics_list.append(
    ["GradientBoostedTreeRegressor_TrainData_CV", gbt_train_r2, gbt_train_rmse, gbt_train_mae]
)

In [62]:
# Final comparison table: one row per model evaluated on the training data.
regression_metrics_df = pd.DataFrame(regression_metrics_list, columns = ['Model_Data' , 'R^2', 'RMSE', 'MAE']) 
display(regression_metrics_df)

Model_Data,R^2,RMSE,MAE
LinearRegression_TrainData_CV,0.9333326713511594,11.773567834606734,8.449675674480563
DecisionTreeRegressor_TrainData_CV,0.6987655312918775,24.970621841550475,10.28041461617062
RandomForestRegressor_TrainData_CV,0.6870668540352702,25.517067370460445,10.88919230251608
GradientBoostedTreeRegressor_TrainData_CV,0.7152746349749126,24.31873958382748,9.978794206252733


### Ignore Below

In [64]:
# `airlines_preprocessed_filtered` is not defined anywhere in this notebook;
# the preprocessed, cancellation/diversion-filtered frame is airlines_train_df.
# Draw a seeded 0.01% sample (without replacement) and bring it to pandas.
sample_airlines_df = airlines_train_df.sample(False, 0.0001, 2020)
pandas_airlines_df = sample_airlines_df.toPandas()

In [65]:
#pandas_airlines_df[['ARR_DELAY', 'ARR_TIME', 'DEP_DELAY', 'DEP_TIME', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']][pandas_airlines_df.DEP_DELAY < 0 ].head(20)

In [66]:
# `airlines_preprocessed_filtered` is not defined anywhere in this notebook;
# the preprocessed, cancellation/diversion-filtered frame is airlines_train_df.
nullCounts_df2 = nullDataFrame(airlines_train_df)
nullCounts_df2

In [67]:
# Columns selected for the sampled pandas frame used in the cells below.
features = ['ARR_DELAY', 'DEP_DELAY', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'IS_WEEKEND', 'DEP_RUSH_HOUR', 'ARR_RUSH_HOUR']
# airlines_filtered[Delay_List].describe().show()

In [68]:
# `airlines_preprocessed_filtered` is not defined anywhere in this notebook;
# the preprocessed, cancellation/diversion-filtered frame is airlines_train_df.
# Draw a seeded 0.1% sample of the selected columns and bring it to pandas.
sample_airlines_df = airlines_train_df.select(features).sample(False, 0.001, 2020)
pandas_df = sample_airlines_df.toPandas()

In [69]:
# Pairwise correlation matrix of the sampled columns (pandas default: Pearson).
pandas_df.corr()

In [70]:
# One histogram per sampled column, each with a logarithmic count axis.
# pandas creates the subplot grid itself: handing it a single Axes for several
# columns makes it clear the figure, and plt.yscale() would only affect the
# current axes rather than every subplot. `log=True` is forwarded to
# matplotlib's hist() for each subplot instead.
axes = pandas_df.hist(bins=30, figsize=(15, 20), log=True)
fig = axes.flat[0].get_figure()
# Display the figure object itself; plt.show() returns None.
display(fig)

# Weather
https://data.nodc.noaa.gov/cgi-bin/iso?id=gov.noaa.ncdc:C00532

In [72]:
# List the weather Parquet directory to see which files are available.
dbutils.fs.ls("dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_weather_data")

In [73]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType
# Explicit schema for the weather data: every field is declared as a nullable
# string, in the order listed here.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        'STATION', 'DATE', 'SOURCE', 'LATITUDE', 'LONGITUDE', 'ELEVATION',
        'NAME', 'REPORT_TYPE', 'CALL_SIGN', 'QUALITY_CONTROL',
        'WND', 'CIG', 'VIS', 'TMP', 'DEW', 'SLP',
        'AA1', 'AA2', 'AJ1', 'AY1', 'AY2',
        'GA1', 'GA2', 'GA3', 'GE1', 'GF1',
        'IA1', 'KA1', 'KA2', 'MA1', 'MD1', 'MW1',
        'OC1', 'OD1', 'SA1', 'UA1', 'REM', 'EQD',
    )
])



In [74]:
# Load every weather extract matching 201*a.parquet with the explicit
# all-string schema. The CSV-only "header" option has no effect on Parquet and
# is dropped; the path has no placeholders, so it is a plain string literal.
# NOTE(review): a user-supplied schema must be compatible with the types stored
# in the Parquet files -- confirm the files really hold strings.
weather = (
    spark.read
    .schema(schema)
    .parquet("dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_weather_data/201*a.parquet")
)
weather.count()


In [75]:
# Show rows whose DATE value is the literal string "DATE" -- presumably a check
# for header lines that leaked into the data; TODO confirm.
display(weather.where('DATE =="DATE"'))

In [76]:
#display(weather.describe())

In [77]:
# Null summary for every weather column; nullDataFrame runs one count per
# column, so this scans the weather data once per field.
nullCounts_weather_df = nullDataFrame(weather)
nullCounts_weather_df

In [78]:
#display(weather.sample(False, 0.0000001))

In [79]:
# Summary statistics of the raw DATE column.
weather.select("DATE").describe().show()

In [80]:
# Add a DateType copy of the string DATE column and compare the first ten
# values side by side without truncation.
date_as_datetype = weather['DATE'].cast(DateType())
weather_df = weather.withColumn("DATE_IN_DATEFORMAT", date_as_datetype)
weather_df.select('DATE_IN_DATEFORMAT', 'DATE').show(10, False)

In [81]:
# Peek at a tiny random sample (without replacement, fraction 1e-7).
# No seed is passed, so the rows shown can differ between runs.
display(weather_df.sample(False, 0.0000001))

In [82]:
# Join Airlines data and Weather data by DATE and AIRPORT

# Stations

In [84]:
# Read the gzipped stations CSV, taking column names from its header row.
# No schema is given and inferSchema is not enabled.
stations = spark.read.option("header", "true").csv("dbfs:/mnt/mids-w261/data/DEMO8/gsod/stations.csv.gz")

In [85]:
# Render the stations table.
display(stations)

In [86]:
# Re-import of the functions module (already imported as `f` in the imports cell).
from pyspark.sql import functions as f
# Stations whose name contains the given text. No action such as show() or
# display() is called; the filtered DataFrame is simply the cell's last expression.
stations.where(f.col('name').contains('JAN MAYEN NOR NAVY'))

In [87]:
# Number of distinct station names in the stations table.
stations.select('name').distinct().count()

In [88]:
# Render the distinct station names.
display(stations.select('name').distinct())

In [89]:
# Number of distinct NAME values in the weather data, for comparison with the
# distinct station-name count from the stations table.
weather.select('NAME').distinct().count()

In [90]:
#display(weather.select('name').distinct())