# Airline delays 
## Bureau of Transportation Statistics
https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236   
https://www.bts.gov/topics/airlines-and-airports/understanding-reporting-causes-flight-delays-and-cancellations

2015 - 2019

### Additioinal sources
This might be useful in matching station codes to airports:
1. http://dss.ucar.edu/datasets/ds353.4/inventories/station-list.html
2. https://www.world-airport-codes.com/

In [3]:
# imports
import re
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
sqlContext = SQLContext(sc)


In [4]:
display(dbutils.fs.ls("dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data"))

path,name,size
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2015.parquet/,2015.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2016.parquet/,2016.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2017.parquet/,2017.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2018.parquet/,2018.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2019.parquet/,2019.parquet/,0


In [5]:
airlines = spark.read.option("header", "true").parquet(f"dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/201*.parquet")
#display(airlines.sample(False, 0.00001))

In [6]:
  airlines.printSchema()

In [7]:
f'{airlines.count():,}'

In [8]:
#display(airlines.describe())

In [9]:
airlines.where('MONTH == "MONTH"').count()

In [10]:
for year in range(2015, 2020):
  print(year , airlines.select('MONTH').where(f'YEAR == {year}').distinct().collect())

In [11]:
def nullDataFrame(df):
  null_feature_list = []
  count = df.count()
  for column in df.columns:
    nulls = df.filter(df[column].isNull()).count()
    nulls_perct = np.round((nulls/count)*100, 2)
    null_feature_list.append([column, nulls, nulls_perct])
  nullCounts_df = pd.DataFrame(np.array(null_feature_list), columns=['Feature_Name', 'Null_Counts', 'Percentage_Null_Counts'])
  return nullCounts_df

In [12]:
#airlines.orderBy("FL_DATE").show(5)


In [13]:
def is_Weekend(x):
  """
  Function to determine if a given day of the week is a weekend_day(Friday, Saturday, Sunday)
  """
  if   x < 5: 
    return 0
  else: 
    return 1

def is_RushHour(x):
  """
  Function to determine if a given time of the day is rush hour (1600-2100)
  """
  if (x != None) and (x >= 1600) and (x <= 2100): 
    return 1
  else: 
    return 0
    
def preprocessAirlines(df):
  cols_to_keep = ['MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST', 'DEP_DELAY', 'DEP_TIME_BLK', 'ARR_DELAY', 'ARR_TIME_BLK', 'CRS_ELAPSED_TIME', 'DISTANCE',  'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'IS_WEEKEND', 'DEP_RUSH_HOUR', 'ARR_RUSH_HOUR']
  cols_to_remove = [x for x in df.columns if x not in cols_to_keep]
  df = df.orderBy("FL_DATE")  
  df = df.fillna({'CARRIER_DELAY': 0, 'WEATHER_DELAY': 0, 'NAS_DELAY': 0, 'SECURITY_DELAY': 0, 'LATE_AIRCRAFT_DELAY': 0 })
  df = df.withColumn("IS_WEEKEND", f.udf(is_Weekend, IntegerType())("DAY_OF_WEEK"))
  df = df.withColumn("DEP_RUSH_HOUR", f.udf(is_RushHour, IntegerType())("DEP_TIME"))
  df = df.withColumn("ARR_RUSH_HOUR", f.udf(is_RushHour, IntegerType())("CRS_ARR_TIME"))
  preprocessAirlines_df = df.drop(*cols_to_remove)
  return preprocessAirlines_df

In [14]:
airlines_df =  preprocessAirlines(airlines)
airlines_df.printSchema()

In [15]:
display(airlines_df.sample(False, 0.0000001))

MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,DEST,DEP_DELAY,DEP_TIME_BLK,ARR_DELAY,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,IS_WEEKEND,DEP_RUSH_HOUR,ARR_RUSH_HOUR
1,5,DL,SLC,PDX,-4.0,1100-1159,-11.0,1200-1259,124.0,630.0,0.0,0.0,0.0,0.0,0.0,1,0,0
7,3,AA,DCA,BOS,1.0,0900-0959,-6.0,1100-1159,92.0,399.0,0.0,0.0,0.0,0.0,0.0,0,0,0
11,6,YX,DCA,EWR,-9.0,0700-0759,-18.0,0800-0859,74.0,199.0,0.0,0.0,0.0,0.0,0.0,1,0,0


In [16]:
type(airlines_df)

In [17]:
nullCounts_df = nullDataFrame(airlines_df)
nullCounts_df

Unnamed: 0,Feature_Name,Null_Counts,Percentage_Null_Counts
0,MONTH,0,0.0
1,DAY_OF_WEEK,0,0.0
2,OP_UNIQUE_CARRIER,0,0.0
3,ORIGIN,0,0.0
4,DEST,0,0.0
5,DEP_DELAY,477296,1.5
6,DEP_TIME_BLK,0,0.0
7,ARR_DELAY,570640,1.8
8,ARR_TIME_BLK,0,0.0
9,CRS_ELAPSED_TIME,164,0.0


In [18]:
# 81.73% of records have null/NaN values for [CARRIER_DELAY, WEATHER_DELAY, NAS_DELAY, SECURITY_DELAY, LATE_AIRCRAFT_DELAY ]
airlines_preprocessed_filtered = airlines_df.filter(airlines['ARR_DELAY'].isNotNull() )
airlines_preprocessed_filtered = airlines_preprocessed_filtered.filter(airlines['DEP_DELAY'].isNotNull() )
f'{airlines_preprocessed_filtered.count():,}'

In [19]:
nullCounts_df2 = nullDataFrame(airlines_preprocessed_filtered)
nullCounts_df2

Unnamed: 0,Feature_Name,Null_Counts,Percentage_Null_Counts
0,MONTH,0,0.0
1,DAY_OF_WEEK,0,0.0
2,OP_UNIQUE_CARRIER,0,0.0
3,ORIGIN,0,0.0
4,DEST,0,0.0
5,DEP_DELAY,0,0.0
6,DEP_TIME_BLK,0,0.0
7,ARR_DELAY,0,0.0
8,ARR_TIME_BLK,0,0.0
9,CRS_ELAPSED_TIME,0,0.0


In [20]:
display(airlines_preprocessed_filtered.describe())

summary,MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,DEST,DEP_DELAY,DEP_TIME_BLK,ARR_DELAY,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,IS_WEEKEND,DEP_RUSH_HOUR,ARR_RUSH_HOUR
count,31176201.0,31176201.0,31176201,31176201,31176201,31171613.0,31176201,31176201.0,31176201,31176201.0,31176201.0,5799114.0,5799114.0,5799114.0,5799114.0,5799114.0,31176201.0,31176201.0,31176201.0
mean,6.568153669525033,3.936863506878211,,,,9.784760320231102,,4.615475952313754,,143.39964208596166,825.1078288210933,19.98459350859459,3.2259498606166392,15.44036813209742,0.0891679315150555,25.364284785572416,0.4127280293067138,0.2794518164673111,0.3078280448602445
stddev,3.39745307682337,1.9914611932350432,,,,43.31986119785449,,45.59418015238945,,74.8523322855301,608.6626085549944,59.307979706257626,26.812025382335808,34.739082338772576,2.9147981743398224,48.60358147038255,0.4923247006863924,0.4487298799936953,0.4615950026724097
min,1.0,1.0,9E,ABE,ABE,-234.0,0001-0559,-238.0,0001-0559,-99.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,12.0,7.0,YX,YUM,YUM,2710.0,2300-2359,2695.0,2300-2359,813.0,5095.0,2695.0,2692.0,1848.0,1078.0,2454.0,1.0,1.0,1.0


In [21]:
numeric_features = [x[0] for x in airlines_preprocessed_filtered.dtypes if x[1] == 'int' or x[1] == 'double']
numeric_features.remove('ARR_DELAY')
numeric_features

In [22]:
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols = numeric_features, outputCol = 'features')
vector_airlines_preprocessed_filtered = vectorAssembler.transform(airlines_preprocessed_filtered)
vector_airlines_preprocessed_filtered = vector_airlines_preprocessed_filtered.select(['features', 'ARR_DELAY'])
display(vector_airlines_preprocessed_filtered.show())

In [23]:
train_df, val_df, test_df = vector_airlines_preprocessed_filtered.randomSplit([0.8,0.1,0.1], seed = 2020)
display(train_df.show(5))

In [24]:
# train_df, val_df, test_df = airlines_preprocessed_filtered.randomSplit([0.8,0.1,0.1], seed = 2020)

In [25]:
train_cnt = train_df.count()
val_cnt = val_df.count()
test_cnt = test_df.count()
total_cnt = train_cnt + val_cnt + test_cnt

In [26]:
print('train_df records: {}\n val_df records: {}\n test_df records: {}\n total records: {}'.format(train_cnt, val_cnt, test_cnt, total_cnt) )

In [27]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol = 'features', labelCol='ARR_DELAY', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

In [28]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

In [29]:
print("numIterations: %d" % trainingSummary.totalIterations)
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

In [30]:
from  pyspark.sql.functions import abs
residuals = trainingSummary.residuals
abs_residuals = residuals.withColumn('abs_residuals',abs(residuals.residuals))
display(abs_residuals.show(5))
display(abs_residuals.describe().show())

In [31]:
abs_residuals_summary = abs_residuals.describe()
display(abs_residuals_summary)

summary,residuals,abs_residuals
count,24938973.0,24938973.0
mean,0.0005647702874134274,7.784774118956812
stddev,9.96341009774851,6.218265882194345
min,-255.19745577605957,3.470878709777025e-06
max,197.40831011880897,255.19745577605957


In [32]:
MEA_train_df = abs_residuals_summary.select('abs_residuals').toPandas().iloc[1]
print('Mean Absolute Error of train_df: ', MEA_train_df)

In [33]:
lr_predictions_val = lr_model.transform(val_df)
lr_predictions_val.select("prediction","ARR_DELAY","features").show(5)

from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="ARR_DELAY",metricName="r2")
print("R Squared (R2) on val_data = %g" % lr_evaluator.evaluate(lr_predictions_val))

In [34]:
lr_predictions_test = lr_model.transform(test_df)
lr_predictions_test.select("prediction","ARR_DELAY","features").show(5)

from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="ARR_DELAY",metricName="r2")
print("R Squared (R2) on val_data = %g" % lr_evaluator.evaluate(lr_predictions_test))

In [35]:
sample_airlines_df = airlines_preprocessed_filtered.sample(False, 0.0001, 2020)
pandas_airlines_df = sample_airlines_df.toPandas()

In [36]:
#pandas_airlines_df[['ARR_DELAY', 'ARR_TIME', 'DEP_DELAY', 'DEP_TIME', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']][pandas_airlines_df.DEP_DELAY < 0 ].head(20)

In [37]:
nullCounts_df2 = nullDataFrame(airlines_preprocessed_filtered)
nullCounts_df2

Unnamed: 0,Feature_Name,Null_Counts,Percentage_Null_Counts
0,MONTH,0,0.0
1,DAY_OF_WEEK,0,0.0
2,OP_UNIQUE_CARRIER,0,0.0
3,ORIGIN,0,0.0
4,DEST,0,0.0
5,DEP_DELAY,4588,0.01
6,DEP_TIME_BLK,0,0.0
7,ARR_DELAY,0,0.0
8,ARR_TIME_BLK,0,0.0
9,CRS_ELAPSED_TIME,0,0.0


In [38]:
features = ['ARR_DELAY', 'DEP_DELAY', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'IS_WEEKEND', 'DEP_RUSH_HOUR', 'ARR_RUSH_HOUR']
# airlines_filtered[Delay_List].describe().show()

In [39]:
sample_airlines_df = airlines_preprocessed_filtered.select(features).sample(False, 0.001, 2020)
pandas_df = sample_airlines_df.toPandas()

In [40]:
pandas_df.corr()

Unnamed: 0,ARR_DELAY,DEP_DELAY,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,IS_WEEKEND,DEP_RUSH_HOUR,ARR_RUSH_HOUR
ARR_DELAY,1.0,0.953464,0.640308,0.248526,0.288776,0.009273,0.491252,-0.006583,0.06095,0.053482
DEP_DELAY,0.953464,1.0,0.645558,0.230005,0.177649,0.015039,0.521432,-1.7e-05,0.061483,0.049762
CARRIER_DELAY,0.640308,0.645558,1.0,-0.040552,-0.096505,-0.012682,-0.085862,0.035752,-0.053316,-0.056121
WEATHER_DELAY,0.248526,0.230005,-0.040552,1.0,-0.007366,-0.004658,-0.034703,-0.001323,0.00138,-0.008848
NAS_DELAY,0.288776,0.177649,-0.096505,-0.007366,1.0,-0.014785,-0.145014,-0.043513,0.003826,0.054213
SECURITY_DELAY,0.009273,0.015039,-0.012682,-0.004658,-0.014785,1.0,-0.019517,0.012748,-0.02424,-0.01626
LATE_AIRCRAFT_DELAY,0.491252,0.521432,-0.085862,-0.034703,-0.145014,-0.019517,1.0,-0.010223,0.053879,0.035619
IS_WEEKEND,-0.006583,-1.7e-05,0.035752,-0.001323,-0.043513,0.012748,-0.010223,1.0,0.001464,0.002057
DEP_RUSH_HOUR,0.06095,0.061483,-0.053316,0.00138,0.003826,-0.02424,0.053879,0.001464,1.0,0.422751
ARR_RUSH_HOUR,0.053482,0.049762,-0.056121,-0.008848,0.054213,-0.01626,0.035619,0.002057,0.422751,1.0


In [41]:
fig = plt.figure(figsize = (15,20))
ax = fig.gca()
pandas_df.hist(ax=ax, bins=30, figsize=(3,15),)
#ax.set_yscale('log')
plt.yscale('log')
display(plt.show())

# Weather
https://data.nodc.noaa.gov/cgi-bin/iso?id=gov.noaa.ncdc:C00532

In [43]:
dbutils.fs.ls("dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_weather_data")

In [44]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType
schema = StructType([StructField('STATION', StringType(), True), 
                      StructField('DATE', StringType(), True),
                      StructField('SOURCE', StringType(), True),
                      StructField('LATITUDE', StringType(), True),
                      StructField('LONGITUDE', StringType(), True),
                      StructField('ELEVATION', StringType(), True),
                      StructField('NAME', StringType(), True),
                      StructField('REPORT_TYPE', StringType(), True),
                      StructField('CALL_SIGN', StringType(), True),
                      StructField('QUALITY_CONTROL', StringType(), True),
                      StructField('WND', StringType(), True),
                      StructField('CIG', StringType(), True),
                      StructField('VIS', StringType(), True),
                      StructField('TMP', StringType(), True),
                      StructField('DEW', StringType(), True),
                      StructField('SLP', StringType(), True),
                      StructField('AA1', StringType(), True),
                      StructField('AA2', StringType(), True),
                      StructField('AJ1', StringType(), True),
                      StructField('AY1', StringType(), True),
                      StructField('AY2', StringType(), True),
                      StructField('GA1', StringType(), True),
                      StructField('GA2', StringType(), True),
                      StructField('GA3', StringType(), True),
                      StructField('GE1', StringType(), True),
                      StructField('GF1', StringType(), True),
                      StructField('IA1', StringType(), True),
                      StructField('KA1', StringType(), True),
                      StructField('KA2', StringType(), True),
                      StructField('MA1', StringType(), True),
                      StructField('MD1', StringType(), True),
                      StructField('MW1', StringType(), True),
                      StructField('OC1', StringType(), True),
                      StructField('OD1', StringType(), True),
                      StructField('SA1', StringType(), True),
                      StructField('UA1', StringType(), True),
                      StructField('REM', StringType(), True),
                      StructField('EQD', StringType(), True)
                    ])



In [45]:
weather = spark.read.option("header", "true")\
                      .schema(schema)\
                      .parquet(f"dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_weather_data/201*a.parquet")
weather.count()


In [46]:
display(weather.where('DATE =="DATE"'))

STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,WND,CIG,VIS,TMP,DEW,SLP,AA1,AA2,AJ1,AY1,AY2,GA1,GA2,GA3,GE1,GF1,IA1,KA1,KA2,MA1,MD1,MW1,OC1,OD1,SA1,UA1,REM,EQD


In [47]:
#display(weather.describe())

In [48]:
nullCounts_weather_df = nullDataFrame(weather)
nullCounts_weather_df

Unnamed: 0,Feature_Name,Null_Counts,Percentage_Null_Counts
0,STATION,0,0.0
1,DATE,0,0.0
2,SOURCE,0,0.0
3,LATITUDE,0,0.0
4,LONGITUDE,0,0.0
5,ELEVATION,0,0.0
6,NAME,4715523,0.75
7,REPORT_TYPE,0,0.0
8,CALL_SIGN,0,0.0
9,QUALITY_CONTROL,0,0.0


In [49]:
#display(weather.sample(False, 0.0000001))

In [50]:
weather[["DATE"]].describe().show()

In [51]:
weather_df = weather.withColumn("DATE_IN_DATEFORMAT",weather['DATE'].cast(DateType()))
weather_df.select('DATE_IN_DATEFORMAT','DATE').show(10,False)

In [52]:
display(weather_df.sample(False, 0.0000001))

STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,WND,CIG,VIS,TMP,DEW,SLP,AA1,AA2,AJ1,AY1,AY2,GA1,GA2,GA3,GE1,GF1,IA1,KA1,KA2,MA1,MD1,MW1,OC1,OD1,SA1,UA1,REM,EQD,DATE_IN_DATEFORMAT
11302099999,2018-11-09T08:00:00,4,47.4333333,9.7333333,410.0,"DORNBIRN, AU",FM-12,99999,V020,"250,1,N,0010,1","99999,9,9,N",999999999,+00891,+00741,101811,,,,,,,,999999096951,"3,1,007,1,+999,9",,,SYN05811302 35/// /2501 10089 20074 39695 40181 53007 333 55300=,,,,,,,,,,,2018-11-09
6669099999,2016-01-28T19:00:00,4,47.4833333,8.4,843.0,"LAEGEREN, SZ",FM-12,99999,V020,"999,9,C,0000,1","99999,9,9,N",999999999,+00681,+00491,999999,,999999092871,"2,1,010,1,+999,9",315679,39900571999,59900411999,SYN07606669 43/// /23// 10068 20049 39287 48567 52010 333 55300 20007 91111 91208=,,,,,,,,,,,,,,,,2016-01-28
2535099999,2016-05-28T04:20:00,4,58.4564,13.972672,98.75,"SKOVDE, SW",FM-15,99999,V020,"999,9,V,0010,1","00183,1,9,N",009999199,+01001,+00801,999999,,,"07,1,+00183,1,99,9","08,1,+00244,1,99,9",,"9,AGL ,+99999,+99999",99999071999001831999999,102101999999,,,MET069METAR ESGR 280420Z AUTO VRB02KT 9999 BKN006/// OVC008/// 10/08 Q1021=,,,,,,,,,,,,2016-05-28
2807099999,2016-06-29T12:00:00,4,68.607269,27.405328,146.6,"IVALO, FI",FM-12,99999,V020,"220,1,N,0031,1",01340199,045000199,+02171,+00961,101351,1.0,,,,"99,9,+01340,1,99,9",,,,"9,AGL ,+99999,+99999",07991999999013401999999,,,999999099641,,,00671,,SYN004BUFR,,,,,2016-06-29
2835099999,2019-02-09T03:00:00,4,68.85,28.3,123.0,"INARI NELLIM, FI",FM-12,99999,V020,"290,1,N,0020,1","99999,9,9,N",007000199,-01921,-02111,99671,,,,711,"08,1,+01250,1,99,9",,"9,MSL ,+99999,+99999",08991081999012501999999,,,,999999098041,"7,1,006,1,+999,9",,,,SYN06602835 47657 82902 11192 21211 39804 49967 57006 771// 88/// 90300=,,,,,,2019-02-09
1205099999,2019-04-03T10:00:00,4,62.3333333,5.2666666,37.17,"SVINOY LH, NO",FM-12,99999,V020,"050,1,N,0070,1","99999,9,9,N",999999999,+00601,+00371,100821,,,999999100361,"2,1,003,1,+999,9",,,,,SYN04801205 46/// /0507 10060 20037 30036 40082 52003=,,,,,,,,,,,,,,2019-04-03
3862099999,2016-03-15T16:00:00,4,50.78,-1.8425,11.58,"BOURNEMOUTH, UK",FM-12,99999,V020,"050,1,N,0072,1","00690,1,9,N",017000199,+01061,+00541,102801,,,,,,,"05,1,+00690,1,99,9",,,"9,AGL ,+99999,+99999",99999051999006901999999,,,999999102671,"6,1,007,1,+999,9",,,,,,SYN07003862 45567 /0514 10106 20054 30267 40280 56007 333 55305 21070 85/23=,,2016-03-15
72302013748,2019-11-11T00:00:00,4,34.2675,-77.8997,10.1,"WILMINGTON INTERNATIONAL AIRPORT, NC US",FM-12,99999,V020,"999,9,C,0000,1","99999,9,9,N",016000199,+00891,+00671,102131,,,,,,,,,,,,,,,,,,,,,,,2019-11-11
60729099999,2017-12-27T21:00:00,4,36.4333333,10.0833333,156.0,"ZAGHONAN MAGRANE, TS",FM-12,99999,V020,"260,1,N,0051,1","99999,9,9,N",999999999,+01211,+00761,100771,3000091.0,,,,,,,,,,,,,,999999098921,"1,1,008,1,+999,9",,,39901031999,49900821999,SYN07660729 26/// /2610 10121 20076 39892 40077 51008 333 59124 60007 91016 91120=,,2017-12-27
2072099999,2016-12-13T05:00:00,4,67.7333333,22.8333333,313.0,"PARKALOMPOLO, SW",FM-12,99999,V020,"320,1,N,0010,1","02400,1,9,N",050000199,-00761,-00861,100491,1000091.0,,,,,,"08,1,+02400,1,99,9",,,,"9,AGL ,+99999,+99999",08991081999024001999999,,,999999096591,"7,1,017,1,+999,9",,49900201999,SYN08202072 25999 83201 11076 21086 39659 40049 57017 333 60005 88/58 91002 90710 91102=,,,,2016-12-13


In [53]:
# Join Airlines data and Weather data by DATE and AIRPORT

# Stations

In [55]:
stations = spark.read.option("header", "true").csv("dbfs:/mnt/mids-w261/data/DEMO8/gsod/stations.csv.gz")

In [56]:
display(stations)

usaf,wban,name,country,state,call,lat,lon,elev,begin,end
7018,99999,WXPOD 7018,,,,0.0,0.0,7018.0,20110309,20130730
7026,99999,WXPOD 7026,AF,,,0.0,0.0,7026.0,20120713,20170822
7070,99999,WXPOD 7070,AF,,,0.0,0.0,7070.0,20140923,20150926
8268,99999,WXPOD8278,AF,,,32.95,65.567,1156.7,20100519,20120323
8307,99999,WXPOD 8318,AF,,,0.0,0.0,8318.0,20100421,20100421
10016,99999,RORVIK/RYUM,NO,,,64.85,11.233,14.0,19870116,19910806
10017,99999,FRIGG,NO,,ENFR,59.98,2.25,48.0,19880320,20050228
10071,99999,LONGYEARBYEN,SV,,,78.217,15.583,37.0,20050210,20050210
10190,99999,SVARTTANGEN,NO,,,77.517,20.817,20.0,20100825,20140523
10303,99999,TROMSO/SKATTURA,NO,,,69.7,19.017,14.0,20140522,20150108


In [57]:
from pyspark.sql import functions as f
stations.where(f.col('name').contains('JAN MAYEN NOR NAVY'))

In [58]:
stations.select('name').distinct().count()

In [59]:
display(stations.select('name').distinct())

name
HATTFJELLDAL-KRUTA
COLLAFIRTH HILL
WINDY HEAD
ST ATHAN
FINTHEN (USA-AF) &
HANAU AAF
WSCHOWA
WROCLAW/STRACHOWICE
VIGNA DI VALLE
SAMOS ISLAND


In [60]:
weather.select('NAME').distinct().count()

In [61]:
#display(weather.select('name').distinct())