# Airline delays 
## Bureau of Transportation Statistics
https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236   
https://www.bts.gov/topics/airlines-and-airports/understanding-reporting-causes-flight-delays-and-cancellations

2015 - 2019

### Additional sources
This might be useful in matching station codes to airports:
1. http://dss.ucar.edu/datasets/ds353.4/inventories/station-list.html
2. https://www.world-airport-codes.com/

In [3]:
# imports
import re
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
# Create a SQLContext from `sc`. `sc` is not defined in the visible cells
# (presumably supplied by the notebook runtime — confirm). `sqlContext` is not
# referenced again in the visible part of the notebook.
sqlContext = SQLContext(sc)


In [4]:
# List the contents of the airlines parquet directory on DBFS and render the listing.
display(dbutils.fs.ls("dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data"))

path,name,size
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2015.parquet/,2015.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2016.parquet/,2016.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2017.parquet/,2017.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2018.parquet/,2018.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/2019.parquet/,2019.parquet/,0


In [5]:
# Load every airlines parquet directory matched by the 201* glob into one DataFrame.
# No "header" option is set: that option belongs to the CSV reader and has no
# effect on parquet. The path is a plain string because it has no placeholders.
airlines = spark.read.parquet("dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/201*.parquet")
# Preview a tiny sample (without replacement, fraction 1e-5). The seed matches the
# one used for the delay sample later in the notebook so re-runs show the same rows.
display(airlines.sample(False, 0.00001, 2020))

YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,ARR_TIME_BLK,CANCELLED,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,FLIGHTS,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
2018,2,6,8,5,2018-06-08,YV,12266,1226603,31453,IAH,"Houston, TX",TX,48,Texas,74,12951,1295106,32951,LFT,"Lafayette, LA",LA,22,Louisiana,72,2150,2147.0,-3.0,0.0,0.0,-1.0,2100-2159,14.0,2201.0,2238.0,4.0,2252,2242.0,-10.0,0.0,0.0,-1.0,2200-2259,False,False,62.0,55.0,37.0,1.0,201.0,1,,,,,
2018,2,6,24,7,2018-06-24,UA,10721,1072102,30721,BOS,"Boston, MA",MA,25,Massachusetts,13,11618,1161802,31703,EWR,"Newark, NJ",NJ,34,New Jersey,21,940,933.0,-7.0,0.0,0.0,-1.0,0900-0959,13.0,946.0,1031.0,7.0,1100,1038.0,-22.0,0.0,0.0,-2.0,1100-1159,False,False,80.0,65.0,45.0,1.0,200.0,1,,,,,
2018,2,6,10,7,2018-06-10,OO,14771,1477104,32457,SFO,"San Francisco, CA",CA,6,California,91,13851,1385103,33851,OKC,"Oklahoma City, OK",OK,40,Oklahoma,73,1149,1147.0,-2.0,0.0,0.0,-1.0,1100-1159,37.0,1224.0,1712.0,11.0,1724,1723.0,-1.0,0.0,0.0,-1.0,1700-1759,False,False,215.0,216.0,168.0,1.0,1384.0,6,,,,,
2018,2,6,10,7,2018-06-10,WN,11259,1125903,30194,DAL,"Dallas, TX",TX,48,Texas,74,12191,1219102,31453,HOU,"Houston, TX",TX,48,Texas,74,1400,1456.0,56.0,56.0,1.0,3.0,1400-1459,14.0,1510.0,1550.0,3.0,1505,1553.0,48.0,48.0,1.0,3.0,1500-1559,False,False,65.0,57.0,40.0,1.0,239.0,1,6.0,0.0,0.0,0.0,42.0
2018,4,11,10,6,2018-11-10,YV,11298,1129806,30194,DFW,"Dallas/Fort Worth, TX",TX,48,Texas,74,13367,1336705,33367,MLI,"Moline, IL",IL,17,Illinois,41,1829,1824.0,-5.0,0.0,0.0,-1.0,1800-1859,15.0,1839.0,2006.0,5.0,2028,2011.0,-17.0,0.0,0.0,-2.0,2000-2059,False,False,119.0,107.0,87.0,1.0,691.0,3,,,,,
2018,4,11,24,6,2018-11-24,AA,10721,1072102,30721,BOS,"Boston, MA",MA,25,Massachusetts,13,12478,1247805,31703,JFK,"New York, NY",NY,36,New York,22,1208,1157.0,-11.0,0.0,0.0,-1.0,1200-1259,12.0,1209.0,1300.0,9.0,1330,1309.0,-21.0,0.0,0.0,-2.0,1300-1359,False,False,82.0,72.0,51.0,1.0,187.0,1,,,,,
2018,4,11,15,4,2018-11-15,DL,15304,1530402,33195,TPA,"Tampa, FL",FL,12,Florida,33,14869,1486903,34614,SLC,"Salt Lake City, UT",UT,49,Utah,87,745,744.0,-1.0,0.0,0.0,-1.0,0700-0759,12.0,756.0,1004.0,6.0,1030,1010.0,-20.0,0.0,0.0,-2.0,1000-1059,False,False,285.0,266.0,248.0,1.0,1888.0,8,,,,,
2018,4,11,11,7,2018-11-11,NK,11697,1169706,32467,FLL,"Fort Lauderdale, FL",FL,12,Florida,33,13577,1357702,31135,MYR,"Myrtle Beach, SC",SC,45,South Carolina,37,2015,2006.0,-9.0,0.0,0.0,-1.0,2000-2059,11.0,2017.0,2131.0,5.0,2153,2136.0,-17.0,0.0,0.0,-2.0,2100-2159,False,False,98.0,90.0,74.0,1.0,529.0,3,,,,,
2018,4,11,24,6,2018-11-24,NK,11618,1161802,31703,EWR,"Newark, NJ",NJ,34,New Jersey,21,13204,1320402,31454,MCO,"Orlando, FL",FL,12,Florida,33,730,730.0,0.0,0.0,0.0,0.0,0700-0759,13.0,743.0,949.0,23.0,1023,1012.0,-11.0,0.0,0.0,-1.0,1000-1059,False,False,173.0,162.0,126.0,1.0,937.0,4,,,,,
2018,4,11,8,4,2018-11-08,NK,13930,1393007,30977,ORD,"Chicago, IL",IL,17,Illinois,41,15304,1530402,33195,TPA,"Tampa, FL",FL,12,Florida,33,1630,1626.0,-4.0,0.0,0.0,-1.0,1600-1659,13.0,1639.0,1953.0,7.0,2008,2000.0,-8.0,0.0,0.0,-1.0,2000-2059,False,False,158.0,154.0,134.0,1.0,1011.0,5,,,,,


In [6]:
# Print the column names and types of the airlines DataFrame.
airlines.printSchema()

In [7]:
# Total number of airline rows, formatted with thousands separators.
f'{airlines.count():,}'

In [8]:
# Summary statistics (count, mean, stddev, min, max) for the airlines columns.
display(airlines.describe())

summary,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,ARR_TIME_BLK,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,FLIGHTS,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
count,31746841.0,31746841.0,31746841.0,31746841.0,31746841.0,31746841,31746841.0,31746841.0,31746841.0,31746841,31746841,31746841,31746841.0,31746841,31746841.0,31746841.0,31746841.0,31746841.0,31746841,31746841,31746841,31746841.0,31746841,31746841.0,31746841.0,31274521.0,31269545.0,31269545.0,31269545.0,31269545.0,31746841,31260424.0,31260429.0,31244917.0,31244917.0,31746841.0,31244919.0,31176201.0,31176201.0,31176201.0,31176201.0,31746841,31746677.0,31178799.0,31178799.0,31746841.0,31746841.0,31746841.0,5799114.0,5799114.0,5799114.0,5799114.0,5799114.0
mean,2017.1512498204152,2.51748770846208,6.552106365480585,15.749554640727876,3.9346285509162944,,12668.724409461716,1266875.803290192,31729.315288031336,,,,26.35374732245013,,54.91906164774001,12668.666651116562,1266870.0274082704,31729.2951808339,,,,26.354102948384693,,54.919218135750896,1330.0884999550035,1334.2122192375064,9.855285614165476,12.909587811399238,0.1820794322398998,0.0360368850905889,,16.830789563186986,1356.9563268309594,1464.4766360877195,7.5604571777227,1488.9034405659447,1468.8957719173477,4.615475952313754,12.966188215170924,0.1860109575249402,-0.2096807112579239,,143.2167191860742,138.22906985609035,113.8502422431345,1.0,823.2170183483768,3.765292206553717,19.98459350859459,3.2259498606166392,15.44036813209742,0.0891679315150555,25.364284785572416
stddev,1.4316532810210283,1.1053295681781927,3.3994302561415286,8.774238088354531,1.9917635387471784,,1526.7397787182167,152673.70669029246,1289.458802620071,,,,16.539517798596837,,26.577828324534654,1526.7212131574868,152671.85014169724,1289.4192061531862,,,,16.539679261968384,,26.57807966993097,489.86848319644025,503.2922887741843,43.50520293704072,42.44165318434855,0.3859099860819423,2.161932356946248,,9.488981863443776,504.9367808166726,531.98737292978,5.929979448174991,516.8048646426242,536.3586689058151,45.59418015238943,42.1408858475887,0.389115517632249,2.2975645036344488,,74.73117735923346,74.33716296557805,72.2402490397357,0.0,607.6826683052024,2.392350188769286,59.30797970625765,26.81202538233581,34.73908233877255,2.914798174339818,48.60358147038268
min,2015.0,1.0,1.0,1.0,1.0,9E,10135.0,1013503.0,30070.0,ABE,"Aberdeen, SD",AK,1.0,Alabama,1.0,10135.0,1013503.0,30070.0,ABE,"Aberdeen, SD",AK,1.0,Alabama,1.0,1.0,1.0,-234.0,0.0,0.0,-2.0,0001-0559,0.0,1.0,1.0,0.0,1.0,1.0,-238.0,0.0,0.0,-2.0,0001-0559,-99.0,14.0,4.0,1.0,21.0,1.0,0.0,0.0,0.0,0.0,0.0
max,2019.0,4.0,12.0,31.0,7.0,YX,16869.0,1686901.0,36133.0,YUM,"Yuma, AZ",WY,78.0,Wyoming,93.0,16869.0,1686901.0,36133.0,YUM,"Yuma, AZ",WY,78.0,Wyoming,93.0,2359.0,2400.0,2755.0,2755.0,1.0,12.0,2300-2359,227.0,2400.0,2400.0,414.0,2400.0,2400.0,2695.0,2695.0,1.0,12.0,2300-2359,948.0,1604.0,1557.0,1.0,5095.0,11.0,2695.0,2692.0,1848.0,1078.0,2454.0


In [9]:
# Count rows whose MONTH compares equal to the text "MONTH".
# NOTE(review): presumably a check for header lines that leaked into the data — confirm intent.
airlines.where('MONTH == "MONTH"').count()

In [10]:
# For each year from 2015 through 2019, print the distinct MONTH values that have rows.
for year in (2015, 2016, 2017, 2018, 2019):
  months_present = (
      airlines
      .select('MONTH')
      .where(f'YEAR == {year}')
      .distinct()
      .collect()
  )
  print(year, months_present)

In [11]:
def nullDataFrame(df):
  """Summarise how many null values each column of ``df`` holds.

  Args:
    df: A DataFrame-like object. Only ``columns``, ``count()``,
      ``df[name].isNull()`` and ``filter(condition).count()`` are used.

  Returns:
    A pandas DataFrame with one row per column of ``df`` and the columns
    ``Feature_Name`` (str), ``Null_Counts`` (int) and
    ``Percentage_Null_Counts`` (float, rounded to two decimals). The
    percentage is 0.0 for every column when ``df`` has no rows.
  """
  total_rows = df.count()
  records = []
  for column in df.columns:
    # One filter + count() is issued per column.
    nulls = df.filter(df[column].isNull()).count()
    # Guard the division so an empty frame reports 0% instead of raising.
    if total_rows:
      nulls_perct = float(np.round((nulls / total_rows) * 100, 2))
    else:
      nulls_perct = 0.0
    records.append((column, nulls, nulls_perct))
  # Build the frame straight from the records: going through np.array() would
  # coerce the mixed str/int/float rows to strings, leaving the count and
  # percentage columns as text that cannot be sorted or summed numerically.
  return pd.DataFrame(records, columns=['Feature_Name', 'Null_Counts', 'Percentage_Null_Counts'])

In [12]:
# Show the Python type of the airlines object.
type(airlines)

In [13]:
# Build the per-column null report for the full airlines data and show it
# as the cell output.
nullCounts_df = nullDataFrame(airlines)
nullCounts_df

Unnamed: 0,Feature_Name,Null_Counts,Percentage_Null_Counts
0,YEAR,0,0.0
1,QUARTER,0,0.0
2,MONTH,0,0.0
3,DAY_OF_MONTH,0,0.0
4,DAY_OF_WEEK,0,0.0
5,FL_DATE,0,0.0
6,OP_UNIQUE_CARRIER,0,0.0
7,ORIGIN_AIRPORT_ID,0,0.0
8,ORIGIN_AIRPORT_SEQ_ID,0,0.0
9,ORIGIN_CITY_MARKET_ID,0,0.0


In [14]:
# 81.73% of records have null/NaN values for the delay-cause columns
# [CARRIER_DELAY, WEATHER_DELAY, NAS_DELAY, SECURITY_DELAY, LATE_AIRCRAFT_DELAY].
# Keep only the rows where CARRIER_DELAY is populated, then report how many remain.
airlines_filtered = airlines.where(f.col('CARRIER_DELAY').isNotNull())
f'{airlines_filtered.count():,}'

In [15]:
# Repeat the per-column null report on the filtered airlines data and show it
# as the cell output.
nullCounts_df2 = nullDataFrame(airlines_filtered)
nullCounts_df2

Unnamed: 0,Feature_Name,Null_Counts,Percentage_Null_Counts
0,YEAR,0,0.0
1,QUARTER,0,0.0
2,MONTH,0,0.0
3,DAY_OF_MONTH,0,0.0
4,DAY_OF_WEEK,0,0.0
5,FL_DATE,0,0.0
6,OP_UNIQUE_CARRIER,0,0.0
7,ORIGIN_AIRPORT_ID,0,0.0
8,ORIGIN_AIRPORT_SEQ_ID,0,0.0
9,ORIGIN_CITY_MARKET_ID,0,0.0


In [16]:
# Delay columns to compare: arrival and departure delay plus the five delay-cause columns.
Delay_List = [
    'ARR_DELAY', 'DEP_DELAY',
    'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
]
# Print summary statistics for just those columns of the filtered data.
airlines_filtered.select(Delay_List).describe().show()

In [17]:
# Take a seeded 0.1% sample (without replacement) of the delay columns and
# convert it to a pandas DataFrame.
sample_airlines_df = (
    airlines_filtered
    .select(Delay_List)
    .sample(withReplacement=False, fraction=0.001, seed=2020)
)
pandas_df = sample_airlines_df.toPandas()

In [18]:
# Pairwise correlation matrix of the sampled delay columns (pandas default method).
pandas_df.corr()

Unnamed: 0,ARR_DELAY,DEP_DELAY,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
ARR_DELAY,1.0,0.974018,0.574177,0.380086,0.27637,-0.012132,0.521595
DEP_DELAY,0.974018,1.0,0.576771,0.370367,0.180565,-0.009576,0.548901
CARRIER_DELAY,0.574177,0.576771,1.0,-0.014378,-0.09011,-0.016235,-0.088504
WEATHER_DELAY,0.380086,0.370367,-0.014378,1.0,-0.027145,-0.005187,-0.016429
NAS_DELAY,0.27637,0.180565,-0.09011,-0.027145,1.0,-0.012038,-0.128852
SECURITY_DELAY,-0.012132,-0.009576,-0.016235,-0.005187,-0.012038,1.0,-0.013939
LATE_AIRCRAFT_DELAY,0.521595,0.548901,-0.088504,-0.016429,-0.128852,-0.013939,1.0


In [19]:
# Histograms of every sampled delay column, one subplot per column, each with a
# log-scaled count axis.
# pandas creates the figure and subplot grid itself here. Handing it a single
# Axes for a multi-column frame makes pandas clear that figure and ignore the
# `figsize` argument, and a trailing plt.yscale('log') only reaches the current
# Axes rather than every histogram. `log=True` is forwarded to each matplotlib
# hist call instead.
axes = pandas_df.hist(bins=30, figsize=(15, 20), log=True)
fig = axes.flat[0].get_figure()
# plt.show() returns None; the display(...) wrapper is kept as in the original cell.
# NOTE(review): confirm whether this runtime needs display() at all.
display(plt.show())

# Weather
https://data.nodc.noaa.gov/cgi-bin/iso?id=gov.noaa.ncdc:C00532

In [21]:
# List the contents of the weather parquet directory on DBFS.
dbutils.fs.ls("dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_weather_data")

In [22]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType
# Schema for the weather data: every field is declared as a nullable string,
# in the order listed below.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        'STATION', 'DATE', 'SOURCE', 'LATITUDE', 'LONGITUDE', 'ELEVATION',
        'NAME', 'REPORT_TYPE', 'CALL_SIGN', 'QUALITY_CONTROL',
        'WND', 'CIG', 'VIS', 'TMP', 'DEW', 'SLP',
        'AA1', 'AA2', 'AJ1', 'AY1', 'AY2',
        'GA1', 'GA2', 'GA3', 'GE1', 'GF1',
        'IA1', 'KA1', 'KA2', 'MA1', 'MD1', 'MW1',
        'OC1', 'OD1', 'SA1', 'UA1', 'REM', 'EQD',
    )
])



In [23]:
# Load the weather parquet files matched by the 201*a glob using the explicit
# all-string `schema`. No "header" option is set: that option belongs to the CSV
# reader and has no effect on parquet. The path is a plain string because it has
# no placeholders.
weather = (
    spark.read
    .schema(schema)
    .parquet("dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_weather_data/201*a.parquet")
)
# Total number of weather rows loaded.
weather.count()


In [24]:
# Show rows whose DATE equals the text "DATE"; the recorded output has no data rows.
# NOTE(review): presumably a check for header lines that leaked into the data — confirm intent.
display(weather.where('DATE =="DATE"'))

STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,WND,CIG,VIS,TMP,DEW,SLP,AA1,AA2,AJ1,AY1,AY2,GA1,GA2,GA3,GE1,GF1,IA1,KA1,KA2,MA1,MD1,MW1,OC1,OD1,SA1,UA1,REM,EQD


In [25]:
#display(weather.describe())

In [26]:
# Build the per-column null report for the weather data and show it as the
# cell output.
nullCounts_weather_df = nullDataFrame(weather)
nullCounts_weather_df

Unnamed: 0,Feature_Name,Null_Counts,Percentage_Null_Counts
0,STATION,0,0.0
1,DATE,0,0.0
2,SOURCE,0,0.0
3,LATITUDE,0,0.0
4,LONGITUDE,0,0.0
5,ELEVATION,0,0.0
6,NAME,4715523,0.75
7,REPORT_TYPE,0,0.0
8,CALL_SIGN,0,0.0
9,QUALITY_CONTROL,0,0.0


In [27]:
# Display a tiny random sample of weather rows (without replacement, fraction 1e-7).
# No seed is passed, so the rows shown can differ between runs.
display(weather.sample(False, 0.0000001))

STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,WND,CIG,VIS,TMP,DEW,SLP,AA1,AA2,AJ1,AY1,AY2,GA1,GA2,GA3,GE1,GF1,IA1,KA1,KA2,MA1,MD1,MW1,OC1,OD1,SA1,UA1,REM,EQD
71087099999,2018-11-07T21:00:00,4,61.3166666,-117.6,162.0,"FORT PROVIDENCE NWT, CA",SAO,99999,V020,3601900101,99999999,999999999,-01331,-01531,103261,,,,,999999101085,"1,1,001,1,+999,9",,,AWY055WID SA 2100 AUTO8 M M M 326/-13/-15/3602/M/M 1001 33MM=,,,,,,,,,,,,,
61415099999,2018-09-23T18:00:00,4,20.933067,-17.029956,4.87,"NOUADHIBOU, MR",FM-15,99999,V020,"340,1,N,0093,1","22000,1,9,N",008000199,+02301,+02001,999999,,,,,,,,,00991999999999999999999,,101101999999,,,,MET054METAR GQPP 231800Z 34018KT 8000 NSC 23/20 Q1011 NOSIG=,,,,,,,
72278803186,2019-06-13T02:47:00,7,33.41667,-112.38333,295.0,"PHOENIX GOODYEAR AIRPORT, AZ US",FM-15,KGYR,V020,"190,5,N,0026,5","22000,5,9,N","016093,5,N,5",+03805,+00005,999999,,,,,,,"02,5,+06096,5,99,9","02,5,+07620,5,99,9",,"1,99,1,+06096,5,9","1,99,1,+07620,5,9",,,"9,AGL ,+99999,+99999",02995999999060961999999,100645097175,,,,,,MET07906/12/19 19:47:01 METAR KGYR 130247Z 19005KT 10SM FEW200 FEW250 38/00 A2972 RMK
72327013897,2019-02-14T07:53:00,7,36.11889,-86.68917,182.9,"NASHVILLE INTERNATIONAL AIRPORT, TN US",FM-15,KBNA,V030,"170,5,N,0026,5","22000,5,9,N","016093,5,N,5",+00395,-00395,101845,01000095,,,,,,,,,,,,,,,,,,,,,
72688624130,2019-12-16T05:53:00,4,44.8428,-117.8086,1024.4,"BAKER CITY AIRPORT, OR US",FM-15,99999,V020,"999,9,C,0000,1","22000,1,9,N",016093199,-00671,-00891,102661,,,,,,,,,,,,,,,,,,,,,,
72094200323,2018-04-10T23:55:00,6,41.233,-96.6,373.1,"WAHOO MUNICIPAL AIRPORT, NE US",FM-15,KAHQ,V020,"140,5,N,0026,5","03048,5,M,N","016093,5,N,5",+01965,-00195,999999,,,,,,,,,,,,,"07,5,+03048,5,99,9",,,"3,99,1,+03048,5,9",,,,,,
94675099999,2018-02-18T16:00:00,4,-34.9166666,138.6166666,51.0,"KENT TOWN, AS",FM-12,99999,V020,"140,1,N,0031,1","99999,9,9,N",999999999,+01901,+01881,101311,,,,,,,,,999999100721,,,SYN04294675 46/// /1406 10190 20188 30072 40131=,,,,,,,,,,
57494099999,2019-11-06T22:00:00,4,30.783758,114.2081,34.44,"TIANHE, CH",FM-15,99999,V020,"999,9,V,0010,1","99999,9,9,Y",009900599,+01401,+01001,999999,,,,,,,,,,,,,,,,101901999999,,,,,,
2469099999,2015-11-18T19:00:00,4,59.1833333,17.9166667,54.0,"TULLINGE, SW",FM-12,99999,V020,"230,1,N,0020,1","99999,9,9,N",035000199,+00421,+00361,999999,01000091,,,,,,,,49900301999,SYN06402469 25/81 /2302 10042 20036 8/000 333 60005 91003 90710 91104=,,,,,,,,,,,,
89614099999,2017-03-26T02:00:00,4,-66.7,111.5,727.0,"CASEY UPPER PETERSON RUNWAY, AY",FM-12,99999,V020,"999,9,C,0000,1","99999,9,9,N",999999999,-01091,+99999,999999,999999088121,,,SYN04889614 46/// /0000 11109 29089 38812 4//// 5////=,,,,,,,,,,,,,,,,,,


In [28]:
# Print summary statistics for the DATE column only.
weather[["DATE"]].describe().show()

In [29]:
# Add DATE_IN_DATEFORMAT, the DATE string cast to a date type, then print the
# first 10 rows of both columns without truncating the values.
date_as_date = f.col('DATE').cast(DateType())
weather_df = weather.withColumn("DATE_IN_DATEFORMAT", date_as_date)
weather_df.select('DATE_IN_DATEFORMAT', 'DATE').show(n=10, truncate=False)

In [30]:
# Display a tiny random sample of the weather rows including the new date column
# (without replacement, fraction 1e-7). No seed is passed, so the rows shown can
# differ between runs.
display(weather_df.sample(False, 0.0000001))

STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,WND,CIG,VIS,TMP,DEW,SLP,AA1,AA2,AJ1,AY1,AY2,GA1,GA2,GA3,GE1,GF1,IA1,KA1,KA2,MA1,MD1,MW1,OC1,OD1,SA1,UA1,REM,EQD,DATE_IN_DATEFORMAT
6635099999,2018-04-08T09:00:00,4,47.1333333,7.6166666,482.0,"KOPPIGEN, SZ",FM-12,99999,V020,"040,1,N,0010,1","99999,9,9,N",999999999,+01311,+00861,100841,01000091,,,,,,,,,,,,,,,,999999095211,"8,1,007,1,+999,9",,,39900211999,,2018-04-08
60735099999,2016-01-31T23:00:00,4,35.6666667,10.1,68.0,"KAIROUAN, TS",FM-15,99999,V020,"999,9,C,0000,1","99999,9,9,Y",999999999,+01101,+00501,999999,,,,,,,,,,,,102901999999,,,,,,,MET045METAR DTTK 312300Z 00000KT CAVOK 11/05 Q1029=,,,,2016-01-31
72254312977,2017-01-14T03:41:00,7,29.62194,-95.65667,25.6,"HOUSTON SUGARLAND MEM, TX US",FM-16,KSGR,V020,"130,5,N,0021,5","01676,5,M,N","016093,5,N,5",+01945,+01895,999999,,,,,,,,,,,,,,,,,,,,,,,2017-01-14
72269593041,2017-10-11T07:55:00,7,32.28333,-106.91667,1357.6,"LAS CRUCES MUNICIPAL AIRPORT, NM US",FM-15,KLRU,V020,"140,5,N,0036,5","22000,5,9,N","016093,5,N,5",+01115,+00035,999999,,,,,,,,,,,,,,,,,,,,"00,5,+99999,9,99,9",,,2017-10-11
72788499999,2019-06-24T10:35:00,4,46.305639,-119.304194,120.09,"RICHLAND AIRPORT, WA US",FM-15,99999,V020,"999,9,C,0000,1","22000,1,9,N",016093199,+01331,+00581,999999,,,,,,,,,,00991999999999999999999,,,,,101521999999,,MET071METAR KRLD 241035Z AUTO 00000KT 10SM CLR 13/06 A2998 RMK AO2 T01330058=,,,,,,2019-06-24
72792894263,2019-08-29T01:56:00,7,47.48333,-122.76667,135.3,"BREMERTON, WA US",FM-15,KPWT,V020,"060,5,N,0031,5","22000,5,9,N","016093,5,N,5",+02835,+01395,101165,01000095,,,,,,,,,,,,,,,,,,,"00,5,+99999,9,99,9",,,2019-08-29
72223863873,2017-03-06T16:57:00,7,31.35,-85.66667,96.6,"OZARK FORT RUCKER, AL US",FM-15,KHEY,V020,"180,5,N,0062,5","22000,5,9,N","016093,5,N,5",+02105,+01005,102825,,,,,,,,,,,,,,,,,,,,,,,2017-03-06
99999953960,2018-08-24T19:25:00,I,30.0918,-91.8731,10.7,"LAFAYETTE 13 SE, LA US",CRN05,99999,V020,9999999999,"99999,9,9,N",999999999,+03261,+99999,999999,,,,05000091,"05,+00000,1,0",,,,+0284210,+0281010,+0282010,"05,+0326,1,0,0540,1,0",,,,,"99,-06",,+032610,+032610,+032510,,2018-08-24
48852099999,2019-03-12T21:00:00,4,16.4015,107.702614,14.63,"PHUBAI, VM",FM-15,99999,V020,"220,1,N,0010,1","01524,1,9,N",005000199,+02201,+02201,999999,,,,,,"04,1,+00610,1,99,9","07,1,+01524,1,99,9",,,"9,MSL ,+99999,+99999",99999041999006101999999,,101301999999,,101,,,,MET067METAR VVPB 122100Z 22002KT 5000 BR SCT020 BKN050 22/22 Q1013 NOSIG=,,,,2019-03-12
94374099999,2018-09-01T07:30:00,4,-23.381944,150.475278,10.36,"ROCKHAMPTON, AS",FM-15,99999,V020,"210,1,N,0036,1","22000,1,9,N",009999199,+02801,-00101,999999,,,,,,,,,,00991999999999999999999,,101201999999,,,,MET070METAR YBRK 010730Z AUTO 21007KT 9999 // NCD 28/M01 Q1012 RF00/0/000/0=,,,,,,,2018-09-01


# Stations

In [32]:
# Load the gzipped stations CSV, treating its first line as the column names.
stations = spark.read.option("header", "true").csv("dbfs:/mnt/mids-w261/data/DEMO8/gsod/stations.csv.gz")

In [33]:
# Render the stations table.
display(stations)

usaf,wban,name,country,state,call,lat,lon,elev,begin,end
7018,99999,WXPOD 7018,,,,0.0,0.0,7018.0,20110309,20130730
7026,99999,WXPOD 7026,AF,,,0.0,0.0,7026.0,20120713,20170822
7070,99999,WXPOD 7070,AF,,,0.0,0.0,7070.0,20140923,20150926
8268,99999,WXPOD8278,AF,,,32.95,65.567,1156.7,20100519,20120323
8307,99999,WXPOD 8318,AF,,,0.0,0.0,8318.0,20100421,20100421
10016,99999,RORVIK/RYUM,NO,,,64.85,11.233,14.0,19870116,19910806
10017,99999,FRIGG,NO,,ENFR,59.98,2.25,48.0,19880320,20050228
10071,99999,LONGYEARBYEN,SV,,,78.217,15.583,37.0,20050210,20050210
10190,99999,SVARTTANGEN,NO,,,77.517,20.817,20.0,20100825,20140523
10303,99999,TROMSO/SKATTURA,NO,,,69.7,19.017,14.0,20140522,20150108


In [34]:
# `f` is already imported in the notebook's first code cell; this repeats that import.
from pyspark.sql import functions as f
# Select the stations whose name contains the given text.
# NOTE(review): nothing here triggers an action (no display/show/count), so the
# cell output is only the DataFrame object, not the matching rows — confirm intent.
stations.where(f.col('name').contains('JAN MAYEN NOR NAVY'))

In [35]:
# Number of distinct station names in the stations table.
stations.select('name').distinct().count()

In [36]:
# Show the distinct station names from the stations table.
display(stations.select('name').distinct())

name
HATTFJELLDAL-KRUTA
COLLAFIRTH HILL
WINDY HEAD
ST ATHAN
FINTHEN (USA-AF) &
HANAU AAF
WSCHOWA
WROCLAW/STRACHOWICE
VIGNA DI VALLE
SAMOS ISLAND


In [37]:
# Number of distinct NAME values in the weather data.
weather.select('NAME').distinct().count()

In [38]:
# Show the distinct names found in the weather data (the column is referenced
# in lower case here, upper case in the count cell).
display(weather.select('name').distinct())

name
"HEINOLA ASEMANTAUS, FI"
"KOTKA RANKKI, FI"
"ISLE OF PORTLAND, UK"
"HERINGSDORF, GM"
"RIED IM INNKREIS, AU"
"PREITENEGG, AU"
"C. BUDEJOVICE ROZNOV, EZ"
"OBARSIA LOTRULUI, RO"
"VOINEASA, RO"
"IZMIT, TU"
