#### **Task 3**: Does the weather affect the delay? 
Use the API to pull the weather information for flights. There is no need to get weather for ALL flights. We can choose the right representative sample. Let's focus on four weather types:
- sunny
- cloudy
- rainy
- snow.
Test the hypothesis that these 4 delays are from the same distribution. If they are not, which ones are significantly different?

In [4]:
import pandas as pd
import datetime
import os
os.chdir('../..')

In [5]:
weather_table = []
with open('data/WeatherStations.txt', 'r') as f:
    for line in f:
        line_lst = []
        line_lst.append(line[0:3])
        line_lst.append(line[3:20])
        line_lst.append(line[20:24])
        line_lst.append(line[26:29])
        line_lst.append(line[30:39])
        line_lst.append(line[39:47])
        line_lst.append(line[47:55])
        weather_table.append(line_lst)

In [6]:
stations = pd.DataFrame(weather_table[1:-1], columns=weather_table[0])
stations.columns = ['state','station','ICAO','IAT','STN','LAT','LONG']
for col in stations:
    stations[col] = stations[col].str.strip()
stations.drop(['IAT','ICAO','LAT','LONG','station'], inplace = True, axis = 1)
stations = stations.loc[stations.STN != '']

https://www.cpc.ncep.noaa.gov/products/archives/short_range/2018/03/05/fcst814.20180305.data

In [7]:
stations.reset_index(inplace=True, drop = True)

In [5]:
import pandas as pd
import re
import urllib.request
from urllib.error import HTTPError
import time

dates = []
for i in pd.date_range('2017-12-25','2019-12-31').strftime('%Y-%m-%d').tolist():
    dates.append(i.split('-'))

weather = {}
for date in dates:
    year, month, day = date[0], date[1], date[2]
    url = f'https://www.cpc.ncep.noaa.gov/products/archives/short_range/{year}/{month}/{day}/fcst814.{year}{month}{day}.data'
    try:
        data = urllib.request.urlopen(url).read().decode('utf-8')
    except urllib.error.HTTPError as e:
        try:
            if e.code == 404:
                weather[year+month+day] = pd.DataFrame(lst[1:], columns=lst[0]).dropna().drop('YYMMDD/HHMM', axis = 1).copy()
                continue
            else:
                raise
        except:
            raise

    lst = []

    for i in data[47:].split('\n'):
        lst.append(re.split('\s+',i))

    weather[year+month+day] = pd.DataFrame(lst[1:], columns=lst[0]).dropna().drop('YYMMDD/HHMM', axis = 1)



In [6]:
weather_report = {}
for k in weather.keys():
    df = pd.merge(weather[k], stations, on='STN').drop('STN', axis = 1)
    df.set_index('state',drop=True, inplace = True)

    for col in df:
        df[col] = df[col].astype(float)

    df = df.groupby('state')[['TBLW','TNRM','TABV','PBLW','PNRM','PABV']].sum().div(df.index.value_counts(), axis = 0)

    df['TCAT'] = df[['TBLW','TNRM','TABV']].idxmax(axis=1).astype('category')
    df.TCAT.cat.set_categories(['TBLW','TNRM','TABV'], ordered = True)
    df.TCAT = df.TCAT.cat.codes

    df['PCAT'] = df[['PBLW','PNRM','PABV']].idxmax(axis=1).astype('category')
    df.PCAT.cat.set_categories(['PBLW','PNRM','PABV'],ordered = True)
    df.PCAT = df.PCAT.cat.codes

    df = df[['TCAT','PCAT']]
    weather_report[k] = df

0: ABV
1: BLW
2: NRM

In [9]:
# make a pickle of the feature engineered data
import pickle
feature_engineered_data6 = weather_report
with open(r'data/feature_engineered_data6.pickle','xb') as weather_data_file:
     pickle.dump(feature_engineered_data6, weather_data_file)

NameError: name 'weather_report' is not defined

In [36]:
with open(r'data/flight_data_500K.pickle','rb') as flight:
     df_flight = pickle.load(flight)
with open(r'data/feature_engineered_data6.pickle','rb') as weather_data_file:
     df_weather = pickle.load(weather_data_file)

In [37]:
df_flight.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
       'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled', 'diverted', 'dup',
       'crs_elapsed_time', 'actual_elapsed_time', 'air_time', 'flights',
       'distance'],
      dtype='object')

In [38]:
df_flight['origin_state'] = df_flight['origin_city_name'].str[-2:]
df_flight['dest_state'] = df_flight['dest_city_name'].str[-2:]
df_flight.fl_date = pd.to_datetime(df_flight.fl_date)

In [39]:
df_flight

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,cancelled,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,origin_state,dest_state
0,2019-01-05,AA,AA,AA,188,AA,N969TW,188,12339,IND,...,0.0,0.0,N,156.0,128.0,107.0,1.0,761.0,IN,TX
1,2019-04-07,UA,UA_CODESHARE,UA,4620,AX,N11113,4620,12896,LBB,...,0.0,0.0,N,96.0,97.0,64.0,1.0,456.0,TX,CO
2,2019-10-10,DL,DL_CODESHARE,DL,3783,OO,N693CA,3783,13487,MSP,...,0.0,0.0,N,171.0,174.0,129.0,1.0,852.0,MN,TX
3,2018-10-11,DL,DL,DL,2445,DL,N891AT,2445,11259,DAL,...,0.0,0.0,N,124.0,109.0,89.0,1.0,721.0,TX,GA
4,2018-03-09,WN,WN,WN,6144,WN,N792SW,6144,12953,LGA,...,0.0,0.0,N,180.0,153.0,131.0,1.0,888.0,NY,MO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,2018-06-01,F9,F9,F9,1639,F9,N706FR,1639,13204,MCO,...,0.0,0.0,N,174.0,171.0,153.0,1.0,1066.0,FL,WI
499996,2019-06-27,WN,WN,WN,494,WN,N280WN,494,14107,PHX,...,0.0,0.0,N,80.0,77.0,65.0,1.0,369.0,AZ,CA
499997,2019-12-03,DL,DL_CODESHARE,DL,5371,9E,N695CA,5371,13342,MKE,...,0.0,0.0,N,133.0,123.0,100.0,1.0,738.0,WI,NY
499998,2018-02-06,DL,DL_CODESHARE,DL,3498,9E,N8908D,3498,11193,CVG,...,0.0,0.0,N,104.0,85.0,68.0,1.0,507.0,OH,PA


In [40]:
len(df_weather)

737

In [238]:
t = pd.json_normalize(df_weather).apply(lambda x : x[0]['TCAT'])
p = pd.json_normalize(df_weather).apply(lambda x : x[0]['PCAT'])

In [239]:
t = t.T.reset_index().set_axis(['TCAT' for _ in range(t.shape[1])], axis = 0).T
t.columns = [t.columns, t.iloc[0]]
t = t[1:]

In [240]:
p = p.T.reset_index().set_axis(['PCAT' for _ in range(p.shape[1])], axis = 0).T
p.columns = [p.columns, p.iloc[0]]
p = p[1:]

In [258]:
t = t.merge(p, left_index=True, right_index=True)

In [287]:
a = (df_flight.fl_date.apply(lambda x : (x - datetime.timedelta(weeks = 1)).strftime('%Y%m%d')))

0         20181229
1         20190331
2         20191003
3         20181004
4         20180302
            ...   
499995    20180525
499996    20190620
499997    20191126
499998    20180130
499999    20190323
Name: fl_date, Length: 489978, dtype: object


In [295]:
c = a[0:1].apply(lambda x : t.iloc[:, t.columns.get_level_values(1)==x])[0].merge(df_flight, left_index = True, right_on='origin_state')

  c = a[0:1].apply(lambda x : t.iloc[:, t.columns.get_level_values(1)==x])[0].merge(df_flight, left_index = True, right_on='origin_state')


In [296]:
c.columns = ['origin_TCAT', 'origin_PCAT'] + c.columns[2:].to_list()

In [297]:
c = a[0:1].apply(lambda x : t.iloc[:, t.columns.get_level_values(1)==x])[0].merge(c, right_on='dest_state', left_index=True)
c.columns = ['dest_TCAT', 'dest_PCAT', 'origin_TCAT', 'origin_PCAT'] + c.columns[4:].to_list()
c

  c = a[0:1].apply(lambda x : t.iloc[:, t.columns.get_level_values(1)==x])[0].merge(c, right_on='dest_state', left_index=True)


Unnamed: 0,dest_TCAT,dest_PCAT,origin_TCAT,origin_PCAT,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,...,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,origin_state,dest_state,report_date
75,2,0,2,0,2019-01-24,AS,AS,AS,55,AS,...,0.0,N,85.0,76.0,70.0,1.0,503.0,AK,AK,2019-01-17
494,2,0,2,0,2018-05-22,AS,AS,AS,55,AS,...,0.0,N,56.0,53.0,41.0,1.0,261.0,AK,AK,2018-05-15
897,2,0,2,0,2018-02-20,AS,AS,AS,65,AS,...,0.0,N,42.0,63.0,26.0,1.0,82.0,AK,AK,2018-02-13
927,2,0,2,0,2018-05-21,AS,AS,AS,154,AS,...,0.0,N,84.0,88.0,75.0,1.0,548.0,AK,AK,2018-05-14
1195,2,0,2,0,2019-05-14,AS,AS,AS,538,AS,...,0.0,N,95.0,94.0,86.0,1.0,626.0,AK,AK,2019-05-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476771,0,2,0,0,2018-09-21,DL,DL,DL,2942,DL,...,0.0,N,57.0,56.0,33.0,1.0,205.0,UT,WY,2018-09-14
484712,0,2,0,0,2019-11-09,DL,DL_CODESHARE,DL,4225,OO,...,0.0,N,86.0,76.0,51.0,1.0,320.0,UT,WY,2019-11-02
491057,0,2,0,0,2019-04-25,DL,DL,DL,1688,DL,...,0.0,N,57.0,59.0,41.0,1.0,205.0,UT,WY,2019-04-18
497637,0,2,0,0,2018-06-16,DL,DL_CODESHARE,DL,7381,OO,...,0.0,N,80.0,78.0,48.0,1.0,298.0,UT,WY,2018-06-09


In [299]:
import pickle
flight_data_engineered6 = c[['dest_TCAT', 'dest_PCAT', 'origin_TCAT', 'origin_PCAT','arr_delay']]
with open(r'data/flight_data_engineered6.pickle','xb') as weather_data_file:
     pickle.dump(flight_data_engineered6, weather_data_file)