**Burbank 2017**
<br/>Evaluate the airports to find one that flies to fewer than 15 destinations.
<br/>Create and clean a dataframe of just the Burbank airport adding origin and destination weather.

In [1]:
import numpy as np
import pandas as pd

In [2]:
import warnings

# silence every warning category for the rest of the session
# NOTE(review): this also hides pandas warnings (e.g. chained-assignment warnings) raised by later cells
warnings.simplefilter("ignore")

In [3]:
# widen the display limit so that every data column is shown

pd.set_option('display.max_columns', 40)

In [4]:
# load the cleaned 2017 flights file, using its first column as the row index

flights2017_df = pd.read_csv(filepath_or_buffer='flights2017clean.csv', index_col=0)
flights2017_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay,Delayed
0,6,2017-07-01,AS,N559AS,DCA,"Washington, DC",VA,SEA,"Seattle, WA",WA,800,750.0,-10.0,0.0,-1.0,17.0,807.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,1053,1021.0,-32.0,0
1,6,2017-07-01,AS,N513AS,SEA,"Seattle, WA",WA,DCA,"Washington, DC",VA,1335,1330.0,-5.0,0.0,-1.0,15.0,1345.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,2153,2129.0,-24.0,0
2,6,2017-07-01,AS,N588AS,DCA,"Washington, DC",VA,SEA,"Seattle, WA",WA,1852,1856.0,4.0,0.0,0.0,9.0,1905.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,2157,2107.0,-50.0,0
3,6,2017-07-01,AS,N538AS,SEA,"Seattle, WA",WA,DCA,"Washington, DC",VA,800,840.0,40.0,1.0,2.0,9.0,849.0,0,0,2329,10,36.0,0.0,0.0,0.0,0.0,1617,1653.0,36.0,1
4,6,2017-07-01,AS,N536AS,DCA,"Washington, DC",VA,LAX,"Los Angeles, CA",CA,900,854.0,-6.0,0.0,-1.0,13.0,907.0,0,0,2311,10,0.0,0.0,0.0,0.0,0.0,1138,1115.0,-23.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5674616,2,2017-09-05,UA,N456UA,EWR,"Newark, NJ",NJ,LAS,"Las Vegas, NV",NV,842,949.0,67.0,1.0,4.0,24.0,1013.0,0,0,2227,9,20.0,0.0,0.0,0.0,27.0,1110,1157.0,47.0,1
5674617,2,2017-09-05,UA,N809UA,RDU,"Raleigh/Durham, NC",NC,DEN,"Denver, CO",CO,1623,1616.0,-7.0,0.0,-1.0,21.0,1637.0,0,0,1436,6,0.0,0.0,0.0,0.0,0.0,1817,1828.0,11.0,1
5674618,2,2017-09-05,UA,N69816,DEN,"Denver, CO",CO,SAT,"San Antonio, TX",TX,1526,1523.0,-3.0,0.0,-1.0,12.0,1535.0,0,0,794,4,0.0,0.0,0.0,0.0,0.0,1835,1817.0,-18.0,0
5674619,2,2017-09-05,UA,N17752,EWR,"Newark, NJ",NJ,DFW,"Dallas/Fort Worth, TX",TX,605,557.0,-8.0,0.0,-1.0,24.0,621.0,0,0,1372,6,0.0,0.0,0.0,0.0,0.0,845,827.0,-18.0,0


Based on the Atlanta findings, it seems that weather at the destination is an important feature.
<br/>Therefore, investigate airports to select a smaller one with a manageable number of destinations flown to.

In [5]:
# count the number of distinct destinations served from each origin airport (a Series indexed by Origin)

dest_df = flights2017_df.groupby('Origin')['Dest'].nunique()
dest_df

Origin
ABE     3
ABI     1
ABQ    23
ABR     1
ABY     1
       ..
WRG     2
WYS     1
XNA     9
YAK     2
YUM     1
Name: Dest, Length: 319, dtype: int64

In [6]:
# count the total number of flights departing from each origin airport (a Series, largest first)

total_df = flights2017_df.loc[:, 'Origin'].value_counts()
total_df

ATL    364009
ORD    265936
DEN    222812
LAX    213911
DFW    180882
        ...  
ABI        41
SUX        28
GGG         8
BPT         7
SWO         1
Name: Origin, Length: 319, dtype: int64

In [7]:
# total the Cancelled flag for each origin airport to get its number of cancelled flights

cancel_df = flights2017_df.groupby('Origin')['Cancelled'].sum()
cancel_df

Origin
ABE     39
ABI      1
ABQ    130
ABR      8
ABY      7
      ... 
WRG     12
WYS      3
XNA     83
YAK     17
YUM      3
Name: Cancelled, Length: 319, dtype: int64

In [8]:
# total the Delayed flag for each origin airport
# NOTE(review): in the rows displayed above, Delayed is 1 whenever ArrDelay is positive, even when
# DepDelay is negative -- presumably the flag marks late arrivals rather than late departures; confirm

delay_df = flights2017_df.groupby('Origin')['Delayed'].sum()
delay_df

Origin
ABE     670
ABI      18
ABQ    6148
ABR     246
ABY     200
       ... 
WRG     182
WYS     126
XNA    1325
YAK     164
YUM     299
Name: Delayed, Length: 319, dtype: int64

In [9]:
# build a per-origin summary: unique destinations, total flights, cancelled flights and late flights
# every Series is aligned on the Origin index of dest_df

summary_df = dest_df.to_frame(name='dest')
summary_df = summary_df.assign(ttl=total_df, cancel=cancel_df, Late=delay_df)

# percent = late flights as a share of total flights (cancelled flights are not part of this ratio)
summary_df['percent'] = summary_df['Late'].div(summary_df['ttl']).mul(100)
summary_df

Unnamed: 0_level_0,dest,ttl,cancel,Late,percent
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ABE,3,2135,39,670,31.381733
ABI,1,41,1,18,43.902439
ABQ,23,19035,130,6148,32.298398
ABR,1,727,8,246,33.837689
ABY,1,616,7,200,32.467532
...,...,...,...,...,...
WRG,2,722,12,182,25.207756
WYS,1,232,3,126,54.310345
XNA,9,5128,83,1325,25.838534
YAK,2,713,17,164,23.001403


In [10]:
# keep only the airports serving between 12 and 20 destinations (inclusive),
# then show them ordered from the highest to the lowest percentage of late flights

short_summary_df = summary_df.query('dest > 11 and dest < 21')
short_summary_df.sort_values(by='percent', ascending=False)

Unnamed: 0_level_0,dest,ttl,cancel,Late,percent
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BUR,12,25105,479,9932,39.56184
ONT,16,20810,248,7742,37.203268
HDN,12,967,18,337,34.850052
RIC,14,15779,310,5392,34.172001
RNO,20,15739,157,5376,34.157189
OGG,19,24237,102,8234,33.972851
TUS,19,15385,91,5157,33.519662
PVD,18,13734,340,4508,32.823649
EGE,13,1409,49,461,32.71824
GEG,17,11668,73,3788,32.464861


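The results here show that Burbank (BUR) has the highest percentage of late flights (about 39.6%) among the airports serving 12 to 20 destinations, while flying to only 12 destinations. This makes Burbank the best airport for which to build a model incorporating weather at the origin and destination.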

In [11]:
# create a dataframe of the flights originating in Burbank
# .copy() makes this an independent frame, so the in-place edits made in later cells
# act on burbank2017_df itself and not on a slice of flights2017_df

burbank2017_df = flights2017_df.loc[flights2017_df['Origin'] == 'BUR'].copy()
burbank2017_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay,Delayed
163,6,2017-07-01,AS,N520AS,BUR,"Burbank, CA",CA,SEA,"Seattle, WA",WA,700,708.0,8.0,0.0,0.0,13.0,721.0,0,0,937,4,0.0,0.0,0.0,0.0,0.0,930,942.0,12.0,1
340,6,2017-07-01,AS,N563AS,BUR,"Burbank, CA",CA,SEA,"Seattle, WA",WA,1730,1720.0,-10.0,0.0,-1.0,22.0,1742.0,0,0,937,4,0.0,0.0,0.0,0.0,0.0,1958,2010.0,12.0,1
367,6,2017-07-01,AS,N562AS,BUR,"Burbank, CA",CA,SEA,"Seattle, WA",WA,1215,1212.0,-3.0,0.0,-1.0,22.0,1234.0,0,0,937,4,0.0,0.0,16.0,0.0,0.0,1443,1459.0,16.0,1
699,7,2017-07-02,AS,N530AS,BUR,"Burbank, CA",CA,SEA,"Seattle, WA",WA,700,657.0,-3.0,0.0,-1.0,14.0,711.0,0,0,937,4,0.0,0.0,0.0,0.0,0.0,930,941.0,11.0,1
882,7,2017-07-02,AS,N527AS,BUR,"Burbank, CA",CA,SEA,"Seattle, WA",WA,1730,1721.0,-9.0,0.0,-1.0,13.0,1734.0,0,0,937,4,0.0,0.0,0.0,0.0,0.0,1958,1947.0,-11.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5671663,2,2017-09-19,UA,N36207,BUR,"Burbank, CA",CA,SFO,"San Francisco, CA",CA,708,706.0,-2.0,0.0,-1.0,32.0,738.0,0,0,326,2,0.0,0.0,0.0,0.0,0.0,833,841.0,8.0,1
5672027,2,2017-09-19,UA,N836UA,BUR,"Burbank, CA",CA,SFO,"San Francisco, CA",CA,2015,2000.0,-15.0,0.0,-1.0,18.0,2018.0,0,0,326,2,0.0,0.0,0.0,0.0,0.0,2141,2125.0,-16.0,0
5672145,2,2017-09-19,UA,N14240,BUR,"Burbank, CA",CA,DEN,"Denver, CO",CO,700,651.0,-9.0,0.0,-1.0,26.0,717.0,0,0,850,4,0.0,0.0,0.0,0.0,0.0,1029,1012.0,-17.0,0
5673747,2,2017-09-05,UA,N78285,BUR,"Burbank, CA",CA,SFO,"San Francisco, CA",CA,1220,1214.0,-6.0,0.0,-1.0,10.0,1224.0,0,0,326,2,0.0,0.0,0.0,0.0,0.0,1339,1319.0,-20.0,0


In [12]:
# save the complete (all columns) Burbank flights to disk before any columns are removed
burbank2017_df.to_csv(path_or_buf='burbank2017comp.csv')

In [13]:
# drop columns with redundant, 'give-away', or unuseful information
# the result is bound back to the name instead of dropping in place, so the frame
# produced by the earlier filter is never mutated

burbank2017_df = burbank2017_df.drop(columns=['Tail_Number',
                                              'OriginCityName',
                                              'DestCityName',
                                              'DepTime',
                                              'DepDel15',
                                              'DepartureDelayGroups',
                                              'TaxiOut',
                                              'WheelsOff',
                                              'DistanceGroup',
                                              'CarrierDelay',
                                              'WeatherDelay',
                                              'NASDelay',
                                              'SecurityDelay',
                                              'LateAircraftDelay',
                                              'CancellationCode'])
burbank2017_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Origin,OriginState,Dest,DestState,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed
163,6,2017-07-01,AS,BUR,CA,SEA,WA,700,8.0,0,937,930,942.0,12.0,1
340,6,2017-07-01,AS,BUR,CA,SEA,WA,1730,-10.0,0,937,1958,2010.0,12.0,1
367,6,2017-07-01,AS,BUR,CA,SEA,WA,1215,-3.0,0,937,1443,1459.0,16.0,1
699,7,2017-07-02,AS,BUR,CA,SEA,WA,700,-3.0,0,937,930,941.0,11.0,1
882,7,2017-07-02,AS,BUR,CA,SEA,WA,1730,-9.0,0,937,1958,1947.0,-11.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5671663,2,2017-09-19,UA,BUR,CA,SFO,CA,708,-2.0,0,326,833,841.0,8.0,1
5672027,2,2017-09-19,UA,BUR,CA,SFO,CA,2015,-15.0,0,326,2141,2125.0,-16.0,0
5672145,2,2017-09-19,UA,BUR,CA,DEN,CO,700,-9.0,0,850,1029,1012.0,-17.0,0
5673747,2,2017-09-05,UA,BUR,CA,SFO,CA,1220,-6.0,0,326,1339,1319.0,-20.0,0


In [14]:
# descriptive statistics of the numeric columns, one row per column

burbank2017_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DayOfWeek,25105.0,3.856045,1.969086,1.0,2.0,4.0,5.0,7.0
CRSDepTime,25105.0,1355.79291,454.141528,649.0,935.0,1338.0,1740.0,2155.0
DepDelay,25105.0,8.952639,33.845599,-29.0,-5.0,-1.0,8.0,1418.0
Cancelled,25105.0,0.01908,0.136809,0.0,0.0,0.0,0.0,1.0
Distance,25105.0,457.112886,333.129069,223.0,296.0,326.0,369.0,2465.0
CRSArrTime,25105.0,1493.193029,458.891451,2.0,1055.0,1501.0,1920.0,2359.0
ArrTime,25105.0,1461.232025,513.601519,0.0,1042.0,1455.0,1917.0,2400.0
ArrDelay,25105.0,6.075682,40.097833,-2199.0,-10.0,-3.0,9.0,1424.0
Delayed,25105.0,0.395618,0.488993,0.0,0.0,0.0,1.0,1.0


In [15]:
# load the combined weather file covering all stations

weather_df = pd.read_csv(filepath_or_buffer='data/weather/all13_weather.csv')
weather_df

Unnamed: 0,STATION,NAME,AIRPORT,DATE,AWND,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,TOBS,WDF2,WDF5,WSF2,WSF5
0,USW00024233,"SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US",SEA,2017-01-01,9.62,,0.43,3.0,2.0,35.0,37.0,28.0,,10.0,10.0,16.1,19.0
1,USW00024233,"SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US",SEA,2017-01-02,10.51,,0.00,0.0,2.0,32.0,34.0,26.0,,70.0,60.0,15.0,19.0
2,USW00024233,"SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US",SEA,2017-01-03,11.86,,0.00,0.0,2.0,27.0,33.0,21.0,,20.0,20.0,18.1,21.9
3,USW00024233,"SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US",SEA,2017-01-04,10.96,,0.00,0.0,1.2,27.0,36.0,22.0,,20.0,20.0,16.1,19.0
4,USW00024233,"SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US",SEA,2017-01-05,5.82,,0.00,0.0,0.0,28.0,35.0,21.0,,40.0,40.0,15.0,16.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14230,USW00094817,"PONTIAC OAKLAND CO INTERNATIONAL AIRPORT, MI US",OAK,2019-12-27,11.41,838.0,0.00,,,,53.0,34.0,,290.0,280.0,23.0,28.0
14231,USW00094817,"PONTIAC OAKLAND CO INTERNATIONAL AIRPORT, MI US",OAK,2019-12-28,6.04,8.0,0.00,,,,40.0,30.0,,100.0,290.0,12.1,15.0
14232,USW00094817,"PONTIAC OAKLAND CO INTERNATIONAL AIRPORT, MI US",OAK,2019-12-29,8.28,1055.0,0.87,,,,44.0,36.0,,110.0,100.0,19.9,27.1
14233,USW00094817,"PONTIAC OAKLAND CO INTERNATIONAL AIRPORT, MI US",OAK,2019-12-30,16.11,1238.0,0.44,,,,44.0,31.0,,220.0,220.0,29.1,44.1


In [16]:
# create a dataframe of the 2017 weather
# .copy() makes this an independent frame, so the fills and column assignments in later cells
# act on weather2017_df itself and not on a slice of weather_df

weather2017_df = weather_df[(pd.DatetimeIndex(weather_df['DATE']).year == 2017)].copy()
weather2017_df

Unnamed: 0,STATION,NAME,AIRPORT,DATE,AWND,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,TOBS,WDF2,WDF5,WSF2,WSF5
0,USW00024233,"SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US",SEA,2017-01-01,9.62,,0.43,3.0,2.0,35.0,37.0,28.0,,10.0,10.0,16.1,19.0
1,USW00024233,"SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US",SEA,2017-01-02,10.51,,0.00,0.0,2.0,32.0,34.0,26.0,,70.0,60.0,15.0,19.0
2,USW00024233,"SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US",SEA,2017-01-03,11.86,,0.00,0.0,2.0,27.0,33.0,21.0,,20.0,20.0,18.1,21.9
3,USW00024233,"SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US",SEA,2017-01-04,10.96,,0.00,0.0,1.2,27.0,36.0,22.0,,20.0,20.0,16.1,19.0
4,USW00024233,"SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US",SEA,2017-01-05,5.82,,0.00,0.0,0.0,28.0,35.0,21.0,,40.0,40.0,15.0,16.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13500,USW00094817,"PONTIAC OAKLAND CO INTERNATIONAL AIRPORT, MI US",OAK,2017-12-27,6.26,1329.0,0.00,,,,11.0,-6.0,,300.0,300.0,16.1,19.9
13501,USW00094817,"PONTIAC OAKLAND CO INTERNATIONAL AIRPORT, MI US",OAK,2017-12-28,2.68,1447.0,0.00,,,,12.0,-10.0,,120.0,120.0,10.1,13.0
13502,USW00094817,"PONTIAC OAKLAND CO INTERNATIONAL AIRPORT, MI US",OAK,2017-12-29,6.49,1315.0,0.00,,,,17.0,8.0,,280.0,270.0,15.0,19.0
13503,USW00094817,"PONTIAC OAKLAND CO INTERNATIONAL AIRPORT, MI US",OAK,2017-12-30,12.30,1210.0,0.00,,,,14.0,3.0,,290.0,300.0,23.9,30.0


In [17]:
# count the missing (NaN) entries in every column

weather2017_df.isnull().sum()

STATION       0
NAME          0
AIRPORT       0
DATE          0
AWND          1
PGTM       4226
PRCP          3
SNOW       2335
SNWD       2423
TAVG       1460
TMAX          1
TMIN          7
TOBS       4380
WDF2        730
WDF5        753
WSF2        730
WSF5        753
dtype: int64

In [18]:
# TMAX and TMIN will be used to generate missing values for TAVG and are only missing a few values -- fill using forward fill
# the filled column is assigned back: an in-place fill on a selected column is not guaranteed to reach the parent frame
# NOTE(review): the fill runs over the whole frame and is not grouped by AIRPORT, so a gap in the first
# row of one airport would take the previous airport's last value -- confirm that is acceptable

weather2017_df['TMAX'] = weather2017_df['TMAX'].ffill()
weather2017_df['TMIN'] = weather2017_df['TMIN'].ffill()
weather2017_df.isna().sum()

STATION       0
NAME          0
AIRPORT       0
DATE          0
AWND          1
PGTM       4226
PRCP          3
SNOW       2335
SNWD       2423
TAVG       1460
TMAX          0
TMIN          0
TOBS       4380
WDF2        730
WDF5        753
WSF2        730
WSF5        753
dtype: int64

In [19]:
# add a helper column holding the midpoint of the daily maximum and minimum temperature

weather2017_df['temp_avg'] = weather2017_df['TMAX'].add(weather2017_df['TMIN']).div(2)
weather2017_df

Unnamed: 0,STATION,NAME,AIRPORT,DATE,AWND,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,TOBS,WDF2,WDF5,WSF2,WSF5,temp_avg
0,USW00024233,"SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US",SEA,2017-01-01,9.62,,0.43,3.0,2.0,35.0,37.0,28.0,,10.0,10.0,16.1,19.0,32.5
1,USW00024233,"SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US",SEA,2017-01-02,10.51,,0.00,0.0,2.0,32.0,34.0,26.0,,70.0,60.0,15.0,19.0,30.0
2,USW00024233,"SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US",SEA,2017-01-03,11.86,,0.00,0.0,2.0,27.0,33.0,21.0,,20.0,20.0,18.1,21.9,27.0
3,USW00024233,"SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US",SEA,2017-01-04,10.96,,0.00,0.0,1.2,27.0,36.0,22.0,,20.0,20.0,16.1,19.0,29.0
4,USW00024233,"SEATTLE TACOMA INTERNATIONAL AIRPORT, WA US",SEA,2017-01-05,5.82,,0.00,0.0,0.0,28.0,35.0,21.0,,40.0,40.0,15.0,16.1,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13500,USW00094817,"PONTIAC OAKLAND CO INTERNATIONAL AIRPORT, MI US",OAK,2017-12-27,6.26,1329.0,0.00,,,,11.0,-6.0,,300.0,300.0,16.1,19.9,2.5
13501,USW00094817,"PONTIAC OAKLAND CO INTERNATIONAL AIRPORT, MI US",OAK,2017-12-28,2.68,1447.0,0.00,,,,12.0,-10.0,,120.0,120.0,10.1,13.0,1.0
13502,USW00094817,"PONTIAC OAKLAND CO INTERNATIONAL AIRPORT, MI US",OAK,2017-12-29,6.49,1315.0,0.00,,,,17.0,8.0,,280.0,270.0,15.0,19.0,12.5
13503,USW00094817,"PONTIAC OAKLAND CO INTERNATIONAL AIRPORT, MI US",OAK,2017-12-30,12.30,1210.0,0.00,,,,14.0,3.0,,290.0,300.0,23.9,30.0,8.5


In [20]:
# fill in the missing TAVG values using the newly created average temperature column
# the filled column is assigned back: an in-place fill on a selected column is not guaranteed to reach the parent frame

weather2017_df['TAVG'] = weather2017_df['TAVG'].fillna(weather2017_df['temp_avg'])
weather2017_df.isna().sum()

STATION        0
NAME           0
AIRPORT        0
DATE           0
AWND           1
PGTM        4226
PRCP           3
SNOW        2335
SNWD        2423
TAVG           0
TMAX           0
TMIN           0
TOBS        4380
WDF2         730
WDF5         753
WSF2         730
WSF5         753
temp_avg       0
dtype: int64

In [21]:
# look at the correlation between weather features
# only numeric columns are passed in: the frame still holds text columns (STATION, NAME, AIRPORT, DATE)
# and newer pandas versions raise instead of silently skipping them

weather2017_df.select_dtypes(include='number').corr()

Unnamed: 0,AWND,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,TOBS,WDF2,WDF5,WSF2,WSF5,temp_avg
AWND,1.0,-0.000758,0.15816,0.078027,0.03622,-0.101453,-0.156871,-0.061318,0.109502,0.032963,0.047298,0.814069,0.800105,-0.116648
PGTM,-0.000758,1.0,0.039028,-0.180615,-0.157369,0.076539,0.105446,0.059647,,-0.072795,-0.008026,0.054013,0.054276,0.086166
PRCP,0.15816,0.039028,1.0,0.142682,0.021775,-0.122424,-0.182338,-0.057033,-0.158929,-0.080588,-0.080074,0.200823,0.211841,-0.128964
SNOW,0.078027,-0.180615,0.142682,1.0,0.264226,-0.179636,-0.196548,-0.176746,,0.046371,0.03922,0.055313,0.043195,-0.192194
SNWD,0.03622,-0.157369,0.021775,0.264226,1.0,-0.298263,-0.29317,-0.285548,,0.0522,0.060127,0.006398,-0.009026,-0.296989
TAVG,-0.101453,0.076539,-0.122424,-0.179636,-0.298263,1.0,0.96982,0.9653,0.858933,-0.008754,0.008379,0.019598,0.051093,0.993758
TMAX,-0.156871,0.105446,-0.182338,-0.196548,-0.29317,0.96982,1.0,0.895849,0.680895,0.002941,0.021788,-0.012565,0.015528,0.978416
TMIN,-0.061318,0.059647,-0.057033,-0.176746,-0.285548,0.9653,0.895849,1.0,0.93804,-0.025504,-0.011859,0.011931,0.042264,0.968337
TOBS,0.109502,,-0.158929,,,0.858933,0.680895,0.93804,1.0,,,,,0.858933
WDF2,0.032963,-0.072795,-0.080588,0.046371,0.0522,-0.008754,0.002941,-0.025504,,1.0,0.771192,0.069098,0.066544,-0.010373


In [22]:
# drop unnecessary and/or redundant columns
# the result is bound back to the name instead of dropping in place, so the frame
# produced by the earlier year filter is never mutated

weather2017_df = weather2017_df.drop(columns=['STATION',
                                              'NAME',
                                              'PGTM',
                                              'SNWD',
                                              'TMAX',
                                              'TMIN',
                                              'TOBS',
                                              'WDF2',
                                              'WDF5',
                                              'WSF2',
                                              'WSF5',
                                              'temp_avg'])
weather2017_df

Unnamed: 0,AIRPORT,DATE,AWND,PRCP,SNOW,TAVG
0,SEA,2017-01-01,9.62,0.43,3.0,35.0
1,SEA,2017-01-02,10.51,0.00,0.0,32.0
2,SEA,2017-01-03,11.86,0.00,0.0,27.0
3,SEA,2017-01-04,10.96,0.00,0.0,27.0
4,SEA,2017-01-05,5.82,0.00,0.0,28.0
...,...,...,...,...,...,...
13500,OAK,2017-12-27,6.26,0.00,,2.5
13501,OAK,2017-12-28,2.68,0.00,,1.0
13502,OAK,2017-12-29,6.49,0.00,,12.5
13503,OAK,2017-12-30,12.30,0.00,,8.5


In [23]:
# count the NaN values that are still left in each remaining column

weather2017_df.isnull().sum()

AIRPORT       0
DATE          0
AWND          1
PRCP          3
SNOW       2335
TAVG          0
dtype: int64

In [24]:
# for the rows where SNOW is missing, count how many belong to each airport

weather2017_df.loc[weather2017_df['SNOW'].isna(), 'AIRPORT'].value_counts()

BUR    365
SJC    365
OAK    365
SFO    362
PHX    362
PDX    273
SMF    243
Name: AIRPORT, dtype: int64

In [25]:
# the airports missing SNOW are treated as warm-climate locations where snow is unlikely... so input zeros for the NaNs
# the filled column is assigned back: an in-place fill on a selected column is not guaranteed to reach the parent frame
# NOTE(review): in the raw weather file the rows labelled OAK carry the station name
# 'PONTIAC OAKLAND CO INTERNATIONAL AIRPORT, MI US' -- confirm this is the intended station for OAK
# before relying on the warm-climate assumption (and on the OAK weather values in general)

weather2017_df['SNOW'] = weather2017_df['SNOW'].fillna(0)
weather2017_df.isna().sum()

AIRPORT    0
DATE       0
AWND       1
PRCP       3
SNOW       0
TAVG       0
dtype: int64

In [26]:
# show the row(s) where the average wind speed (AWND) is missing

weather2017_df.loc[weather2017_df['AWND'].isnull()]

Unnamed: 0,AIRPORT,DATE,AWND,PRCP,SNOW,TAVG
13395,OAK,2017-09-13,,,0.0,69.0


In [27]:
# show the row(s) where the precipitation (PRCP) is missing

weather2017_df.loc[weather2017_df['PRCP'].isnull()]

Unnamed: 0,AIRPORT,DATE,AWND,PRCP,SNOW,TAVG
13306,OAK,2017-06-16,6.26,,0.0,73.5
13317,OAK,2017-06-27,10.74,,0.0,61.5
13395,OAK,2017-09-13,,,0.0,69.0


In [28]:
# fill in these missing values using forward fill
# the filled columns are assigned back: an in-place fill on a selected column is not guaranteed to reach the parent frame

weather2017_df['AWND'] = weather2017_df['AWND'].ffill()
weather2017_df['PRCP'] = weather2017_df['PRCP'].ffill()
weather2017_df.isna().sum()

AIRPORT    0
DATE       0
AWND       0
PRCP       0
SNOW       0
TAVG       0
dtype: int64

In [29]:
# rename the columns with more intuitive names
# the rename is done by column name rather than by position, so it cannot mislabel a column
# if the column order ever changes; AIRPORT, DATE, PRCP and SNOW keep their names

weather2017_df = weather2017_df.rename(columns={'AWND': 'WIND',
                                                'TAVG': 'TEMP'})
weather2017_df

Unnamed: 0,AIRPORT,DATE,WIND,PRCP,SNOW,TEMP
0,SEA,2017-01-01,9.62,0.43,3.0,35.0
1,SEA,2017-01-02,10.51,0.00,0.0,32.0
2,SEA,2017-01-03,11.86,0.00,0.0,27.0
3,SEA,2017-01-04,10.96,0.00,0.0,27.0
4,SEA,2017-01-05,5.82,0.00,0.0,28.0
...,...,...,...,...,...,...
13500,OAK,2017-12-27,6.26,0.00,0.0,2.5
13501,OAK,2017-12-28,2.68,0.00,0.0,1.0
13502,OAK,2017-12-29,6.49,0.00,0.0,12.5
13503,OAK,2017-12-30,12.30,0.00,0.0,8.5


In [30]:
# ensure that the date column in both dataframes is of type datetime
# the converted values are stored back into the frames; calling pd.to_datetime without
# assigning the result leaves both columns as plain strings

burbank2017_df['FlightDate'] = pd.to_datetime(burbank2017_df['FlightDate'])
weather2017_df['DATE'] = pd.to_datetime(weather2017_df['DATE'])
weather2017_df['DATE']

0       2017-01-01
1       2017-01-02
2       2017-01-03
3       2017-01-04
4       2017-01-05
           ...    
13500   2017-12-27
13501   2017-12-28
13502   2017-12-29
13503   2017-12-30
13504   2017-12-31
Name: DATE, Length: 4745, dtype: datetime64[ns]

In [31]:
# show the earliest and latest flight dates to confirm the data spans the whole year

display(burbank2017_df.loc[:, 'FlightDate'].min())
display(burbank2017_df.loc[:, 'FlightDate'].max())

'2017-01-01'

'2017-12-31'

In [32]:
# add in the origin airport weather to the burbank dataframe
# left join: every flight is kept and matched on (flight date, origin airport)
# validate='many_to_one' raises if the weather data holds more than one row for a
# (DATE, AIRPORT) pair, which would otherwise silently duplicate flights
# the join keys coming from the weather frame are dropped once the merge is done

burbank2017_df = (
    pd.merge(burbank2017_df,
             weather2017_df,
             left_on=['FlightDate', 'Origin'],
             right_on=['DATE', 'AIRPORT'],
             how='left',
             validate='many_to_one')
    .drop(columns=['AIRPORT', 'DATE'])
)
burbank2017_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Origin,OriginState,Dest,DestState,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed,WIND,PRCP,SNOW,TEMP
0,6,2017-07-01,AS,BUR,CA,SEA,WA,700,8.0,0,937,930,942.0,12.0,1,7.38,0.0,0.0,69.5
1,6,2017-07-01,AS,BUR,CA,SEA,WA,1730,-10.0,0,937,1958,2010.0,12.0,1,7.38,0.0,0.0,69.5
2,6,2017-07-01,AS,BUR,CA,SEA,WA,1215,-3.0,0,937,1443,1459.0,16.0,1,7.38,0.0,0.0,69.5
3,7,2017-07-02,AS,BUR,CA,SEA,WA,700,-3.0,0,937,930,941.0,11.0,1,7.38,0.0,0.0,71.5
4,7,2017-07-02,AS,BUR,CA,SEA,WA,1730,-9.0,0,937,1958,1947.0,-11.0,0,7.38,0.0,0.0,71.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25100,2,2017-09-19,UA,BUR,CA,SFO,CA,708,-2.0,0,326,833,841.0,8.0,1,6.71,0.0,0.0,67.5
25101,2,2017-09-19,UA,BUR,CA,SFO,CA,2015,-15.0,0,326,2141,2125.0,-16.0,0,6.71,0.0,0.0,67.5
25102,2,2017-09-19,UA,BUR,CA,DEN,CO,700,-9.0,0,850,1029,1012.0,-17.0,0,6.71,0.0,0.0,67.5
25103,2,2017-09-05,UA,BUR,CA,SFO,CA,1220,-6.0,0,326,1339,1319.0,-20.0,0,10.51,0.0,0.0,77.5


In [33]:
# after the merge, confirm the earliest and latest flight dates are unchanged

display(burbank2017_df.loc[:, 'FlightDate'].min())
display(burbank2017_df.loc[:, 'FlightDate'].max())

'2017-01-01'

'2017-12-31'

In [34]:
# spot-check the merged weather columns on the flights dated Jan 1, 2017

burbank2017_df.loc[burbank2017_df['FlightDate'].eq('2017-01-01')].head()

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Origin,OriginState,Dest,DestState,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed,WIND,PRCP,SNOW,TEMP
15050,7,2017-01-01,B6,BUR,CA,JFK,NY,2130,-4.0,0,2465,546,535.0,-11.0,0,4.92,0.0,0.0,48.0
15080,7,2017-01-01,OO,BUR,CA,SFO,CA,1745,-7.0,0,326,1910,1904.0,-6.0,0,4.92,0.0,0.0,48.0
15081,7,2017-01-01,OO,BUR,CA,SFO,CA,2000,-14.0,0,326,2127,2108.0,-19.0,0,4.92,0.0,0.0,48.0
15082,7,2017-01-01,OO,BUR,CA,SFO,CA,830,23.0,0,326,1002,1101.0,59.0,1,4.92,0.0,0.0,48.0
15612,7,2017-01-01,AS,BUR,CA,SEA,WA,1100,0.0,1,937,1338,0.0,0.0,0,4.92,0.0,0.0,48.0


In [35]:
# prefix the freshly merged weather columns with 'Origin_' to mark them as origin-airport weather

burbank2017_df.rename(mapper=dict(WIND='Origin_WIND',
                                  PRCP='Origin_PRCP',
                                  SNOW='Origin_SNOW',
                                  TEMP='Origin_TEMP'),
                      axis='columns',
                      inplace=True)
burbank2017_df.head()

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Origin,OriginState,Dest,DestState,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed,Origin_WIND,Origin_PRCP,Origin_SNOW,Origin_TEMP
0,6,2017-07-01,AS,BUR,CA,SEA,WA,700,8.0,0,937,930,942.0,12.0,1,7.38,0.0,0.0,69.5
1,6,2017-07-01,AS,BUR,CA,SEA,WA,1730,-10.0,0,937,1958,2010.0,12.0,1,7.38,0.0,0.0,69.5
2,6,2017-07-01,AS,BUR,CA,SEA,WA,1215,-3.0,0,937,1443,1459.0,16.0,1,7.38,0.0,0.0,69.5
3,7,2017-07-02,AS,BUR,CA,SEA,WA,700,-3.0,0,937,930,941.0,11.0,1,7.38,0.0,0.0,71.5
4,7,2017-07-02,AS,BUR,CA,SEA,WA,1730,-9.0,0,937,1958,1947.0,-11.0,0,7.38,0.0,0.0,71.5


In [36]:
# add in the destination airport weather
# left join: every flight is kept and matched on (flight date, destination airport)
# validate='many_to_one' raises if the weather data holds more than one row for a
# (DATE, AIRPORT) pair, which would otherwise silently duplicate flights
# the join keys coming from the weather frame are dropped once the merge is done

burbank2017_df = (
    pd.merge(burbank2017_df,
             weather2017_df,
             left_on=['FlightDate', 'Dest'],
             right_on=['DATE', 'AIRPORT'],
             how='left',
             validate='many_to_one')
    .drop(columns=['AIRPORT', 'DATE'])
)
burbank2017_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Origin,OriginState,Dest,DestState,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed,Origin_WIND,Origin_PRCP,Origin_SNOW,Origin_TEMP,WIND,PRCP,SNOW,TEMP
0,6,2017-07-01,AS,BUR,CA,SEA,WA,700,8.0,0,937,930,942.0,12.0,1,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0
1,6,2017-07-01,AS,BUR,CA,SEA,WA,1730,-10.0,0,937,1958,2010.0,12.0,1,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0
2,6,2017-07-01,AS,BUR,CA,SEA,WA,1215,-3.0,0,937,1443,1459.0,16.0,1,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0
3,7,2017-07-02,AS,BUR,CA,SEA,WA,700,-3.0,0,937,930,941.0,11.0,1,7.38,0.0,0.0,71.5,4.92,0.0,0.0,62.0
4,7,2017-07-02,AS,BUR,CA,SEA,WA,1730,-9.0,0,937,1958,1947.0,-11.0,0,7.38,0.0,0.0,71.5,4.92,0.0,0.0,62.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25100,2,2017-09-19,UA,BUR,CA,SFO,CA,708,-2.0,0,326,833,841.0,8.0,1,6.71,0.0,0.0,67.5,15.66,0.0,0.0,64.0
25101,2,2017-09-19,UA,BUR,CA,SFO,CA,2015,-15.0,0,326,2141,2125.0,-16.0,0,6.71,0.0,0.0,67.5,15.66,0.0,0.0,64.0
25102,2,2017-09-19,UA,BUR,CA,DEN,CO,700,-9.0,0,850,1029,1012.0,-17.0,0,6.71,0.0,0.0,67.5,14.09,0.0,0.0,72.0
25103,2,2017-09-05,UA,BUR,CA,SFO,CA,1220,-6.0,0,326,1339,1319.0,-20.0,0,10.51,0.0,0.0,77.5,7.61,0.0,0.0,76.0


In [37]:
# prefix the freshly merged weather columns with 'Dest_' to mark them as destination-airport weather

burbank2017_df.rename(mapper=dict(WIND='Dest_WIND',
                                  PRCP='Dest_PRCP',
                                  SNOW='Dest_SNOW',
                                  TEMP='Dest_TEMP'),
                      axis='columns',
                      inplace=True)
burbank2017_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Origin,OriginState,Dest,DestState,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed,Origin_WIND,Origin_PRCP,Origin_SNOW,Origin_TEMP,Dest_WIND,Dest_PRCP,Dest_SNOW,Dest_TEMP
0,6,2017-07-01,AS,BUR,CA,SEA,WA,700,8.0,0,937,930,942.0,12.0,1,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0
1,6,2017-07-01,AS,BUR,CA,SEA,WA,1730,-10.0,0,937,1958,2010.0,12.0,1,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0
2,6,2017-07-01,AS,BUR,CA,SEA,WA,1215,-3.0,0,937,1443,1459.0,16.0,1,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0
3,7,2017-07-02,AS,BUR,CA,SEA,WA,700,-3.0,0,937,930,941.0,11.0,1,7.38,0.0,0.0,71.5,4.92,0.0,0.0,62.0
4,7,2017-07-02,AS,BUR,CA,SEA,WA,1730,-9.0,0,937,1958,1947.0,-11.0,0,7.38,0.0,0.0,71.5,4.92,0.0,0.0,62.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25100,2,2017-09-19,UA,BUR,CA,SFO,CA,708,-2.0,0,326,833,841.0,8.0,1,6.71,0.0,0.0,67.5,15.66,0.0,0.0,64.0
25101,2,2017-09-19,UA,BUR,CA,SFO,CA,2015,-15.0,0,326,2141,2125.0,-16.0,0,6.71,0.0,0.0,67.5,15.66,0.0,0.0,64.0
25102,2,2017-09-19,UA,BUR,CA,DEN,CO,700,-9.0,0,850,1029,1012.0,-17.0,0,6.71,0.0,0.0,67.5,14.09,0.0,0.0,72.0
25103,2,2017-09-05,UA,BUR,CA,SFO,CA,1220,-6.0,0,326,1339,1319.0,-20.0,0,10.51,0.0,0.0,77.5,7.61,0.0,0.0,76.0


In [38]:
# count the NaN values per column of the merged burbank dataframe (all counts should be zero)

burbank2017_df.isnull().sum()

DayOfWeek            0
FlightDate           0
Reporting_Airline    0
Origin               0
OriginState          0
Dest                 0
DestState            0
CRSDepTime           0
DepDelay             0
Cancelled            0
Distance             0
CRSArrTime           0
ArrTime              0
ArrDelay             0
Delayed              0
Origin_WIND          0
Origin_PRCP          0
Origin_SNOW          0
Origin_TEMP          0
Dest_WIND            0
Dest_PRCP            0
Dest_SNOW            0
Dest_TEMP            0
dtype: int64

In [39]:
# parse the flight date once, then extract day, month and year into their own columns
# drop the original FlightDate column and check that the new columns are correct

flight_dates = pd.to_datetime(burbank2017_df['FlightDate'])
burbank2017_df = (
    burbank2017_df
    .assign(Day=flight_dates.dt.day,
            Month=flight_dates.dt.month,
            Year=flight_dates.dt.year)
    .drop(columns=['FlightDate'])
)
burbank2017_df

Unnamed: 0,DayOfWeek,Reporting_Airline,Origin,OriginState,Dest,DestState,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed,Origin_WIND,Origin_PRCP,Origin_SNOW,Origin_TEMP,Dest_WIND,Dest_PRCP,Dest_SNOW,Dest_TEMP,Day,Month,Year
0,6,AS,BUR,CA,SEA,WA,700,8.0,0,937,930,942.0,12.0,1,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,2017
1,6,AS,BUR,CA,SEA,WA,1730,-10.0,0,937,1958,2010.0,12.0,1,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,2017
2,6,AS,BUR,CA,SEA,WA,1215,-3.0,0,937,1443,1459.0,16.0,1,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,2017
3,7,AS,BUR,CA,SEA,WA,700,-3.0,0,937,930,941.0,11.0,1,7.38,0.0,0.0,71.5,4.92,0.0,0.0,62.0,2,7,2017
4,7,AS,BUR,CA,SEA,WA,1730,-9.0,0,937,1958,1947.0,-11.0,0,7.38,0.0,0.0,71.5,4.92,0.0,0.0,62.0,2,7,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25100,2,UA,BUR,CA,SFO,CA,708,-2.0,0,326,833,841.0,8.0,1,6.71,0.0,0.0,67.5,15.66,0.0,0.0,64.0,19,9,2017
25101,2,UA,BUR,CA,SFO,CA,2015,-15.0,0,326,2141,2125.0,-16.0,0,6.71,0.0,0.0,67.5,15.66,0.0,0.0,64.0,19,9,2017
25102,2,UA,BUR,CA,DEN,CO,700,-9.0,0,850,1029,1012.0,-17.0,0,6.71,0.0,0.0,67.5,14.09,0.0,0.0,72.0,19,9,2017
25103,2,UA,BUR,CA,SFO,CA,1220,-6.0,0,326,1339,1319.0,-20.0,0,10.51,0.0,0.0,77.5,7.61,0.0,0.0,76.0,5,9,2017


In [40]:
# one-hot encode the reporting airline; the first category is dropped and acts as the baseline

airline_df = burbank2017_df['Reporting_Airline'].pipe(pd.get_dummies, drop_first=True)
airline_df

Unnamed: 0,B6,OO,UA,WN
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0
...,...,...,...,...
25100,0,0,1,0
25101,0,0,1,0
25102,0,0,1,0
25103,0,0,1,0


In [41]:
# one-hot encode the destination airport (first category dropped as the baseline)
# and label each resulting column with the 'da_' prefix

dest_df = pd.get_dummies(burbank2017_df['Dest'], drop_first=True).add_prefix('da_')
dest_df

Unnamed: 0,da_DEN,da_JFK,da_LAS,da_OAK,da_PDX,da_PHX,da_SEA,da_SFO,da_SJC,da_SLC,da_SMF
0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
25100,0,0,0,0,0,0,0,1,0,0,0
25101,0,0,0,0,0,0,0,1,0,0,0
25102,1,0,0,0,0,0,0,0,0,0,0
25103,0,0,0,0,0,0,0,1,0,0,0


In [42]:
# append the airline and destination dummy columns to the burbank dataframe,
# then remove the two text columns they were derived from

burbank2017_df = pd.concat(objs=[burbank2017_df, airline_df, dest_df], axis='columns')
burbank2017_df.drop(columns=['Reporting_Airline', 'Dest'], inplace=True)
burbank2017_df

Unnamed: 0,DayOfWeek,Origin,OriginState,DestState,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed,Origin_WIND,Origin_PRCP,Origin_SNOW,Origin_TEMP,Dest_WIND,Dest_PRCP,Dest_SNOW,Dest_TEMP,Day,Month,Year,B6,OO,UA,WN,da_DEN,da_JFK,da_LAS,da_OAK,da_PDX,da_PHX,da_SEA,da_SFO,da_SJC,da_SLC,da_SMF
0,6,BUR,CA,WA,700,8.0,0,937,930,942.0,12.0,1,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,2017,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,6,BUR,CA,WA,1730,-10.0,0,937,1958,2010.0,12.0,1,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,2017,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,6,BUR,CA,WA,1215,-3.0,0,937,1443,1459.0,16.0,1,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,2017,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,7,BUR,CA,WA,700,-3.0,0,937,930,941.0,11.0,1,7.38,0.0,0.0,71.5,4.92,0.0,0.0,62.0,2,7,2017,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,7,BUR,CA,WA,1730,-9.0,0,937,1958,1947.0,-11.0,0,7.38,0.0,0.0,71.5,4.92,0.0,0.0,62.0,2,7,2017,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25100,2,BUR,CA,CA,708,-2.0,0,326,833,841.0,8.0,1,6.71,0.0,0.0,67.5,15.66,0.0,0.0,64.0,19,9,2017,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
25101,2,BUR,CA,CA,2015,-15.0,0,326,2141,2125.0,-16.0,0,6.71,0.0,0.0,67.5,15.66,0.0,0.0,64.0,19,9,2017,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
25102,2,BUR,CA,CO,700,-9.0,0,850,1029,1012.0,-17.0,0,6.71,0.0,0.0,67.5,14.09,0.0,0.0,72.0,19,9,2017,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
25103,2,BUR,CA,CA,1220,-6.0,0,326,1339,1319.0,-20.0,0,10.51,0.0,0.0,77.5,7.61,0.0,0.0,76.0,5,9,2017,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0


In [43]:
# make 'Delayed' (the target) the last column: take the column out of the frame,
# then re-insert it after all remaining columns
# (delay_df holds a single column, i.e. a Series, despite the _df suffix)

delay_df = burbank2017_df['Delayed']
del burbank2017_df['Delayed']
burbank2017_df.insert(len(burbank2017_df.columns), 'Delayed', delay_df)
burbank2017_df

Unnamed: 0,DayOfWeek,Origin,OriginState,DestState,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,Origin_WIND,Origin_PRCP,Origin_SNOW,Origin_TEMP,Dest_WIND,Dest_PRCP,Dest_SNOW,Dest_TEMP,Day,Month,Year,B6,OO,UA,WN,da_DEN,da_JFK,da_LAS,da_OAK,da_PDX,da_PHX,da_SEA,da_SFO,da_SJC,da_SLC,da_SMF,Delayed
0,6,BUR,CA,WA,700,8.0,0,937,930,942.0,12.0,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,2017,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
1,6,BUR,CA,WA,1730,-10.0,0,937,1958,2010.0,12.0,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,2017,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,6,BUR,CA,WA,1215,-3.0,0,937,1443,1459.0,16.0,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,2017,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,7,BUR,CA,WA,700,-3.0,0,937,930,941.0,11.0,7.38,0.0,0.0,71.5,4.92,0.0,0.0,62.0,2,7,2017,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
4,7,BUR,CA,WA,1730,-9.0,0,937,1958,1947.0,-11.0,7.38,0.0,0.0,71.5,4.92,0.0,0.0,62.0,2,7,2017,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25100,2,BUR,CA,CA,708,-2.0,0,326,833,841.0,8.0,6.71,0.0,0.0,67.5,15.66,0.0,0.0,64.0,19,9,2017,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1
25101,2,BUR,CA,CA,2015,-15.0,0,326,2141,2125.0,-16.0,6.71,0.0,0.0,67.5,15.66,0.0,0.0,64.0,19,9,2017,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
25102,2,BUR,CA,CO,700,-9.0,0,850,1029,1012.0,-17.0,6.71,0.0,0.0,67.5,14.09,0.0,0.0,72.0,19,9,2017,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
25103,2,BUR,CA,CA,1220,-6.0,0,326,1339,1319.0,-20.0,10.51,0.0,0.0,77.5,7.61,0.0,0.0,76.0,5,9,2017,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0


In [44]:
# class balance of the target: number of rows per 'Delayed' value
# (these are absolute counts, not proportions)

delayed_counts = burbank2017_df['Delayed'].value_counts()
delayed_counts

0    15173
1     9932
Name: Delayed, dtype: int64

In [45]:
# list the dtype of every column to spot any that are not numeric (int or float);
# 'object' columns still need to be dropped or encoded

column_dtypes = burbank2017_df.dtypes
column_dtypes

DayOfWeek        int64
Origin          object
OriginState     object
DestState       object
CRSDepTime       int64
DepDelay       float64
Cancelled        int64
Distance         int64
CRSArrTime       int64
ArrTime        float64
ArrDelay       float64
Origin_WIND    float64
Origin_PRCP    float64
Origin_SNOW    float64
Origin_TEMP    float64
Dest_WIND      float64
Dest_PRCP      float64
Dest_SNOW      float64
Dest_TEMP      float64
Day              int64
Month            int64
Year             int64
B6               uint8
OO               uint8
UA               uint8
WN               uint8
da_DEN           uint8
da_JFK           uint8
da_LAS           uint8
da_OAK           uint8
da_PDX           uint8
da_PHX           uint8
da_SEA           uint8
da_SFO           uint8
da_SJC           uint8
da_SLC           uint8
da_SMF           uint8
Delayed          int64
dtype: object

In [46]:
# drop Year, Origin and OriginState because they only have one value each
# drop ArrTime and ArrDelay: actual arrival information gives away too much
# about the target; DestState is dropped as well
# NOTE(review): 'DepDelay' was commented out of the original drop list, so it
# stays in the frame - confirm that keeping it as a feature is intended

cols_to_drop = ['Year', 'Origin', 'OriginState', 'ArrTime', 'ArrDelay', 'DestState']
burbank2017_df.drop(columns=cols_to_drop, inplace=True)
burbank2017_df

Unnamed: 0,DayOfWeek,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,Origin_WIND,Origin_PRCP,Origin_SNOW,Origin_TEMP,Dest_WIND,Dest_PRCP,Dest_SNOW,Dest_TEMP,Day,Month,B6,OO,UA,WN,da_DEN,da_JFK,da_LAS,da_OAK,da_PDX,da_PHX,da_SEA,da_SFO,da_SJC,da_SLC,da_SMF,Delayed
0,6,700,8.0,0,937,930,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
1,6,1730,-10.0,0,937,1958,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,6,1215,-3.0,0,937,1443,7.38,0.0,0.0,69.5,6.49,0.0,0.0,64.0,1,7,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,7,700,-3.0,0,937,930,7.38,0.0,0.0,71.5,4.92,0.0,0.0,62.0,2,7,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
4,7,1730,-9.0,0,937,1958,7.38,0.0,0.0,71.5,4.92,0.0,0.0,62.0,2,7,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25100,2,708,-2.0,0,326,833,6.71,0.0,0.0,67.5,15.66,0.0,0.0,64.0,19,9,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1
25101,2,2015,-15.0,0,326,2141,6.71,0.0,0.0,67.5,15.66,0.0,0.0,64.0,19,9,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
25102,2,700,-9.0,0,850,1029,6.71,0.0,0.0,67.5,14.09,0.0,0.0,72.0,19,9,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
25103,2,1220,-6.0,0,326,1339,10.51,0.0,0.0,77.5,7.61,0.0,0.0,76.0,5,9,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0


In [47]:
# ensure all values are numerical: fail loudly if any column is neither numeric
# nor boolean (dummy columns are uint8 here, but may be bool on other pandas
# versions), then display the dtypes for a visual check

non_numeric_cols = burbank2017_df.select_dtypes(exclude=['number', 'bool']).columns.tolist()
assert not non_numeric_cols, f"non-numeric columns remain: {non_numeric_cols}"
burbank2017_df.dtypes

DayOfWeek        int64
CRSDepTime       int64
DepDelay       float64
Cancelled        int64
Distance         int64
CRSArrTime       int64
Origin_WIND    float64
Origin_PRCP    float64
Origin_SNOW    float64
Origin_TEMP    float64
Dest_WIND      float64
Dest_PRCP      float64
Dest_SNOW      float64
Dest_TEMP      float64
Day              int64
Month            int64
B6               uint8
OO               uint8
UA               uint8
WN               uint8
da_DEN           uint8
da_JFK           uint8
da_LAS           uint8
da_OAK           uint8
da_PDX           uint8
da_PHX           uint8
da_SEA           uint8
da_SFO           uint8
da_SJC           uint8
da_SLC           uint8
da_SMF           uint8
Delayed          int64
dtype: object

In [48]:
# write the cleaned Burbank frame to disk; the row index is written as the first
# column, so it can be restored on load with index_col=0
output_path = 'burbank2017.csv'
burbank2017_df.to_csv(output_path, index=True)