**Atlanta 2017**
<br/>Create and clean a dataframe containing only flights departing from the Atlanta airport (ATL), adding in weather data for the origin airport.
<br/>Perform a basic logistic regression model and review results.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score

In [2]:
import warnings
# Silence every warning for the rest of the session to keep cell outputs clean.
# NOTE(review): this also hides pandas SettingWithCopyWarning and scikit-learn
# ConvergenceWarning, both plausible in the cells below — consider filtering
# specific warning categories instead of everything.
warnings.filterwarnings("ignore")

In [3]:
# Raise the pandas column display limit so every data column is visible.

pd.set_option('display.max_columns', 35)

In [4]:
# Load the cleaned 2017 flights file; the first CSV column is used as the row index.

FLIGHTS_CSV = 'flights2017clean.csv'
flights2017_df = pd.read_csv(FLIGHTS_CSV, index_col=0)
flights2017_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay,Delayed
0,6,2017-07-01,AS,N559AS,DCA,"Washington, DC",VA,SEA,"Seattle, WA",WA,800,750.0,-10.0,0.0,-1.0,17.0,807.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,1053,1021.0,-32.0,0
1,6,2017-07-01,AS,N513AS,SEA,"Seattle, WA",WA,DCA,"Washington, DC",VA,1335,1330.0,-5.0,0.0,-1.0,15.0,1345.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,2153,2129.0,-24.0,0
2,6,2017-07-01,AS,N588AS,DCA,"Washington, DC",VA,SEA,"Seattle, WA",WA,1852,1856.0,4.0,0.0,0.0,9.0,1905.0,0,0,2329,10,0.0,0.0,0.0,0.0,0.0,2157,2107.0,-50.0,0
3,6,2017-07-01,AS,N538AS,SEA,"Seattle, WA",WA,DCA,"Washington, DC",VA,800,840.0,40.0,1.0,2.0,9.0,849.0,0,0,2329,10,36.0,0.0,0.0,0.0,0.0,1617,1653.0,36.0,1
4,6,2017-07-01,AS,N536AS,DCA,"Washington, DC",VA,LAX,"Los Angeles, CA",CA,900,854.0,-6.0,0.0,-1.0,13.0,907.0,0,0,2311,10,0.0,0.0,0.0,0.0,0.0,1138,1115.0,-23.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5674616,2,2017-09-05,UA,N456UA,EWR,"Newark, NJ",NJ,LAS,"Las Vegas, NV",NV,842,949.0,67.0,1.0,4.0,24.0,1013.0,0,0,2227,9,20.0,0.0,0.0,0.0,27.0,1110,1157.0,47.0,1
5674617,2,2017-09-05,UA,N809UA,RDU,"Raleigh/Durham, NC",NC,DEN,"Denver, CO",CO,1623,1616.0,-7.0,0.0,-1.0,21.0,1637.0,0,0,1436,6,0.0,0.0,0.0,0.0,0.0,1817,1828.0,11.0,1
5674618,2,2017-09-05,UA,N69816,DEN,"Denver, CO",CO,SAT,"San Antonio, TX",TX,1526,1523.0,-3.0,0.0,-1.0,12.0,1535.0,0,0,794,4,0.0,0.0,0.0,0.0,0.0,1835,1817.0,-18.0,0
5674619,2,2017-09-05,UA,N17752,EWR,"Newark, NJ",NJ,DFW,"Dallas/Fort Worth, TX",TX,605,557.0,-8.0,0.0,-1.0,24.0,621.0,0,0,1372,6,0.0,0.0,0.0,0.0,0.0,845,827.0,-18.0,0


In [5]:
# determine which airports had the largest number of cancelled flights
# 'Cancelled' is selected before aggregating so only that column is summed.
# Summing every column first is wasteful and, on pandas >= 2.0 (where
# numeric_only defaults to False), also tries to add up the text columns.

cancelled_df = flights2017_df.groupby('Origin')[['Cancelled']].sum()
cancelled_df.sort_values(by='Cancelled', ascending=False).head()

Unnamed: 0_level_0,Cancelled
Origin,Unnamed: 1_level_1
ATL,5419
ORD,3719
IAH,3529
SFO,3299
LGA,3018


In [6]:
# list airports in descending order based on total flights in 2017
# Count only the 'FlightDate' column (instead of counting every column and
# discarding the rest) and name the result directly, avoiding an in-place rename.

total_df = (
    flights2017_df.groupby('Origin')['FlightDate']
    .count()
    .to_frame('TotalFlights')
)
total_df.sort_values(by='TotalFlights', ascending=False).head()

Unnamed: 0_level_0,TotalFlights
Origin,Unnamed: 1_level_1
ATL,364009
ORD,265936
DEN,222812
LAX,213911
DFW,180882


In [7]:
# Look at airports by percentage of cancelled flights, then by raw cancellation count.
# The percentage ranking is dominated by airports with very few flights
# (e.g. 3 of 7 cancelled), so the count ranking is shown alongside it.

percent_df = pd.merge(cancelled_df, total_df, on='Origin')
percent_df['Percent'] = percent_df['Cancelled'].div(percent_df['TotalFlights']).mul(100)
for ranking_column in ('Percent', 'Cancelled'):
    display(percent_df.sort_values(by=ranking_column, ascending=False).head(10))

Unnamed: 0_level_0,Cancelled,TotalFlights,Percent
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BPT,3,7,42.857143
MMH,30,105,28.571429
GGG,2,8,25.0
SUN,157,1336,11.751497
PSE,63,740,8.513514
BQN,127,1707,7.439953
PLN,75,1022,7.338552
CMX,49,726,6.749311
IAG,36,535,6.728972
PGD,13,203,6.403941


Unnamed: 0_level_0,Cancelled,TotalFlights,Percent
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ATL,5419,364009,1.488699
ORD,3719,265936,1.398457
IAH,3529,129045,2.734705
SFO,3299,174304,1.89267
LGA,3018,93154,3.239796
EWR,2982,115772,2.575752
BOS,2665,125208,2.128458
MCO,2606,131715,1.978514
FLL,2429,89875,2.702643
LAX,2107,213911,0.984989


Based on these findings, an initial analysis will be run on Atlanta (ATL), which has both the most flights and the most cancellations.

In [8]:
# create a dataframe containing just the flights originating in Atlanta
# .copy() makes this an independent frame: later cells modify it, and without
# the copy pandas treats those edits as writes to a slice of flights2017_df
# (SettingWithCopyWarning).

atlanta2017_df = flights2017_df[flights2017_df['Origin'] == 'ATL'].copy()
atlanta2017_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,CancellationCode,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay,Delayed
106,6,2017-07-01,AS,N474AS,ATL,"Atlanta, GA",GA,SEA,"Seattle, WA",WA,630,643.0,13.0,0.0,0.0,19.0,702.0,0,0,2182,9,0.0,0.0,0.0,0.0,0.0,904,905.0,1.0,1
424,6,2017-07-01,AS,N479AS,ATL,"Atlanta, GA",GA,SEA,"Seattle, WA",WA,1612,1606.0,-6.0,0.0,-1.0,20.0,1626.0,0,0,2182,9,0.0,0.0,0.0,0.0,0.0,1840,1812.0,-28.0,0
438,6,2017-07-01,AS,N461AS,ATL,"Atlanta, GA",GA,PDX,"Portland, OR",OR,900,902.0,2.0,0.0,0.0,15.0,917.0,0,0,2172,9,0.0,0.0,0.0,0.0,0.0,1118,1100.0,-18.0,0
640,7,2017-07-02,AS,N265AK,ATL,"Atlanta, GA",GA,SEA,"Seattle, WA",WA,630,630.0,0.0,0.0,0.0,15.0,645.0,0,0,2182,9,0.0,0.0,0.0,0.0,0.0,904,832.0,-32.0,0
974,7,2017-07-02,AS,N532AS,ATL,"Atlanta, GA",GA,SEA,"Seattle, WA",WA,1612,1607.0,-5.0,0.0,-1.0,15.0,1622.0,0,0,2182,9,0.0,0.0,0.0,0.0,0.0,1840,1812.0,-28.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5673780,2,2017-09-05,UA,N449UA,ATL,"Atlanta, GA",GA,ORD,"Chicago, IL",IL,1843,1921.0,38.0,1.0,2.0,69.0,2030.0,0,0,606,3,0.0,0.0,46.0,0.0,33.0,2008,2127.0,79.0,1
5673985,2,2017-09-05,UA,N18223,ATL,"Atlanta, GA",GA,DEN,"Denver, CO",CO,905,906.0,1.0,0.0,0.0,20.0,926.0,0,0,1199,5,0.0,0.0,0.0,0.0,0.0,1016,1015.0,-1.0,0
5674231,2,2017-09-05,UA,N68843,ATL,"Atlanta, GA",GA,IAH,"Houston, TX",TX,1920,2023.0,63.0,1.0,4.0,40.0,2103.0,0,0,689,3,0.0,0.0,14.0,0.0,63.0,2036,2153.0,77.0,1
5674258,2,2017-09-05,UA,N404UA,ATL,"Atlanta, GA",GA,EWR,"Newark, NJ",NJ,1248,1302.0,14.0,0.0,0.0,52.0,1354.0,0,0,746,3,0.0,0.0,30.0,0.0,0.0,1506,1536.0,30.0,1


In [9]:
# Load the 2017 weather observations for Atlanta and preview the first rows.

WEATHER_CSV = 'data/weather/atl_weather2017.csv'
weather2017_df = pd.read_csv(WEATHER_CSV)
weather2017_df.head()

Unnamed: 0,STATION,NAME,DATE,AWND,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,WT01,WT02,WT03,WT04,WT05,WT06,WT08
0,USW00013874,"ATLANTA HARTSFIELD INTERNATIONAL AIRPORT, GA US",2017-01-01,3.5,,21.3,0.0,0.0,8.1,10.6,5.0,170,170,9.8,12.1,1.0,1.0,,,,,
1,USW00013874,"ATLANTA HARTSFIELD INTERNATIONAL AIRPORT, GA US",2017-01-02,3.4,,57.4,0.0,0.0,11.2,15.0,10.0,80,70,14.3,16.1,1.0,1.0,1.0,,,,
2,USW00013874,"ATLANTA HARTSFIELD INTERNATIONAL AIRPORT, GA US",2017-01-03,3.8,,0.0,0.0,0.0,15.2,18.3,12.8,240,230,7.2,9.4,1.0,1.0,,,,,
3,USW00013874,"ATLANTA HARTSFIELD INTERNATIONAL AIRPORT, GA US",2017-01-04,5.8,,0.0,0.0,0.0,11.2,12.8,3.3,330,340,11.2,14.3,,,,,,,
4,USW00013874,"ATLANTA HARTSFIELD INTERNATIONAL AIRPORT, GA US",2017-01-05,2.7,,0.0,0.0,0.0,5.8,13.9,0.0,190,190,7.2,9.4,,,,,,,


In [10]:
# Count the missing values in each weather column.

weather2017_df.isnull().sum()

STATION      0
NAME         0
DATE         0
AWND         0
PGTM       334
PRCP         0
SNOW         0
SNWD         0
TAVG         0
TMAX         0
TMIN         0
WDF2         0
WDF5         0
WSF2         0
WSF5         0
WT01       232
WT02       350
WT03       288
WT04       364
WT05       365
WT06       363
WT08       352
dtype: int64

In [11]:
# For every weather column that has missing values, show the percentage of
# rows that are missing (rounded to two decimals).

missing_fraction = weather2017_df.isna().mean(axis=0)
missing_fraction[missing_fraction > 0].mul(100).round(2)

PGTM     91.51
WT01     63.56
WT02     95.89
WT03     78.90
WT04     99.73
WT05    100.00
WT06     99.45
WT08     96.44
dtype: float64

In [12]:
# ensure there are no missing days for the year
# A row count per station cannot prove that on its own (duplicated dates could
# mask gaps), so also check that every calendar day of 2017 appears exactly once.

observed_days = pd.DatetimeIndex(pd.to_datetime(weather2017_df['DATE']))
expected_days = pd.date_range('2017-01-01', '2017-12-31', freq='D')
assert observed_days.is_unique, 'duplicate dates in the weather data'
assert expected_days.difference(observed_days).empty, 'missing days in the weather data'

weather2017_df['STATION'].value_counts()

USW00013874    365
Name: STATION, dtype: int64

In [13]:
# Remove weather columns that carry no usable information:
#   - STATION and NAME hold a single value each
#   - PGTM and the WT* flags are missing for roughly 64-100% of the days
# NOTE(review): a missing WT* flag may simply mean that weather type was not
# observed that day — confirm against the data dictionary before discarding.

uninformative_columns = ['STATION', 'NAME', 'PGTM',
                         'WT01', 'WT02', 'WT03', 'WT04', 'WT05', 'WT06', 'WT08']
weather2017_df = weather2017_df.drop(columns=uninformative_columns)

weather2017_df.head()

Unnamed: 0,DATE,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5
0,2017-01-01,3.5,21.3,0.0,0.0,8.1,10.6,5.0,170,170,9.8,12.1
1,2017-01-02,3.4,57.4,0.0,0.0,11.2,15.0,10.0,80,70,14.3,16.1
2,2017-01-03,3.8,0.0,0.0,0.0,15.2,18.3,12.8,240,230,7.2,9.4
3,2017-01-04,5.8,0.0,0.0,0.0,11.2,12.8,3.3,330,340,11.2,14.3
4,2017-01-05,2.7,0.0,0.0,0.0,5.8,13.9,0.0,190,190,7.2,9.4


In [14]:
# Re-count missing values per column after dropping the sparse columns.
weather2017_df.isnull().sum()

DATE    0
AWND    0
PRCP    0
SNOW    0
SNWD    0
TAVG    0
TMAX    0
TMIN    0
WDF2    0
WDF5    0
WSF2    0
WSF5    0
dtype: int64

In [15]:
# ensure the date columns in both dataframes are formatted correctly
# pd.to_datetime returns a new Series rather than converting in place, so the
# result has to be stored back; calling it without assignment changes nothing.
# .assign() builds a new frame, which keeps the column position and avoids
# writing into a frame that may be a slice of another one.

atlanta2017_df = atlanta2017_df.assign(FlightDate=pd.to_datetime(atlanta2017_df['FlightDate']))
weather2017_df = weather2017_df.assign(DATE=pd.to_datetime(weather2017_df['DATE']))

weather2017_df['DATE']

0     2017-01-01
1     2017-01-02
2     2017-01-03
3     2017-01-04
4     2017-01-05
         ...    
360   2017-12-27
361   2017-12-28
362   2017-12-29
363   2017-12-30
364   2017-12-31
Name: DATE, Length: 365, dtype: datetime64[ns]

In [16]:
# Give the date column the same name ('Date') in both frames so they can be
# joined on it.

atlanta2017_df = atlanta2017_df.rename(columns={'FlightDate': 'Date'})
weather2017_df = weather2017_df.rename(columns={'DATE': 'Date'})

In [17]:
# merge the weather information with the flight information
# validate='many_to_one' makes pandas raise if a date occurs more than once in
# the weather data, which would otherwise silently duplicate flights. The row
# count check catches flights dropped by the (default) inner join when a date
# has no weather record.

flights_before_merge = len(atlanta2017_df)
atlanta2017_df = pd.merge(atlanta2017_df, weather2017_df, on='Date', validate='many_to_one')
assert len(atlanta2017_df) == flights_before_merge, 'flights lost while merging weather data'
atlanta2017_df

Unnamed: 0,DayOfWeek,Date,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,...,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay,Delayed,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5
0,6,2017-07-01,AS,N474AS,ATL,"Atlanta, GA",GA,SEA,"Seattle, WA",WA,630,643.0,13.0,0.0,0.0,19.0,702.0,...,0.0,0.0,904,905.0,1.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,8.9,10.3
1,6,2017-07-01,AS,N479AS,ATL,"Atlanta, GA",GA,SEA,"Seattle, WA",WA,1612,1606.0,-6.0,0.0,-1.0,20.0,1626.0,...,0.0,0.0,1840,1812.0,-28.0,0,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,8.9,10.3
2,6,2017-07-01,AS,N461AS,ATL,"Atlanta, GA",GA,PDX,"Portland, OR",OR,900,902.0,2.0,0.0,0.0,15.0,917.0,...,0.0,0.0,1118,1100.0,-18.0,0,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,8.9,10.3
3,6,2017-07-01,OO,N755SK,ATL,"Atlanta, GA",GA,ORD,"Chicago, IL",IL,1115,1429.0,194.0,1.0,12.0,33.0,1502.0,...,0.0,194.0,1225,1541.0,196.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,8.9,10.3
4,6,2017-07-01,OO,N755SK,ATL,"Atlanta, GA",GA,ORD,"Chicago, IL",IL,640,710.0,30.0,1.0,2.0,19.0,729.0,...,0.0,0.0,747,804.0,17.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,8.9,10.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364004,7,2017-09-10,UA,N62883,ATL,"Atlanta, GA",GA,ORD,"Chicago, IL",IL,2045,2047.0,2.0,0.0,0.0,14.0,2101.0,...,0.0,0.0,2152,2138.0,-14.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,60,13.9,18.8
364005,7,2017-09-10,UA,N37466,ATL,"Atlanta, GA",GA,ORD,"Chicago, IL",IL,1445,1437.0,-8.0,0.0,-1.0,19.0,1456.0,...,0.0,0.0,1556,1537.0,-19.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,60,13.9,18.8
364006,7,2017-09-10,UA,N69819,ATL,"Atlanta, GA",GA,EWR,"Newark, NJ",NJ,1302,1256.0,-6.0,0.0,-1.0,16.0,1312.0,...,0.0,0.0,1517,1501.0,-16.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,60,13.9,18.8
364007,7,2017-09-10,UA,N497UA,ATL,"Atlanta, GA",GA,ORD,"Chicago, IL",IL,652,655.0,3.0,0.0,0.0,10.0,705.0,...,0.0,0.0,803,752.0,-11.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,60,13.9,18.8


In [18]:
# Count missing values per column of the merged flight + weather frame.

atlanta2017_df.isnull().sum()

DayOfWeek               0
Date                    0
Reporting_Airline       0
Tail_Number             0
Origin                  0
OriginCityName          0
OriginState             0
Dest                    0
DestCityName            0
DestState               0
CRSDepTime              0
DepTime                 0
DepDelay                0
DepDel15                0
DepartureDelayGroups    0
TaxiOut                 0
WheelsOff               0
Cancelled               0
CancellationCode        0
Distance                0
DistanceGroup           0
CarrierDelay            0
WeatherDelay            0
NASDelay                0
SecurityDelay           0
LateAircraftDelay       0
CRSArrTime              0
ArrTime                 0
ArrDelay                0
Delayed                 0
AWND                    0
PRCP                    0
SNOW                    0
SNWD                    0
TAVG                    0
TMAX                    0
TMIN                    0
WDF2                    0
WDF5        

In [19]:
# Parse the 'Date' column once, extract day, month and year into their own
# columns, then drop the original 'Date' column and check the new columns.
# (The parsed dates are kept in a variable: pd.to_datetime does not convert in
# place, and re-parsing the column for each part would triple the work.)
# NOTE(review): 'Year' is 2017 in every row shown, so it likely adds no
# information as a model feature — confirm before relying on it.

flight_dates = pd.to_datetime(atlanta2017_df['Date'])
atlanta2017_df['Day'] = flight_dates.dt.day
atlanta2017_df['Month'] = flight_dates.dt.month
atlanta2017_df['Year'] = flight_dates.dt.year

atlanta2017_df.drop(['Date'], axis=1, inplace=True)

atlanta2017_df.head()

Unnamed: 0,DayOfWeek,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,CRSDepTime,DepTime,DepDelay,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,Cancelled,...,ArrTime,ArrDelay,Delayed,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,Day,Month,Year
0,6,AS,N474AS,ATL,"Atlanta, GA",GA,SEA,"Seattle, WA",WA,630,643.0,13.0,0.0,0.0,19.0,702.0,0,...,905.0,1.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,8.9,10.3,1,7,2017
1,6,AS,N479AS,ATL,"Atlanta, GA",GA,SEA,"Seattle, WA",WA,1612,1606.0,-6.0,0.0,-1.0,20.0,1626.0,0,...,1812.0,-28.0,0,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,8.9,10.3,1,7,2017
2,6,AS,N461AS,ATL,"Atlanta, GA",GA,PDX,"Portland, OR",OR,900,902.0,2.0,0.0,0.0,15.0,917.0,0,...,1100.0,-18.0,0,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,8.9,10.3,1,7,2017
3,6,OO,N755SK,ATL,"Atlanta, GA",GA,ORD,"Chicago, IL",IL,1115,1429.0,194.0,1.0,12.0,33.0,1502.0,0,...,1541.0,196.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,8.9,10.3,1,7,2017
4,6,OO,N755SK,ATL,"Atlanta, GA",GA,ORD,"Chicago, IL",IL,640,710.0,30.0,1.0,2.0,19.0,729.0,0,...,804.0,17.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,8.9,10.3,1,7,2017


In [20]:
# Drop columns that are redundant, that give away the outcome, or that are not useful.
# NOTE(review): DepDelay, ArrTime, ArrDelay and Delayed are kept even though
# they appear to be known only after a flight has operated — likely target
# leakage when predicting 'Cancelled'; confirm and consider excluding them too.

discarded_columns = [
    'Tail_Number', 'OriginCityName', 'DestCityName',
    'DepTime', 'DepDel15', 'DepartureDelayGroups',
    'TaxiOut', 'WheelsOff', 'DistanceGroup',
    'CarrierDelay', 'WeatherDelay', 'NASDelay',
    'SecurityDelay', 'LateAircraftDelay', 'CancellationCode',
]
atlanta2017_df = atlanta2017_df.drop(columns=discarded_columns)
atlanta2017_df

Unnamed: 0,DayOfWeek,Reporting_Airline,Origin,OriginState,Dest,DestState,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,Day,Month,Year
0,6,AS,ATL,GA,SEA,WA,630,13.0,0,2182,904,905.0,1.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,8.9,10.3,1,7,2017
1,6,AS,ATL,GA,SEA,WA,1612,-6.0,0,2182,1840,1812.0,-28.0,0,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,8.9,10.3,1,7,2017
2,6,AS,ATL,GA,PDX,OR,900,2.0,0,2172,1118,1100.0,-18.0,0,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,8.9,10.3,1,7,2017
3,6,OO,ATL,GA,ORD,IL,1115,194.0,0,606,1225,1541.0,196.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,8.9,10.3,1,7,2017
4,6,OO,ATL,GA,ORD,IL,640,30.0,0,606,747,804.0,17.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,8.9,10.3,1,7,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364004,7,UA,ATL,GA,ORD,IL,2045,2.0,0,606,2152,2138.0,-14.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,60,13.9,18.8,10,9,2017
364005,7,UA,ATL,GA,ORD,IL,1445,-8.0,0,606,1556,1537.0,-19.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,60,13.9,18.8,10,9,2017
364006,7,UA,ATL,GA,EWR,NJ,1302,-6.0,0,746,1517,1501.0,-16.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,60,13.9,18.8,10,9,2017
364007,7,UA,ATL,GA,ORD,IL,652,3.0,0,606,803,752.0,-11.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,60,13.9,18.8,10,9,2017


In [21]:
# One-hot encode the reporting airline; the first category is dropped so it
# acts as the baseline. Unlike the destination dummies, these columns get no prefix.
airline_codes = atlanta2017_df['Reporting_Airline']
airline_df = pd.get_dummies(data=airline_codes, drop_first=True)
airline_df

Unnamed: 0,AS,B6,DL,EV,F9,NK,OO,UA,WN
0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
364004,0,0,0,0,0,0,0,1,0
364005,0,0,0,0,0,0,0,1,0
364006,0,0,0,0,0,0,0,1,0
364007,0,0,0,0,0,0,0,1,0


In [22]:
# One-hot encode the destination airport (first category dropped as the
# baseline); columns are named 'da_<airport code>'.
dest_df = pd.get_dummies(atlanta2017_df['Dest'], prefix='da', prefix_sep='_', drop_first=True)
dest_df

Unnamed: 0,da_ABQ,da_ABY,da_ACY,da_AEX,da_AGS,da_ALB,da_ANC,da_ASE,da_ATW,da_AUS,da_AVL,da_AVP,da_BDL,da_BHM,da_BMI,da_BNA,da_BOS,...,da_SNA,da_SRQ,da_STL,da_STT,da_STX,da_SYR,da_TLH,da_TPA,da_TRI,da_TTN,da_TUL,da_TUS,da_TVC,da_TYS,da_VLD,da_VPS,da_XNA
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364004,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
364005,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
364006,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
364007,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
# One-hot encode the destination state (first category dropped as the
# baseline); columns are named 'ds_<state code>'.
# NOTE(review): the state is presumably determined by the destination airport,
# so these dummies may be redundant with the 'da_' ones — confirm.
dest_state_df = pd.get_dummies(atlanta2017_df['DestState'], prefix='ds', prefix_sep='_', drop_first=True)
dest_state_df

Unnamed: 0,ds_AL,ds_AR,ds_AZ,ds_CA,ds_CO,ds_CT,ds_FL,ds_GA,ds_HI,ds_IA,ds_IL,ds_IN,ds_KS,ds_KY,ds_LA,ds_MA,ds_MD,...,ds_OK,ds_OR,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364004,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
364005,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
364006,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
364007,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
# Append the airline, destination and destination-state dummy columns, then
# remove the text columns they were built from, plus the origin columns.
encoded_parts = [atlanta2017_df, airline_df, dest_df, dest_state_df]
raw_text_columns = ['Reporting_Airline', 'Origin', 'OriginState', 'Dest', 'DestState']
atlanta2017_df = pd.concat(encoded_parts, axis='columns').drop(columns=raw_text_columns)
atlanta2017_df

Unnamed: 0,DayOfWeek,CRSDepTime,DepDelay,Cancelled,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,WDF2,...,ds_OK,ds_OR,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY
0,6,630,13.0,0,2182,904,905.0,1.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,6,1612,-6.0,0,2182,1840,1812.0,-28.0,0,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,6,900,2.0,0,2172,1118,1100.0,-18.0,0,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,6,1115,194.0,0,606,1225,1541.0,196.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,6,640,30.0,0,606,747,804.0,17.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364004,7,2045,2.0,0,606,2152,2138.0,-14.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
364005,7,1445,-8.0,0,606,1556,1537.0,-19.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
364006,7,1302,-6.0,0,746,1517,1501.0,-16.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
364007,7,652,3.0,0,606,803,752.0,-11.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [25]:
# Move the target column 'Cancelled' to the last position; later cells split
# features from target by position. (cancel_df is a Series despite its name.)
cancel_df = atlanta2017_df['Cancelled']
atlanta2017_df = atlanta2017_df.drop(columns='Cancelled').assign(Cancelled=cancel_df)
atlanta2017_df

Unnamed: 0,DayOfWeek,CRSDepTime,DepDelay,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,WDF2,WDF5,...,ds_OR,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY,Cancelled
0,6,630,13.0,2182,904,905.0,1.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,6,1612,-6.0,2182,1840,1812.0,-28.0,0,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,6,900,2.0,2172,1118,1100.0,-18.0,0,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,6,1115,194.0,606,1225,1541.0,196.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,6,640,30.0,606,747,804.0,17.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364004,7,2045,2.0,606,2152,2138.0,-14.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,60,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
364005,7,1445,-8.0,606,1556,1537.0,-19.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,60,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
364006,7,1302,-6.0,746,1517,1501.0,-16.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,60,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
364007,7,652,3.0,606,803,752.0,-11.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,60,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [26]:
# Training data: flights from January through October (Month <= 10).

is_training_month = atlanta2017_df['Month'].le(10)
train_df = atlanta2017_df.loc[is_training_month]
train_df

Unnamed: 0,DayOfWeek,CRSDepTime,DepDelay,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,WDF2,WDF5,...,ds_OR,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY,Cancelled
0,6,630,13.0,2182,904,905.0,1.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,6,1612,-6.0,2182,1840,1812.0,-28.0,0,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,6,900,2.0,2172,1118,1100.0,-18.0,0,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,6,1115,194.0,606,1225,1541.0,196.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,6,640,30.0,606,747,804.0,17.0,1,3.4,0.3,0.0,0.0,24.7,30.0,21.7,320,330,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364004,7,2045,2.0,606,2152,2138.0,-14.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,60,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
364005,7,1445,-8.0,606,1556,1537.0,-19.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,60,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
364006,7,1302,-6.0,746,1517,1501.0,-16.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,60,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
364007,7,652,3.0,606,803,752.0,-11.0,0,9.1,0.0,0.0,0.0,19.3,23.3,15.6,70,60,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
# Validation data: flights from November and December (Month >= 11).

is_validation_month = atlanta2017_df['Month'].ge(11)
validation_df = atlanta2017_df.loc[is_validation_month]
validation_df

Unnamed: 0,DayOfWeek,CRSDepTime,DepDelay,Distance,CRSArrTime,ArrTime,ArrDelay,Delayed,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,WDF2,WDF5,...,ds_OR,ds_PA,ds_PR,ds_RI,ds_SC,ds_SD,ds_TN,ds_TX,ds_UT,ds_VA,ds_VI,ds_VT,ds_WA,ds_WI,ds_WV,ds_WY,Cancelled
64228,5,1752,-5.0,731,1920,1906.0,-14.0,0,2.2,0.0,0.0,0.0,15.3,18.9,12.2,350,350,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
64229,5,1042,-3.0,946,1311,1322.0,11.0,1,2.2,0.0,0.0,0.0,15.3,18.9,12.2,350,350,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
64230,5,742,-2.0,581,930,926.0,-4.0,0,2.2,0.0,0.0,0.0,15.3,18.9,12.2,350,350,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
64231,5,2056,-5.0,404,2220,2211.0,-9.0,0,2.2,0.0,0.0,0.0,15.3,18.9,12.2,350,350,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
64232,5,929,-3.0,554,1113,1121.0,8.0,1,2.2,0.0,0.0,0.0,15.3,18.9,12.2,350,350,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181715,2,1751,-3.0,547,1934,1932.0,-2.0,0,3.0,0.0,0.0,0.0,11.2,18.9,3.9,110,160,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
181716,2,957,-3.0,2092,1219,1146.0,-33.0,0,3.0,0.0,0.0,0.0,11.2,18.9,3.9,110,160,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
181717,2,905,-3.0,432,1036,1020.0,-16.0,0,3.0,0.0,0.0,0.0,11.2,18.9,3.9,110,160,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
181718,2,940,6.0,547,1119,1121.0,2.0,1,3.0,0.0,0.0,0.0,11.2,18.9,3.9,110,160,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [28]:
# Separate the independent variables (X) from the dependent variable (y).
# 'Cancelled' is the target and sits in the last column of both frames.

X_train = train_df.drop(columns='Cancelled')
y_train = train_df['Cancelled']

X_validation = validation_df.drop(columns='Cancelled')
y_validation = validation_df['Cancelled']

In [29]:
# Fit a logistic regression with default settings and print its accuracy on
# the training data, then on the validation data.
# NOTE(review): scores this close to 1.0 are suspicious — the feature matrix
# still contains DepDelay, ArrTime, ArrDelay and Delayed, which may leak the
# outcome. Cancellations are also rare (about 1,500 of 56,500 validation
# flights), so accuracy alone says little; confirm before trusting the model.
atlanta_lr = LogisticRegression().fit(X_train, y_train)
for features, labels in ((X_train, y_train), (X_validation, y_validation)):
    print(atlanta_lr.score(features, labels))

0.9999804862183918
1.0


In [30]:
# Predict the cancellation label for every flight in the validation set.
y_pred = atlanta_lr.predict(X_validation)

In [31]:
# Per-class precision, recall and F1 of the validation predictions.
print(classification_report(y_validation, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     55033
           1       1.00      1.00      1.00      1501

    accuracy                           1.00     56534
   macro avg       1.00      1.00      1.00     56534
weighted avg       1.00      1.00      1.00     56534



In [32]:
# Pair each model coefficient with its feature name, then show the ten most
# positive and the ten most negative coefficients.
# Feature names come from X_train (the frame the model was fitted on) and
# coef_ is flattened with ravel(), so nothing here depends on a hard-coded
# feature count that would break whenever the feature set changes.
# NOTE(review): no feature scaling is visible before the fit, so coefficient
# magnitudes are probably not comparable across features — confirm.

coefficient_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': atlanta_lr.coef_.ravel(),
})

display(coefficient_df.sort_values(by='Coefficient', ascending=False).head(10))
display(coefficient_df.sort_values(by='Coefficient').head(10))

Unnamed: 0,Feature,Coefficient
25,EV,0.177564
0,DayOfWeek,0.155932
27,NK,0.151114
234,ds_TX,0.126648
102,da_HOU,0.121184
200,ds_FL,0.097963
20,Month,0.093896
29,UA,0.086148
87,da_FLL,0.084411
17,WSF2,0.082814


Unnamed: 0,Feature,Coefficient
5,ArrTime,-4.348787
7,Delayed,-1.986931
24,DL,-0.453036
225,ds_OH,-0.103148
12,TAVG,-0.094789
40,da_AUS,-0.086897
214,ds_MO,-0.086325
236,ds_VA,-0.075943
184,da_TPA,-0.074984
157,da_RDU,-0.071219
