In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import linear_model
from sklearn import model_selection
from sklearn.linear_model import LinearRegression

In [2]:
# Load the raw flight records; the bare name on the last line renders the frame.
data = pd.read_csv(filepath_or_buffer='Airlines.csv')
data

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,1,CO,269,SFO,IAH,3,15,205,1
1,2,US,1558,PHX,CLT,3,15,222,1
2,3,AA,2400,LAX,DFW,3,20,165,1
3,4,AA,2466,SFO,DFW,3,20,195,1
4,5,AS,108,ANC,SEA,3,30,202,0
...,...,...,...,...,...,...,...,...,...
539378,539379,CO,178,OGG,SNA,5,1439,326,0
539379,539380,FL,398,SEA,ATL,5,1439,305,0
539380,539381,FL,609,SFO,MKE,5,1439,255,0
539381,539382,UA,78,HNL,SFO,5,1439,313,1


In [3]:
# Work on a copy without the 'id' column; `data` itself is left unchanged.
df = data.drop(columns=['id'])
df.head(5)

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,CO,269,SFO,IAH,3,15,205,1
1,US,1558,PHX,CLT,3,15,222,1
2,AA,2400,LAX,DFW,3,20,165,1
3,AA,2466,SFO,DFW,3,20,195,1
4,AS,108,ANC,SEA,3,30,202,0


In [4]:
# Count missing values per column (isna is the alias of isnull).
df.isna().sum()

Airline        0
Flight         0
AirportFrom    0
AirportTo      0
DayOfWeek      0
Time           0
Length         0
Delay          0
dtype: int64

In [5]:
# Show every flight whose recorded Length is exactly 0.
df.loc[df['Length'].eq(0)]

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
179149,F9,106,DEN,MSP,6,635,0,0
182840,F9,107,MSP,DEN,6,851,0,0
188953,F9,103,MSP,DEN,7,375,0,0
445578,B6,493,BOS,SEA,7,1060,0,1


In [6]:
# Distinct airline codes, in order of first appearance.
df.loc[:, "Airline"].unique()

array(['CO', 'US', 'AA', 'AS', 'DL', 'B6', 'HA', 'OO', '9E', 'OH', 'EV',
       'XE', 'YV', 'UA', 'MQ', 'FL', 'F9', 'WN'], dtype=object)

In [7]:
# Replace each airline code with an integer id assigned in order of first appearance.
airlines = df["Airline"].unique()
mapping = {code: idx for idx, code in enumerate(airlines)}
df['Airline'] = df['Airline'].map(mapping)
df

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,0,269,SFO,IAH,3,15,205,1
1,1,1558,PHX,CLT,3,15,222,1
2,2,2400,LAX,DFW,3,20,165,1
3,2,2466,SFO,DFW,3,20,195,1
4,3,108,ANC,SEA,3,30,202,0
...,...,...,...,...,...,...,...,...
539378,0,178,OGG,SNA,5,1439,326,0
539379,15,398,SEA,ATL,5,1439,305,0
539380,15,609,SFO,MKE,5,1439,255,0
539381,13,78,HNL,SFO,5,1439,313,1


In [8]:
# Distinct departure airports, in order of first appearance.
df.loc[:, 'AirportFrom'].unique()

array(['SFO', 'PHX', 'LAX', 'ANC', 'LAS', 'SLC', 'DEN', 'ONT', 'FAI',
       'BQN', 'PSE', 'HNL', 'BIS', 'IYK', 'EWR', 'BOS', 'MKE', 'GFK',
       'OMA', 'GSO', 'LMT', 'SEA', 'MCO', 'TPA', 'DLH', 'MSP', 'FAR',
       'MFE', 'MSY', 'VPS', 'BWI', 'MAF', 'LWS', 'RST', 'ALB', 'DSM',
       'CHS', 'MSN', 'JAX', 'SAT', 'PNS', 'BHM', 'LIT', 'SAV', 'BNA',
       'ICT', 'ECP', 'DHN', 'MGM', 'CAE', 'PWM', 'ACV', 'EKO', 'PHL',
       'ATL', 'PDX', 'RIC', 'BTR', 'HRL', 'MYR', 'TUS', 'SBN', 'CAK',
       'TVC', 'CLE', 'ORD', 'DAY', 'MFR', 'BTV', 'TLH', 'TYS', 'DFW',
       'FLL', 'AUS', 'CHA', 'CMH', 'LRD', 'BRO', 'CRP', 'LAN', 'PVD',
       'FWA', 'JFK', 'LGA', 'OKC', 'PIT', 'PBI', 'ORF', 'DCA', 'AEX',
       'SYR', 'SHV', 'VLD', 'BDL', 'FAT', 'BZN', 'RDM', 'LFT', 'IPL',
       'EAU', 'ERI', 'BUF', 'IAH', 'MCI', 'AGS', 'ABI', 'GRR', 'LBB',
       'CLT', 'LEX', 'MBS', 'MOD', 'AMA', 'SGF', 'AZO', 'ABE', 'SWF',
       'BGM', 'AVP', 'FNT', 'GSP', 'ATW', 'ITH', 'TUL', 'COS', 'ELP',
       'ABQ', 'SMF',

In [9]:
# Number of distinct departure airports (dropna=False matches len(unique())).
df['AirportFrom'].nunique(dropna=False)


293

In [10]:
# Replace each departure airport with an integer id assigned in order of first appearance.
airportFrom = df['AirportFrom'].unique()
mapping = {code: idx for idx, code in enumerate(airportFrom)}
df['AirportFrom'] = df['AirportFrom'].map(mapping)
df

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,0,269,0,IAH,3,15,205,1
1,1,1558,1,CLT,3,15,222,1
2,2,2400,2,DFW,3,20,165,1
3,2,2466,0,DFW,3,20,195,1
4,3,108,3,SEA,3,30,202,0
...,...,...,...,...,...,...,...,...
539378,0,178,193,SNA,5,1439,326,0
539379,15,398,21,ATL,5,1439,305,0
539380,15,609,0,MKE,5,1439,255,0
539381,13,78,11,SFO,5,1439,313,1


In [11]:
# Distinct arrival airports, in order of first appearance.
df.loc[:, 'AirportTo'].unique()

array(['IAH', 'CLT', 'DFW', 'SEA', 'MSP', 'DTW', 'ORD', 'ATL', 'PDX',
       'JFK', 'SLC', 'HNL', 'PHX', 'MCO', 'OGG', 'LAX', 'KOA', 'ITO',
       'SFO', 'MIA', 'IAD', 'SMF', 'PHL', 'LIH', 'DEN', 'LGA', 'MEM',
       'CVG', 'YUM', 'CWA', 'MKE', 'BQN', 'FAI', 'LAS', 'ANC', 'BOS',
       'LGB', 'FLL', 'SJU', 'EWR', 'DCA', 'BWI', 'RDU', 'MCI', 'TYS',
       'SAN', 'ONT', 'OAK', 'MDW', 'BNA', 'DAL', 'CLE', 'JAX', 'JNU',
       'RNO', 'ELP', 'SAT', 'OTZ', 'MBS', 'BDL', 'STL', 'HOU', 'AUS',
       'SNA', 'SJC', 'LIT', 'TUS', 'TUL', 'CMH', 'LAN', 'IND', 'AMA',
       'CRP', 'PIT', 'RKS', 'FWA', 'TPA', 'PBI', 'JAN', 'DSM', 'ADQ',
       'GRB', 'PVD', 'ABQ', 'SDF', 'RSW', 'MSY', 'BUR', 'BOI', 'TLH',
       'BHM', 'ACV', 'ORF', 'BET', 'KTN', 'RIC', 'SRQ', 'BTR', 'XNA',
       'MHT', 'GRR', 'SBN', 'SBA', 'ROA', 'CID', 'GPT', 'MFR', 'SGU',
       'HPN', 'OMA', 'OTH', 'GSP', 'LMT', 'BUF', 'MSN', 'BFL', 'CAE',
       'HRL', 'OKC', 'SYR', 'COS', 'BTV', 'CDC', 'SCC', 'DAY', 'SJT',
       'TVC', 'ROC',

In [12]:
# Number of distinct arrival airports (dropna=False matches len(unique())).
df['AirportTo'].nunique(dropna=False)

293

In [13]:
# Encode arrival airports with scikit-learn's LabelEncoder rather than a hand-built dict.
# fit() returns the encoder itself, so the fitted object is kept in `le_portTo`.
le_portTo = LabelEncoder().fit(df['AirportTo'])
df['AirportTo'] = le_portTo.transform(df['AirportTo'])
df

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,0,269,0,135,3,15,205,1
1,1,1558,1,60,3,15,222,1
2,2,2400,2,80,3,20,165,1
3,2,2466,0,80,3,20,195,1
4,3,108,3,252,3,30,202,0
...,...,...,...,...,...,...,...,...
539378,0,178,193,264,5,1439,326,0
539379,15,398,21,16,5,1439,305,0
539380,15,609,0,184,5,1439,255,0
539381,13,78,11,253,5,1439,313,1


In [14]:
# Keep only flights whose Length is non-zero.
df = df.loc[df['Length'].ne(0)]
df

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,0,269,0,135,3,15,205,1
1,1,1558,1,60,3,15,222,1
2,2,2400,2,80,3,20,165,1
3,2,2466,0,80,3,20,195,1
4,3,108,3,252,3,30,202,0
...,...,...,...,...,...,...,...,...
539378,0,178,193,264,5,1439,326,0
539379,15,398,21,16,5,1439,305,0
539380,15,609,0,184,5,1439,255,0
539381,13,78,11,253,5,1439,313,1


In [15]:
# Number of distinct flight numbers (dropna=False matches len(unique())).
df['Flight'].nunique(dropna=False)

6585

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, random_state=42, test_size=0.2)

In [17]:
# Separate the label from the held-out features: `target` keeps 'Delay',
# and `test` is rebound to the same rows without that column.
target = test.loc[:, 'Delay']
test = test.drop(columns=['Delay'])

In [18]:
# Display the held-out feature frame.
test

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length
388757,8,3657,54,238,4,575,126
30685,14,3228,89,80,4,1015,75
253130,16,363,138,79,3,993,175
38945,4,1302,171,197,5,505,134
408595,17,2191,0,243,5,660,85
...,...,...,...,...,...,...,...
534994,4,1130,171,16,5,940,84
270806,1,596,196,217,4,975,101
502270,15,102,71,16,3,1117,118
192409,14,2784,71,284,7,585,45


In [34]:
# Normalize training set
# Min-max scale every feature column of `train` to [0, 1]; the 'Delay' label is not scaled.
# Features are selected by name instead of popping the last column, so the label is
# never scaled by accident if the column order changes.
colList = [col for col in train.columns if col != 'Delay']
# Cast the features to float first: writing float results into int64 columns relies
# on implicit upcasting, which pandas deprecated in 2.1. astype() also returns a new
# frame, so the assignment below does not write into a slice of `df`.
train = train.astype({col: 'float64' for col in colList})
col_min = train[colList].min()
# A constant column has range 0; dividing by 1 instead maps it to 0.0 rather than NaN.
col_range = (train[colList].max() - col_min).replace(0, 1)
train[colList] = (train[colList] - col_min) / col_range
train

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
328146,0.411765,0.848950,0.020548,0.958904,1.000000,0.897831,0.143987,1
210186,0.588235,0.681772,0.585616,0.452055,0.000000,0.391882,0.061709,0
66053,0.235294,0.361495,0.051370,0.674658,0.833333,0.755773,0.291139,0
462811,0.764706,0.014465,0.222603,0.592466,0.000000,0.685794,0.221519,0
354941,0.411765,0.852791,0.020548,0.318493,0.166667,0.503849,0.237342,0
...,...,...,...,...,...,...,...,...
110268,0.058824,0.002432,0.003425,0.438356,0.166667,0.493352,0.601266,0
259181,0.764706,0.101767,0.448630,0.458904,0.500000,0.255423,0.072785,0
365841,0.529412,0.853431,0.123288,0.544521,0.333333,0.244927,0.156646,0
131932,0.764706,0.119944,0.020548,0.712329,0.333333,0.661302,0.191456,0


In [35]:
# Normalize test set
# Scale the held-out features with the TRAINING set's min/max, not the test set's own.
# Using test-set statistics would put train and test features on different scales,
# so the distances the fitted model computes on `test` would not match those it
# learned from. Scaled test values may fall slightly outside [0, 1]; that is expected.
colList = list(test.columns)
# `train` is rescaled by the preceding normalization cell, so the unscaled values are
# read back from `df` by row label. Reading `test` from `df` as well keeps this cell
# safe to re-run.
train_raw = df.loc[train.index, colList].astype(float)
test_raw = df.loc[test.index, colList].astype(float)
col_min = train_raw.min()
# A constant training column has range 0; divide by 1 instead to avoid NaN/inf.
col_range = (train_raw.max() - col_min).replace(0, 1)
test = (test_raw - col_min) / col_range
test

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length
388757,0.470588,0.467938,0.184932,0.815068,0.500000,0.395381,0.161648
30685,0.823529,0.413030,0.304795,0.273973,0.500000,0.703289,0.080824
253130,0.941176,0.046333,0.472603,0.270548,0.333333,0.687894,0.239303
38945,0.235294,0.166517,0.585616,0.674658,0.666667,0.346396,0.174326
408595,1.000000,0.280302,0.000000,0.832192,0.666667,0.454864,0.096672
...,...,...,...,...,...,...,...
534994,0.235294,0.144503,0.585616,0.054795,0.666667,0.650805,0.095087
270806,0.058824,0.076155,0.671233,0.743151,0.500000,0.675297,0.122029
502270,0.882353,0.012927,0.243151,0.054795,0.333333,0.774668,0.148970
192409,0.823529,0.356201,0.243151,0.972603,1.000000,0.402379,0.033281


In [36]:
# Split the training frame into the label column and the remaining feature columns.
trainOutput = train.loc[:, "Delay"]
trainInput = train.drop(columns=["Delay"])

In [37]:
def accuracyOfActualVsPredicted(actualOutputSeries, predOutputSeries):
    """Return the fraction of predictions that equal the actual labels.

    Both inputs are converted to NumPy arrays and compared position by position.
    Any array-like is therefore accepted (Series, ndarray, list), and pandas index
    labels are ignored, so two Series with different indexes compare by position
    instead of raising.

    Parameters
    ----------
    actualOutputSeries : array-like
        The true labels.
    predOutputSeries : array-like
        The predicted labels, in the same order as ``actualOutputSeries``.

    Returns
    -------
    float or int
        Share of positions where both inputs are equal, between 0 and 1.
        Returns 0 when the inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    actual = np.asarray(actualOutputSeries)
    predicted = np.asarray(predOutputSeries)
    if actual.shape != predicted.shape:
        raise ValueError(
            "actual and predicted outputs must have the same shape, got "
            f"{actual.shape} and {predicted.shape}"
        )
    # Guard the empty case explicitly: the mean of an empty array is NaN.
    if actual.size == 0:
        return 0

    return float(np.mean(actual == predicted))

In [46]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer

# 10-fold cross-validated accuracy of an 11-nearest-neighbour classifier on the training data.
model = KNeighborsClassifier(n_neighbors=11)
scorer = make_scorer(accuracyOfActualVsPredicted, greater_is_better=True)
accuracies = model_selection.cross_val_score(
    estimator=model,
    X=trainInput,
    y=trainOutput,
    scoring=scorer,
    cv=10,
)
# Average the per-fold accuracies.
mean_score = sum(accuracies) / len(accuracies)
print(mean_score)

0.6316757912263311


In [43]:
from sklearn.linear_model import LogisticRegression

# 10-fold cross-validated accuracy of a default logistic regression on the training data.
# Note: this rebinds `scorer`, `accuracies` and `mean_score` from the KNN cell.
model2 = LogisticRegression()
scorer = make_scorer(accuracyOfActualVsPredicted, greater_is_better=True)
accuracies = model_selection.cross_val_score(
    estimator=model2,
    X=trainInput,
    y=trainOutput,
    scoring=scorer,
    cv=10,
)
# Average the per-fold accuracies.
mean_score = sum(accuracies) / len(accuracies)
print(mean_score)

0.5938521927082812


In [47]:
from sklearn.metrics import accuracy_score

# Fit `model` on the full training set and score it on the held-out rows.
# fit() returns the estimator itself, so the predict call can be chained.
predictions = model.fit(trainInput, trainOutput).predict(test)
accuracy = accuracy_score(y_true=target, y_pred=predictions)
accuracy

0.6311135006859728