In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv('Airlines.csv')
data

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,1,CO,269,SFO,IAH,3,15,205,1
1,2,US,1558,PHX,CLT,3,15,222,1
2,3,AA,2400,LAX,DFW,3,20,165,1
3,4,AA,2466,SFO,DFW,3,20,195,1
4,5,AS,108,ANC,SEA,3,30,202,0
5,6,CO,1094,LAX,IAH,3,30,181,1
6,7,DL,1768,LAX,MSP,3,30,220,0
7,8,DL,2722,PHX,DTW,3,30,228,0
8,9,DL,2606,SFO,MSP,3,35,216,1
9,10,AA,2538,LAS,ORD,3,40,200,1


In [3]:
df = data.drop('id', axis = 'columns')
df.head()

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,CO,269,SFO,IAH,3,15,205,1
1,US,1558,PHX,CLT,3,15,222,1
2,AA,2400,LAX,DFW,3,20,165,1
3,AA,2466,SFO,DFW,3,20,195,1
4,AS,108,ANC,SEA,3,30,202,0


In [4]:
# check if there is any null value
df.isnull().sum()

Airline        0
Flight         0
AirportFrom    0
AirportTo      0
DayOfWeek      0
Time           0
Length         0
Delay          0
dtype: int64

In [5]:
# check for any 0 length flight 
df[df['Length'] == 0 ]

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
179149,F9,106,DEN,MSP,6,635,0,0
182840,F9,107,MSP,DEN,6,851,0,0
188953,F9,103,MSP,DEN,7,375,0,0
445578,B6,493,BOS,SEA,7,1060,0,1


In [6]:
df["Airline"].unique()

array(['CO', 'US', 'AA', 'AS', 'DL', 'B6', 'HA', 'OO', '9E', 'OH', 'EV',
       'XE', 'YV', 'UA', 'MQ', 'FL', 'F9', 'WN'], dtype=object)

In [7]:
airlines = df ["Airline"].unique()
mapping = dict(zip(airlines,range(len(airlines))))
df['Airline'] = df['Airline'].map(mapping)
df

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,0,269,SFO,IAH,3,15,205,1
1,1,1558,PHX,CLT,3,15,222,1
2,2,2400,LAX,DFW,3,20,165,1
3,2,2466,SFO,DFW,3,20,195,1
4,3,108,ANC,SEA,3,30,202,0
5,0,1094,LAX,IAH,3,30,181,1
6,4,1768,LAX,MSP,3,30,220,0
7,4,2722,PHX,DTW,3,30,228,0
8,4,2606,SFO,MSP,3,35,216,1
9,2,2538,LAS,ORD,3,40,200,1


In [8]:
df['AirportFrom'].unique()

array(['SFO', 'PHX', 'LAX', 'ANC', 'LAS', 'SLC', 'DEN', 'ONT', 'FAI',
       'BQN', 'PSE', 'HNL', 'BIS', 'IYK', 'EWR', 'BOS', 'MKE', 'GFK',
       'OMA', 'GSO', 'LMT', 'SEA', 'MCO', 'TPA', 'DLH', 'MSP', 'FAR',
       'MFE', 'MSY', 'VPS', 'BWI', 'MAF', 'LWS', 'RST', 'ALB', 'DSM',
       'CHS', 'MSN', 'JAX', 'SAT', 'PNS', 'BHM', 'LIT', 'SAV', 'BNA',
       'ICT', 'ECP', 'DHN', 'MGM', 'CAE', 'PWM', 'ACV', 'EKO', 'PHL',
       'ATL', 'PDX', 'RIC', 'BTR', 'HRL', 'MYR', 'TUS', 'SBN', 'CAK',
       'TVC', 'CLE', 'ORD', 'DAY', 'MFR', 'BTV', 'TLH', 'TYS', 'DFW',
       'FLL', 'AUS', 'CHA', 'CMH', 'LRD', 'BRO', 'CRP', 'LAN', 'PVD',
       'FWA', 'JFK', 'LGA', 'OKC', 'PIT', 'PBI', 'ORF', 'DCA', 'AEX',
       'SYR', 'SHV', 'VLD', 'BDL', 'FAT', 'BZN', 'RDM', 'LFT', 'IPL',
       'EAU', 'ERI', 'BUF', 'IAH', 'MCI', 'AGS', 'ABI', 'GRR', 'LBB',
       'CLT', 'LEX', 'MBS', 'MOD', 'AMA', 'SGF', 'AZO', 'ABE', 'SWF',
       'BGM', 'AVP', 'FNT', 'GSP', 'ATW', 'ITH', 'TUL', 'COS', 'ELP',
       'ABQ', 'SMF',

In [9]:
len(df['AirportFrom'].unique()) 


293

In [10]:
airportFrom = df['AirportFrom'].unique()
mapping = dict(zip(airportFrom, range(len(airportFrom))))
df['AirportFrom'] = df['AirportFrom'].map(mapping)
df

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,0,269,0,IAH,3,15,205,1
1,1,1558,1,CLT,3,15,222,1
2,2,2400,2,DFW,3,20,165,1
3,2,2466,0,DFW,3,20,195,1
4,3,108,3,SEA,3,30,202,0
5,0,1094,2,IAH,3,30,181,1
6,4,1768,2,MSP,3,30,220,0
7,4,2722,1,DTW,3,30,228,0
8,4,2606,0,MSP,3,35,216,1
9,2,2538,4,ORD,3,40,200,1


In [11]:
df['AirportTo'].unique()

array(['IAH', 'CLT', 'DFW', 'SEA', 'MSP', 'DTW', 'ORD', 'ATL', 'PDX',
       'JFK', 'SLC', 'HNL', 'PHX', 'MCO', 'OGG', 'LAX', 'KOA', 'ITO',
       'SFO', 'MIA', 'IAD', 'SMF', 'PHL', 'LIH', 'DEN', 'LGA', 'MEM',
       'CVG', 'YUM', 'CWA', 'MKE', 'BQN', 'FAI', 'LAS', 'ANC', 'BOS',
       'LGB', 'FLL', 'SJU', 'EWR', 'DCA', 'BWI', 'RDU', 'MCI', 'TYS',
       'SAN', 'ONT', 'OAK', 'MDW', 'BNA', 'DAL', 'CLE', 'JAX', 'JNU',
       'RNO', 'ELP', 'SAT', 'OTZ', 'MBS', 'BDL', 'STL', 'HOU', 'AUS',
       'SNA', 'SJC', 'LIT', 'TUS', 'TUL', 'CMH', 'LAN', 'IND', 'AMA',
       'CRP', 'PIT', 'RKS', 'FWA', 'TPA', 'PBI', 'JAN', 'DSM', 'ADQ',
       'GRB', 'PVD', 'ABQ', 'SDF', 'RSW', 'MSY', 'BUR', 'BOI', 'TLH',
       'BHM', 'ACV', 'ORF', 'BET', 'KTN', 'RIC', 'SRQ', 'BTR', 'XNA',
       'MHT', 'GRR', 'SBN', 'SBA', 'ROA', 'CID', 'GPT', 'MFR', 'SGU',
       'HPN', 'OMA', 'OTH', 'GSP', 'LMT', 'BUF', 'MSN', 'BFL', 'CAE',
       'HRL', 'OKC', 'SYR', 'COS', 'BTV', 'CDC', 'SCC', 'DAY', 'SJT',
       'TVC', 'ROC',

In [12]:
len(df['AirportTo'].unique())

293

In [13]:
# Using Label Encoder instead of dictionary mapping
le_portTo = LabelEncoder()
df['AirportTo'] = le_portTo.fit_transform(df['AirportTo'])
df

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,0,269,0,135,3,15,205,1
1,1,1558,1,60,3,15,222,1
2,2,2400,2,80,3,20,165,1
3,2,2466,0,80,3,20,195,1
4,3,108,3,252,3,30,202,0
5,0,1094,2,135,3,30,181,1
6,4,1768,2,197,3,30,220,0
7,4,2722,1,85,3,30,228,0
8,4,2606,0,197,3,35,216,1
9,2,2538,4,208,3,40,200,1


In [14]:
# Remove flights that have length 0
df = df[df['Length'] != 0]
df

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,0,269,0,135,3,15,205,1
1,1,1558,1,60,3,15,222,1
2,2,2400,2,80,3,20,165,1
3,2,2466,0,80,3,20,195,1
4,3,108,3,252,3,30,202,0
5,0,1094,2,135,3,30,181,1
6,4,1768,2,197,3,30,220,0
7,4,2722,1,85,3,30,228,0
8,4,2606,0,197,3,35,216,1
9,2,2538,4,208,3,40,200,1


In [15]:
from sklearn import tree
from sklearn.model_selection import train_test_split

In [16]:
model = tree.DecisionTreeClassifier()

In [17]:
train,test = train_test_split(df, test_size = 0.2, random_state = 42)

In [18]:
target = train['Delay']
train = train.drop('Delay', axis = 'columns')

In [21]:
test

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
388757,8,3657,54,238,4,575,126,0
30685,14,3228,89,80,4,1015,75,0
253130,16,363,138,79,3,993,175,1
38945,4,1302,171,197,5,505,134,1
408595,17,2191,0,243,5,660,85,0
387713,0,345,64,135,4,520,195,1
181936,17,836,197,18,6,795,50,0
421879,4,2783,93,16,6,628,161,0
148764,14,3693,56,183,4,895,160,1
50667,7,6753,2,57,5,1110,41,1


In [20]:
model.fit(train,target)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [22]:
model.score(train,target)

0.8243951953984098

In [25]:
# Predict from a test case
model.predict([[8,3857,54,238,4,575,126]])

array([0], dtype=int64)

In [26]:
model.predict([[14,3228,89,80,4,1015,75]])

array([1], dtype=int64)

In [29]:
# Build a decision tree model
param={'criterion':['gini','entropy'],'min_samples_split':[5,10], 'max_depth':[None,2],'min_samples_leaf':[1,10],'max_features':[None,'sqrt','log2']}

In [31]:
GSDT = GridSearchCV(model, param, cv=5, n_jobs=n_thread, scoring='accuracy')

NameError: name 'GridSearchCV' is not defined