In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn import preprocessing
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import recall_score
from sklearn import linear_model
from sklearn.metrics import confusion_matrix

  from collections import Sequence


In [2]:
airline = pd.read_csv(r'/Users/admin/Documents/Supervised_learning/Supervised_learning/2008.csv')

In [3]:
sample = airline.sample(frac = 0.5, replace=True, random_state = 108)

In [4]:
sample.shape

(3504864, 29)

In [5]:
sample.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
5107811,2008,9,30,2,750.0,750,943.0,1015,US,1477,...,5.0,16.0,0,,0,,,,,
4160279,2008,7,24,4,1950.0,1950,2157.0,2159,AS,332,...,7.0,14.0,0,,0,,,,,
5087377,2008,9,13,6,929.0,937,1045.0,1034,US,1711,...,5.0,28.0,0,,0,,,,,
1092043,2008,2,18,1,1346.0,1350,1448.0,1445,AA,1853,...,7.0,15.0,0,,0,,,,,
46446,2008,1,17,4,1013.0,1015,1222.0,1225,WN,790,...,4.0,14.0,0,,0,,,,,


# Train_Test_Split

In [6]:
sample['late_arrival'] = np.where(sample['ArrDelay'] >=30, 1, 0)

In [7]:
sample['CRSDepTime_mod'] =sample['CRSDepTime'] //100

In [8]:
X =sample

In [9]:
X= X.dropna(axis=1)

In [10]:
X.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,Origin,Dest,Distance,Cancelled,Diverted,late_arrival,CRSDepTime_mod
5107811,2008,9,30,2,750,1015,US,1477,PHL,LAS,2176,0,0,0,7
4160279,2008,7,24,4,1950,2159,AS,332,SEA,SJC,697,0,0,0,19
5087377,2008,9,13,6,937,1034,US,1711,ILM,CLT,185,0,0,0,9
1092043,2008,2,18,1,1350,1445,AA,1853,DFW,COS,592,0,0,0,13
46446,2008,1,17,4,1015,1225,WN,790,LAS,PHX,256,0,0,0,10


In [11]:
Y=X['late_arrival']

In [12]:
X =X.loc[:, ~ X.columns.isin(['late_arrival'])]

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=108)

# Converting Categorical Variables into Probabilities of Being Late

In [14]:
def converter(df_orig, column, df_dest):
    dictionary = {}

    keys = pd.DataFrame(df_orig[column].value_counts(normalize=True)).index.values
    values = np.array(df_orig[column].value_counts(normalize=True))

    dictionary = dict(zip(keys, values))
    series = df_dest[column].map(dictionary)
    return  series

# Dealing with Class Imbalance (Manual Approach)

In [15]:
late = X_train[Y_train==1]
ontime = X_train[Y_train==0]

In [16]:
late.shape

(369975, 14)

In [17]:
ontime.shape

(2433916, 14)

In [18]:
late_resampled = resample(late,replace=True, n_samples=300000, random_state=108)
ontime_resampled = resample(ontime,replace=True, n_samples=300000, random_state=108)

In [19]:
df_balanced= pd.concat([late_resampled, ontime_resampled])

In [20]:
df_balanced['Mod_CRSDepTime'] = converter(late_resampled, 'CRSDepTime_mod', df_balanced)
df_balanced['Mod_Month'] = converter(late_resampled, 'Month', df_balanced)
df_balanced['Mod_DayOfWeek'] = converter(late_resampled, 'DayOfWeek', df_balanced)
df_balanced['Mod_DayofMonth'] = converter(late_resampled, 'DayofMonth', df_balanced)
df_balanced['Mod_UniqueCarrier'] = converter(late_resampled, 'UniqueCarrier', df_balanced)
#df_balanced['Mod_Dest'] = converter(late_resampled, 'Dest', df_balanced)
#df_balanced['Mod_Origin'] = converter(late_resampled, 'Origin', df_balanced)

In [22]:
late_resampled['late_arrival'] = [1] *300000
ontime_resampled['late_arrival'] = [0]*300000

#recreating the Y_train_balanced for validation purposes from the training set
df_Y_b = pd.concat([late_resampled, ontime_resampled])
Y_train_balanced = df_Y_b['late_arrival']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [23]:
X_train_balanced = df_balanced.loc[:, ~ df_balanced.columns.isin(['Year','Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
                       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'TailNum', 'UniqueCarrier',
                        'ActualElapsedTime', 'Year','Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
                        'WeatherDelay', 'NASDelay', 'SecurityDelay',  'CRSDepTime','Month', 
                        'TailNum', 'UniqueCarrier', 'Origin', 'Dest', 'DepTime', 'ArrTime', 'DayOfWeek','CRSElapsedTime','CRSArrTime',
                        'Dest', 'DepTime', 'ArrTime', 'ActualElapsedTime', 'ArrDelay', 'DayofMonth', 'DepDelay', 'TaxiIn', 'TaxiOut',
                        '9E', 'AA', 'AQ', 'AS', 'B6', 'CO', 'DL', 'EV', 'F9', 'FL', 'HA', 'MQ', 'NW',
                        'OH', 'OO', 'UA', 'US', 'WN', 'XE', 'YV', 'FlightNum', 
                        'Month_mod', 'DayOfWeek_mod', 'CRSDepTime_mod'])]

In [24]:
X_train_balanced.columns

Index(['Distance', 'Mod_CRSDepTime', 'Mod_Month', 'Mod_DayOfWeek',
       'Mod_DayofMonth', 'Mod_UniqueCarrier'],
      dtype='object')

In [25]:
Y_train_balanced.isnull().values.any()

False

In [26]:
X_test['Mod_CRSDepTime'] = converter(late, 'CRSDepTime_mod', X_test)
X_test['Mod_Month'] = converter(late, 'Month', X_test)
X_test['Mod_DayOfWeek'] = converter(late, 'DayOfWeek', X_test)
X_test['Mod_DayofMonth'] = converter(late, 'DayofMonth', X_test)
X_test['Mod_UniqueCarrier'] = converter(late, 'UniqueCarrier', X_test)
#X_test['Mod_Dest'] = converter(late, 'Dest', X_test)
#X_test['Mod_Origin'] = converter(late, 'Origin', X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [27]:
X_train_balanced.isnull().values.any()

False

# Random Forest

In [29]:
roc_auc_scores = []
parameters = []

est_number = [100, 500,700] 

for value in est_number:
    rfc = RandomForestClassifier(n_jobs = -1, n_estimators = value)
    roc_auc = np.mean(cross_val_score(rfc, X_train_balanced, Y_train_balanced, scoring = 'roc_auc', n_jobs=-1))
    roc_auc_scores.append(roc_auc)
    parameters.append(value)

In [34]:
df = pd.DataFrame(roc_auc_scores)
df['params'] = parameters
df.columns=['roc_auc_scores', 'params']

df.sort_values(by = 'roc_auc_scores', ascending=False).reset_index(drop=True)

Unnamed: 0,roc_auc_scores,params
0,0.845194,700
1,0.844948,500
2,0.843049,100


In [35]:
roc_auc_scores = []
parameters = []

depth = [8, 20, 50] 

for value in depth:
    rfc = RandomForestClassifier(n_jobs = -1, n_estimators = 700, max_depth = value)
    roc_auc = np.mean(cross_val_score(rfc, X_train_balanced, Y_train_balanced, scoring = 'roc_auc', n_jobs=-1))
    roc_auc_scores.append(roc_auc)
    parameters.append(value)

In [36]:
df = pd.DataFrame(roc_auc_scores)
df['params'] = parameters
df.columns=['roc_auc_scores', 'params']

df.sort_values(by = 'roc_auc_scores', ascending=False).reset_index(drop=True)

Unnamed: 0,roc_auc_scores,params
0,0.845194,50
1,0.819901,20
2,0.682616,8


# Gradient Boosting

In [43]:
roc_auc_scores = []
parameters = []

loss = ['deviance', 'exponential']

for value in loss:
    gbc = GradientBoostingClassifier(loss = value)
    roc_auc = np.mean(cross_val_score(gbc, X_train_balanced, Y_train_balanced, scoring = 'roc_auc', n_jobs=-1))
    roc_auc_scores.append(roc_auc)
    parameters.append(value)

In [50]:
roc_auc_scores = []
parameters = []

est_number = [100, 500,700]

for value in est_number:
    gbc = GradientBoostingClassifier(loss = 'deviance', n_estimators = value)
    roc_auc = np.mean(cross_val_score(gbc, X_train_balanced, Y_train_balanced, scoring = 'roc_auc', n_jobs=-1))
    roc_auc_scores.append(roc_auc)
    parameters.append(value)

In [51]:
df = pd.DataFrame(roc_auc_scores)
df['params'] = parameters
df.columns=['roc_auc_scores', 'params']

df.sort_values(by = 'roc_auc_scores', ascending=False).reset_index(drop=True)

Unnamed: 0,roc_auc_scores,params
0,0.708272,700
1,0.702774,500
2,0.68243,100


In [53]:
roc_auc_scores = []
parameters = []

depth = [3, 6, 8]

for value in depth:
    gbc = GradientBoostingClassifier(loss = 'deviance', n_estimators = 700, max_depth = value)
    roc_auc = np.mean(cross_val_score(gbc, X_train_balanced, Y_train_balanced, scoring = 'roc_auc', n_jobs=-1))
    roc_auc_scores.append(roc_auc)
    parameters.append(value)

In [54]:
df = pd.DataFrame(roc_auc_scores)
df['params'] = parameters
df.columns=['roc_auc_scores', 'params']

df.sort_values(by = 'roc_auc_scores', ascending=False).reset_index(drop=True)

Unnamed: 0,roc_auc_scores,params
0,0.786576,8
1,0.755304,6
2,0.708272,3


In [56]:
df = pd.DataFrame(roc_auc_scores)
df['params'] = parameters
df.columns=['roc_auc_scores', 'params']

df.sort_values(by = 'roc_auc_scores', ascending=False).reset_index(drop=True)

Unnamed: 0,roc_auc_scores,params
0,0.808078,0.5
1,0.786576,0.1
2,0.770039,0.05


# Validating on the Test Set

In [57]:
X_test_not_balanced = X_test.loc[:, ~ X_test.columns.isin(['Year','Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
                       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'TailNum', 'UniqueCarrier',
                        'ActualElapsedTime', 'Year','Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
                        'WeatherDelay', 'NASDelay', 'SecurityDelay',  'CRSDepTime','Month', 'FlightNum','CRSDepTime_mod',
                        'TailNum', 'Origin', 'Dest', 'DepTime', 'ArrTime', 'DayOfWeek','CRSElapsedTime','CRSArrTime',
                         'Dest', 'DepTime', 'ArrTime', 'ActualElapsedTime', 'ArrDelay', 'DayofMonth', 'DepDelay', 'TaxiIn', 'TaxiOut'])]

In [58]:
rfc= RandomForestClassifier(n_estimators = 700, max_depth = 50, n_jobs=-1)
rfc.fit(X_train_balanced, Y_train_balanced)
roc_auc_score(Y_test, rfc.predict_proba(X_test_not_balanced)[:, 1])

0.7138317440103987

In [62]:
gbc = GradientBoostingClassifier(n_estimators = 700, max_depth = 8, loss = 'deviance', learning_rate=0.5)

gbc.fit(X_train_balanced, Y_train_balanced)
roc_auc_score(Y_test, gbc.predict_proba(X_test_not_balanced)[:, 1])

0.7039375177245429

# Experimentation with Different Probability Settings

In [59]:
def prediction(classifier, feature_set, prob):
    y_predicted = []
    for i in classifier.predict_proba(feature_set)[:, 1]:
        if i > prob:
            y_predicted.append(1)
        else: 
            y_predicted.append(0)
    return y_predicted

y_predicted = prediction(rfc, X_test_not_balanced, 0.95)      

In [60]:
confusion_matrix(Y_test, y_predicted)

array([[605907,   2116],
       [ 90213,   2737]])

In [64]:
def prediction(classifier, feature_set, prob):
    y_predicted = []
    for i in classifier.predict_proba(feature_set)[:, 1]:
        if i > prob:
            y_predicted.append(1)
        else: 
            y_predicted.append(0)
    return y_predicted

y_predicted = prediction(rfc, X_test_not_balanced, 0.5)   

In [65]:
confusion_matrix(Y_test, y_predicted)

array([[453622, 154401],
       [ 41225,  51725]])