In [1]:
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

In [2]:
PATH = Path('../input/flight-delays-fall-2018/')
train_df = pd.read_csv(PATH / 'flight_delays_train.csv')
# PATH = Path('../../downloads/flight-delays-fall-2018/')

In [3]:
train_df = pd.read_csv(PATH / 'flight_delays_train.csv')

# Exploratory Data Analysis

In [4]:
# Viz with Plotly
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly_express as px

# Seaborn and matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
vis_df = train_df.copy()
# Clean month, day of month and day of week
vis_df['Month'] = vis_df['Month'].str[2:].astype('int')
vis_df['DayofMonth'] = vis_df['DayofMonth'].str[2:].astype('int')
vis_df['DayOfWeek'] = vis_df['DayOfWeek'].str[2:].astype('int')
vis_df.sample(10)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
95290,2,9,3,1854,DL,ATL,DFW,732,Y
91978,2,15,3,736,AA,SMF,DFW,1431,N
78244,1,13,4,1805,DL,PHL,CVG,507,Y
58624,11,7,2,1604,OO,LAX,COS,833,Y
32533,2,16,3,809,AA,ORD,PSP,1652,N
49810,10,14,5,603,UA,SEA,ORD,1721,N
71184,9,24,6,915,HP,PHX,DFW,868,N
68915,3,12,6,1019,NW,MEM,MSP,700,Y
2396,11,23,4,718,WN,PHX,HOU,1020,N
69956,4,23,6,1020,XE,EWR,IND,644,Y


## 01 Check the target variable dep_delayed_15min


In [6]:
# Check the target variable
trace = [go.Bar(
            x = vis_df['dep_delayed_15min'].value_counts().index.values,
            y = vis_df['dep_delayed_15min'].value_counts().values,
            #x = levels,
            #text='Distribution of target variable',
            marker = dict(color='red', opacity=0.6)
    )]


layout = dict(title="Target variable,15 min Delay, distribution", 
              margin=dict(l=100), 
              width=400, 
              height=400)

fig = go.Figure(data=trace, layout=layout)

iplot(fig)

## 02 Plot the UniqueCarrier and their frequencies.

In [7]:
trace = [go.Bar(
    x = vis_df['UniqueCarrier'].value_counts().index.values,
    y = vis_df['UniqueCarrier'].value_counts().values,
    marker = dict(color='blue', opacity=0.6)
)]

layout = dict(title='Carrier wise flight distribution',
             width=800,
             height=400,
             xaxis = dict(title="Unique Carrier", tickmode='linear', tickangle=-45))
fig = go.Figure(data=trace, layout=layout)

iplot(fig)

## 03 Plot the UniqueCarrier and Delay

In [8]:
trace1 = go.Bar(
    x = vis_df[vis_df['dep_delayed_15min'] == "Y"]['UniqueCarrier'].value_counts().index.values,
    y = vis_df[vis_df['dep_delayed_15min'] == "Y"]['UniqueCarrier'].value_counts().values,
    name= 'Yes',
    marker = dict(color='red', opacity=0.6)
)

trace2 = go.Bar(
    x = vis_df[vis_df['dep_delayed_15min'] == "N"]['UniqueCarrier'].value_counts().index.values,
    y = vis_df[vis_df['dep_delayed_15min'] == "N"]['UniqueCarrier'].value_counts().values,
    name= 'No',
    marker = dict(color='grey', opacity=0.6)
)

layout = go.Layout(title= 'Carrier wise flight distribution by Delay',
                   xaxis=dict(title='Unique Carrier', tickangle=-45),
                  barmode='group')

data = [trace1, trace2]

fig = go.Figure(data=data, layout=layout)
iplot(fig)

## 04 Plot histogram for Distance.

In [9]:
# data = [go.Histogram(
#     x = vis_df['Distance'].value_counts().index.values,
#     y = vis_df['Distance'].value_counts().values
# )]

data = [go.Histogram(
    x = vis_df['Distance']
)]
iplot(data)

## 05 Plot the histogram of Distance with Delay

In [10]:
# trace1 = go.Histogram(
# )
trace1 = go.Histogram(
    x = vis_df[vis_df['dep_delayed_15min'] == 'Y']['Distance'],
    marker=dict(color='red', opacity=0.6),
    name = 'Yes'
)
trace2 = go.Histogram(
    x = vis_df[vis_df['dep_delayed_15min'] == 'N']['Distance'],
    marker=dict(color='blue', opacity=0.6),
    name = 'No'
)
layout = go.Layout(title='Distance traveled and Delay', barmode='overlay')
data=[trace1, trace2]
fig = go.Figure(data=data, layout=layout)
iplot(fig)

## Create new features

In [11]:
vis_df['Dep_hour'] =  vis_df['DepTime']//100
vis_df['Dep_minute'] =  vis_df['DepTime']%100
vis_df['Dep_hour'].replace(to_replace=[24,25], value=0, inplace=True)
vis_df.sample(10)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,Dep_hour,Dep_minute
29121,5,18,3,1930,MQ,DFW,PNS,604,N,19,30
61723,10,25,2,1630,MQ,DFW,SGF,364,N,16,30
38521,10,29,7,1233,TZ,OGG,OAK,2349,Y,12,33
72602,12,3,6,1600,EV,SAT,MCO,1040,N,16,0
75232,1,4,2,1240,HP,LAS,JFK,2248,Y,12,40
81207,11,17,5,842,UA,LAS,DEN,629,N,8,42
5785,11,16,3,1415,XE,EWR,SDF,642,N,14,15
33846,6,19,7,1631,AA,MIA,ORD,1197,Y,16,31
12936,10,21,5,2108,OH,CVG,MKE,318,N,21,8
14404,10,11,3,1559,FL,MCO,ATL,403,N,15,59


## 06 Plot the departure hour and the delay

In [12]:
df_t = pd.crosstab(vis_df.Dep_hour,vis_df.dep_delayed_15min)
trace1 = go.Scatter(
    x = df_t.index,
    y = df_t.N,
    mode = 'lines+markers',
    name='No Delays')
trace2 = go.Scatter(
    x = df_t.index,
    y = df_t.Y,
    mode = 'lines+markers',
    name='Delays')
data = [trace1, trace2]
layout = go.Layout(title='Departure hour and Delay')
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [13]:
vis_df['Dep_hour_flag'] = ((vis_df['Dep_hour'] >= 6) & (vis_df['Dep_hour'] < 23)).astype("int")
df_t = pd.crosstab(vis_df.Dep_hour_flag, vis_df.dep_delayed_15min)
df_t

dep_delayed_15min,N,Y
Dep_hour_flag,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2272,629
1,78684,18415


# Feature Engineering

In [14]:
def seasonify(monthvalues):
    summer = []
    for i in monthvalues:
        if i in ['c-6','c-7','c-8']:
            summer.append('c-1')
        elif i in ['c-9','c-10','c-11']:
            summer.append('c-2')
        elif i in ['c-12','c-1','c-2']:
            summer.append('c-3')
        elif i in ['c-3','c-4','c-5']:
            summer.append('c-4')
    return summer

In [15]:
encoder = LabelEncoder()
test_df = pd.read_csv(PATH / 'flight_delays_test.csv')
# test_df = test_df.drop('Dest', axis=1)
# test_df['Dest'] = b
# test_df = test_df.drop('DayofMonth', axis=1)
# test_df['UniqueCarrier'] = encoder.fit_transform(test_df['UniqueCarrier']).astype('str')
# test_df['UniqueCarrier'] = test_df['UniqueCarrier'].map(lambda x: 'c-'+str(x+1))
test_df['flight'] = test_df['Origin'] + '-->' + train_df['Dest']
test_df['Season'] = seasonify(test_df.Month.values)

test_df['Month'] = test_df['Month'].str[2:].astype('int')
test_df['DayofMonth'] = test_df['DayofMonth'].str[2:].astype('int')
test_df['DayOfWeek'] = test_df['DayOfWeek'].str[2:].astype('int')

test_df['Dep_hour'] =  test_df['DepTime']//100
test_df['Dep_minute'] =  test_df['DepTime']%100
test_df['Dep_hour'].replace(to_replace=[24,25], value=0, inplace=True)
test_df['Dep_hour_flag'] = ((test_df['Dep_hour'] >= 2) & (test_df['Dep_hour'] < 21)).astype("int")
test_df['Dep_week_flag'] = ((test_df['DayOfWeek']>=4) & (test_df['DayOfWeek']<=5)).astype("int").astype('str')

test_df['Dep_dom_flag'] = (((test_df['DayofMonth']>=14)&(test_df['DayofMonth']<=17))|((test_df['DayofMonth']>=21)&(test_df['DayofMonth']<=22))|(test_df['DayofMonth']==26)|(test_df['DayofMonth']==28)).astype('int').astype('str')
test_df['Dep_month_flag'] = ((test_df['Month']>=6)&(test_df['Month']<=8)| (test_df['Month']==11)).astype('int').astype('str')

test_df['Dep_hour'] =  test_df['Dep_hour'].astype('str')
test_df['Dep_minute'] =  test_df['Dep_minute'].astype('str')
test_df['Month'] = test_df['Month'].astype('str').map(lambda x: 'c-'+x)
test_df['DayofMonth'] = test_df['DayofMonth'].astype('str').map(lambda x: 'c-'+x)
test_df['DayOfWeek'] = test_df['DayOfWeek'].astype('str').map(lambda x: 'c-'+x)

# test_df['Dep_hour_flag'] = test_df['Dep_hour_flag'].map(lambda x: 'c-'+str(x))
# test_df = test_df.drop(['DayofMonth'], axis=1)
# test_df = test_df.drop(['Dep_hour','Dep_minute'], axis=1)

# test_df['Dep_hour'] = test_df['Dep_hour'].map(lambda x: 'h-'+str(x))
# test_df['Dep_minute'] = test_df['Dep_minute'].map(lambda x: 'm-'+str(x))
test_df

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,flight,Season,Dep_hour,Dep_minute,Dep_hour_flag,Dep_week_flag,Dep_dom_flag,Dep_month_flag
0,c-7,c-25,c-3,615,YV,MRY,PHX,598,MRY-->DFW,c-1,6,15,1,0,0,1
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235,LAS-->MCO,c-4,7,39,1,0,1,0
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577,GSP-->CLE,c-3,6,51,1,0,0,0
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377,BWI-->MEM,c-4,16,14,1,0,0,0
4,c-6,c-6,c-3,1505,UA,ORD,STL,258,ORD-->OMA,c-1,15,5,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,c-6,c-5,c-2,852,WN,CRP,HOU,187,CRP-->RDD,c-1,8,52,1,0,0,1
99996,c-11,c-24,c-6,1446,UA,ORD,LAS,1515,ORD-->DAB,c-2,14,46,1,0,0,1
99997,c-1,c-30,c-2,1509,OO,ORD,SGF,438,ORD-->IAH,c-3,15,9,1,0,0,0
99998,c-1,c-5,c-5,804,DL,LGA,ATL,761,LGA-->GGG,c-3,8,4,1,1,0,0


In [16]:
dft = df.copy()
dft['Month'] = dft['Month'].str[2:].astype('int')
dft = dft.rename(columns={'Month':'Season'})
# dft['Season'] = seasonify(dft.Season.values)
trace1 = go.Bar(
    x = dft[dft['dep_delayed_15min'] == 'Y']['Season'].value_counts().index.values,
    y = dft[dft['dep_delayed_15min'] == 'Y']['Season'].value_counts().values,
    name = 'Delay'
#     marker= dict(color=)
)
trace2 = go.Bar(
    x = dft[dft['dep_delayed_15min'] == 'N']['Season'].value_counts().index.values,
    y = dft[dft['dep_delayed_15min'] == 'N']['Season'].value_counts().values,
    name = 'No Delay'
)
data = [trace1, trace2]
layout = go.Layout(title='Season and delay')
fig = go.Figure(data=data, layout=layout)
iplot(fig)

NameError: name 'df' is not defined

In [17]:
train_df['flight'] = train_df['Origin'] + '-->' + train_df['Dest']

In [18]:
df = train_df.copy()

df['Season'] = seasonify(df.Month.values)

# encoder = LabelEncoder()
# df.UniqueCarrier.nunique()

# df['UniqueCarrier'] = encoder.fit_transform(df['UniqueCarrier']).astype('str')
# df['UniqueCarrier'] = df['UniqueCarrier'].map(lambda x: 'c-'+str(x+1))

# df['DepTime'] = df['DepTime'].map(lambda x: x//100)
# df['DepTime'] = df['DepTime'].map(lambda x: 'c-'+str(x))


# df = df.drop(['DayofMonth','Origin','Dest'], axis=1)
# df['summer'] = (df['Month'].isin(['c-6','c-7','c-8']).astype(np.int32))
# df['autumn'] = (df['Month'].isin(['c-9','c-10','c-11']).astype(np.int32))
# df['winter'] = (df['Month'].isin(['c-12','c-1','c-2']).astype(np.int32))
# df['spring'] = (df['Month'].isin(['c-3','c-4','c-5']).astype(np.int32))

df['Month'] = df['Month'].str[2:].astype('int')
df['DayofMonth'] = df['DayofMonth'].str[2:].astype('int')
df['DayOfWeek'] = df['DayOfWeek'].str[2:].astype('int')

df['Dep_hour'] =  df['DepTime']//100
df['Dep_minute'] =  df['DepTime']%100
df['Dep_hour'].replace(to_replace=[24,25], value=0, inplace=True)
df['Dep_hour_flag'] = ((df['Dep_hour'] >= 2) & (df['Dep_hour'] < 21)).astype("int").astype('str')
df['Dep_week_flag'] = ((df['DayOfWeek']>=4) & (df['DayOfWeek']<=5)).astype("int").astype('str')

df['Dep_dom_flag'] = (((df['DayofMonth']>=14)&(df['DayofMonth']<=17))|((df['DayofMonth']>=21)&(df['DayofMonth']<=22))|(df['DayofMonth']==26)|(df['DayofMonth']==28)).astype('int').astype('str')
df['Dep_month_flag'] = ((df['Month']>=6)&(df['Month']<=8)|(df['Month']==11)).astype('int').astype('str')

df['Month'] = df['Month'].astype('str').map(lambda x: 'c-'+x)
df['DayofMonth'] = df['DayofMonth'].astype('str').map(lambda x: 'c-'+x)
df['DayOfWeek'] = df['DayOfWeek'].astype('str').map(lambda x: 'c-'+x)

df['Dep_hour'] = df['Dep_hour'].astype('str')
df['Dep_minute'] = df['Dep_minute'].astype('str')
# df['Dep_hour_flag'] = df['Dep_hour_flag'].map(lambda x: 'c-'+str(x))
# df['Dep_hour'] = df['Dep_hour'].map(lambda x: 'h-'+str(x))
# df['Dep_minute'] = df['Dep_minute'].map(lambda x: 'm-'+str(x))


# df = df.drop(['DayofMonth'], axis=1)
# df = df.drop(['Dep_hour','Dep_minute'], axis=1)
df

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,flight,Season,Dep_hour,Dep_minute,Dep_hour_flag,Dep_week_flag,Dep_dom_flag,Dep_month_flag
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N,ATL-->DFW,c-1,19,34,1,0,1,1
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N,PIT-->MCO,c-4,15,48,1,0,0,0
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N,RDU-->CLE,c-2,14,22,1,1,0,0
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N,DEN-->MEM,c-2,10,15,1,0,0,1
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y,MDW-->OMA,c-2,18,28,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,c-5,c-4,c-3,1618,OO,SFO,RDD,199,N,SFO-->RDD,c-4,16,18,1,0,0,0
99996,c-1,c-18,c-3,804,CO,EWR,DAB,884,N,EWR-->DAB,c-3,8,4,1,0,0,0
99997,c-1,c-24,c-2,1901,NW,DTW,IAH,1076,N,DTW-->IAH,c-3,19,1,1,0,0,0
99998,c-4,c-27,c-4,1515,MQ,DFW,GGG,140,N,DFW-->GGG,c-4,15,15,1,1,0,0


In [19]:
categ_feat_idx = np.where(df.drop('dep_delayed_15min', axis=1).dtypes == 'object')[0]
categ_feat_idx
# condition = True
# cat_features_names = [col for col in df.columns if condition]
# cat_features = [df.columns.get_loc(col) for col in cat_features_names]
# print(cat_features)

array([ 0,  1,  2,  4,  5,  6,  8,  9, 10, 11, 12, 13, 14, 15])

In [20]:
X_train = df.drop('dep_delayed_15min', axis=1).values
y_train = df['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values
X_test = test_df.values

In [21]:
X_train_part, X_valid, y_train_part, y_valid = train_test_split(X_train, y_train, 
                                                                test_size=0.3, 
                                                                random_state=17)

In [22]:
params = {'loss_function':'Logloss',
          'eval_metric':'AUC',
          'cat_features': categ_feat_idx,
          'task_type': 'GPU',
          'early_stopping_rounds': 200,
          'verbose': 200,
          'random_seed': 17
         }

ctb = CatBoostClassifier(**params)
ctb.fit(X_train_part, y_train_part,
          eval_set=(X_valid, y_valid),
          use_best_model=True,
          plot=True
         );

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.121895
0:	learn: 0.7012890	test: 0.7112428	best: 0.7112428 (0)	total: 50.3ms	remaining: 50.2s
200:	learn: 0.8083470	test: 0.7785960	best: 0.7785960 (200)	total: 7.77s	remaining: 30.9s
400:	learn: 0.8310754	test: 0.7814431	best: 0.7814807 (393)	total: 15.2s	remaining: 22.7s
600:	learn: 0.8507020	test: 0.7831319	best: 0.7832960 (573)	total: 22.7s	remaining: 15s
bestTest = 0.783295989
bestIteration = 573
Shrink model to first 574 iterations.


In [23]:
ctb_valid_pred = ctb.predict_proba(X_valid)[:, 1]

In [24]:
roc_auc_score(y_valid, ctb_valid_pred)

0.7832960455077708

In [25]:
from catboost import Pool

train_data = Pool(data=X_train_part,
                  label=y_train_part,
                  cat_features=categ_feat_idx
                 )

valid_data = Pool(data=X_valid,
                  label=y_valid,
                  cat_features=categ_feat_idx
                 )

params = {'loss_function':'Logloss',
          'eval_metric':'AUC',
          'task_type': 'GPU',
          'early_stopping_rounds': 200,
          'verbose': 200,
          'random_seed': 17
         }

cbc = CatBoostClassifier(**params)
cbc.fit(train_data, # instead of X_train, y_train
          eval_set=valid_data, # instead of (X_valid, y_valid)
          use_best_model=True, 
          plot=True
         );

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.121895
0:	learn: 0.7012890	test: 0.7112428	best: 0.7112428 (0)	total: 41.4ms	remaining: 41.3s
200:	learn: 0.8075884	test: 0.7784454	best: 0.7784454 (200)	total: 7.98s	remaining: 31.7s
400:	learn: 0.8307686	test: 0.7808055	best: 0.7809621 (392)	total: 15.7s	remaining: 23.5s
600:	learn: 0.8507219	test: 0.7823200	best: 0.7824004 (593)	total: 23.2s	remaining: 15.4s
800:	learn: 0.8680646	test: 0.7825583	best: 0.7826480 (729)	total: 30.6s	remaining: 7.6s
999:	learn: 0.8831264	test: 0.7827274	best: 0.7827274 (999)	total: 38.3s	remaining: 0us
bestTest = 0.7827274203
bestIteration = 999
Shrink model to first 1000 iterations.


In [26]:
from catboost import cv

params = {'loss_function':'Logloss',
          'eval_metric':'AUC',
          'task_type': 'GPU',
          'early_stopping_rounds': 200,
          'verbose': 200,
          'random_seed': 17
         }

all_train_data = Pool(data=X_train,
                      label=y_train,
                      cat_features=categ_feat_idx
                     )

scores = cv(pool=all_train_data,
            params=params, 
            fold_count=4,
            seed=17, 
            shuffle=True,
            stratified=True, # if True the folds are made by preserving the percentage of samples for each class
            plot=True
           )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.7237412	best: 0.7237412 (0)
200:	test: 0.7758683	best: 0.7758683 (200)
400:	test: 0.7847815	best: 0.7847815 (400)
600:	test: 0.7884706	best: 0.7884706 (600)
800:	test: 0.7903611	best: 0.7903611 (800)
999:	test: 0.7913546	best: 0.7913546 (999)	total: 6m 8s	remaining: 0us


In [27]:
cbc_valid_pred = cbc.predict_proba(X_valid)[:, 1]
roc_auc_score(y_valid, cbc_valid_pred)

0.7827274885293919

In [28]:
cbc.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,3,12.635611
1,10,12.504672
2,4,11.001047
3,8,9.068821
4,11,8.942928
5,5,8.228472
6,6,7.700395
7,1,7.40468
8,0,6.098128
9,2,5.73559


In [29]:
cbc.fit(X_train, y_train, #ctb
        cat_features=categ_feat_idx,
        eval_set=(X_valid, y_valid),
        use_best_model=True,
        plot=True
        );

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.126547
0:	learn: 0.7104962	test: 0.8050192	best: 0.8050192 (0)	total: 44.7ms	remaining: 44.6s
200:	learn: 0.8146564	test: 0.9666928	best: 0.9667288 (198)	total: 7.6s	remaining: 30.2s
400:	learn: 0.8319974	test: 0.9722978	best: 0.9724752 (386)	total: 15s	remaining: 22.4s
600:	learn: 0.8463284	test: 0.9781601	best: 0.9783463 (597)	total: 22.5s	remaining: 15s
bestTest = 0.9783462882
bestIteration = 597
Shrink model to first 598 iterations.


In [30]:
cbc_test_pred = cbc.predict_proba(X_test)[:, 1] #or ctb
# X_test[0]

In [31]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    sample_sub = pd.read_csv(PATH / 'sample_submission.csv', 
                             index_col='id')
    sample_sub['dep_delayed_15min'] = cbc_test_pred #or ctb
    sample_sub.to_csv('ctb_pred.csv')