In [2]:
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
# import xgboost as xgb

In [19]:
train_df = pd.read_csv('competitions/data/flight_delays_train.csv')
test_df = pd.read_csv('competitions/data/flight_delays_test.csv')

In [20]:
#Flight
train_df['flight'] = train_df['Origin'] + '-->' + train_df['Dest']
test_df['flight'] = test_df['Origin'] + '-->' + test_df['Dest']

#Hour, minute
train_df['hour'] = train_df['DepTime'] // 100
train_df.loc[train_df['hour'] == 24, 'hour'] = 0
train_df.loc[train_df['hour'] == 25, 'hour'] = 1
train_df['minute'] = train_df['DepTime'] % 100

test_df['hour'] = test_df['DepTime'] // 100
test_df.loc[test_df['hour'] == 24, 'hour'] = 0
test_df.loc[test_df['hour'] == 25, 'hour'] = 1
test_df['minute'] = test_df['DepTime'] % 100

#Season 
train_df['summer'] = (train_df['Month'].isin(['c-6', 'c-7', 'c-8'])).astype('int')
train_df['autumn'] = (train_df['Month'].isin(["c-9", "c-10", 'c-11'])).astype('int')
train_df['winter'] = (train_df['Month'].isin(["c-12", "c-1", 'c-2'])).astype('int')
train_df['spring'] = (train_df['Month'].isin(["c-3", "c-4", "c-5"])).astype('int')

test_df['summer'] = (test_df['Month'].isin(["c-6", "c-7", "c-8"])).astype('int')
test_df['autumn'] = (test_df['Month'].isin(["c-9", "c-10", 'c-11'])).astype('int')
test_df['winter'] = (test_df['Month'].isin(["c-12", "c-1", 'c-2'])).astype('int')
test_df['spring'] = (test_df['Month'].isin(["c-3", "c-4", "c-5"])).astype('int')

#weekend
def weekend(day):
    if day in ['c-6', 'c-7']:
        return 1
    else:
        return 0

train_df['Weekend'] = train_df['DayOfWeek'].apply(weekend)
test_df['Weekend'] = test_df['DayOfWeek'].apply(weekend)


In [21]:
#UniqueCarrier
def carrier_quality(x):    
    if x <= 10:
        return 'Good'
    if 17.4875 >= x > 10:
        return 'Average'
    else:
        return 'Bad'

total_flights_by_carrier = train_df.groupby(['UniqueCarrier']).count().sort_values(by='dep_delayed_15min', ascending=False)['dep_delayed_15min']
number_of_delays_by_carrier = train_df[train_df['dep_delayed_15min'] == 'Y'].groupby(['UniqueCarrier']).count().sort_values(by='dep_delayed_15min', ascending=False)['dep_delayed_15min']
prcnt_of_delays = ((number_of_delays_by_carrier*100)/total_flights_by_carrier).sort_values(ascending=False)
prcnt_of_delays = prcnt_of_delays.reset_index().rename({'dep_delayed_15min':'% of delayed flights'}, axis=1)
prcnt_of_delays['carrier_quality'] = prcnt_of_delays['% of delayed flights'].apply(carrier_quality)
train_df['carrier_quality'] = train_df.merge(prcnt_of_delays, on='UniqueCarrier')

ValueError: Wrong number of items passed 19, placement implies 1

In [22]:
train_df['dep_delayed_15min'].map({'N': 0, 'Y': 1})

0        0
1        0
2        0
3        0
4        1
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: dep_delayed_15min, Length: 100000, dtype: int64

In [23]:
X = train_df.drop('dep_delayed_15min', axis = 1)
y = train_df['dep_delayed_15min'].map({'N': 0, 'Y': 1})

In [24]:
X.head(1)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,flight,hour,minute,summer,autumn,winter,spring,Weekend
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,ATL-->DFW,19,34,1,0,0,0,1


In [25]:
idx_cat = [0, 1, 2, 4, 5, 6, 7, 8, 9]

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 17)
X_train.columns[idx_cat]

Index(['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest',
       'Distance', 'flight', 'hour'],
      dtype='object')

In [27]:
ctb = CatBoostClassifier(random_seed=17, silent=True)

In [28]:
ctb.fit(X_train, y_train, cat_features= idx_cat)

<catboost.core.CatBoostClassifier at 0x2a156066fc8>

In [29]:
ctb_valid_pred = ctb.predict_proba(X_test)[:, 1]

In [30]:
roc_auc_score(y_test, ctb_valid_pred)

0.7998410535785886

In [32]:
train_df['Distance'].apply(np.log)

0        6.595781
1        6.726233
2        6.030685
3        6.770789
4        6.047372
           ...   
99995    5.293305
99996    6.784457
99997    6.981006
99998    4.941642
99999    6.405228
Name: Distance, Length: 100000, dtype: float64