In [209]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import os
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix, hstack, vstack
from functools import lru_cache
from collections import Counter
from itertools import combinations

from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, Ridge
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import BaggingClassifier

from xgboost import XGBClassifier

In [199]:
RAND = 123
PATH_TO_DATA = 'data/'
PATH_TO_SUBMIT = 'submissions/'

In [200]:
def log(*args, **kargs):
    pass

def write_to_submission_file(prediction, out_file):
    pd.Series(prediction, name='dep_delayed_15min') \
        .to_csv(PATH_TO_SUBMIT + '/' + out_file, index_label='id', header=True)
    
def submit(model, X, y, X_test, name='submit'):
    for ind in range(1, 1000):
        name_with_ind = name + '_' + str(ind)
        if name_with_ind not in os.listdir(PATH_TO_SUBMIT):
            name = name_with_ind
            break
    print(name)
    
    model.fit(X, y)
    predict = model.predict_proba(X_test)[:,1]
    log(X=X, model=model, status='submit {}'.format(name))
    write_to_submission_file(predict, name)

In [201]:
train = pd.read_csv(os.path.join(PATH_TO_DATA, 'flight_delays_train.csv'))
test = pd.read_csv(os.path.join(PATH_TO_DATA, 'flight_delays_test.csv'))

In [202]:
categorical_feature_names = ['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']
continuous_feature_name = ['DepTime', 'Distance']

In [203]:
train.describe()

Unnamed: 0,DepTime,Distance
count,100000.0,100000.0
mean,1341.52388,729.39716
std,476.378445,574.61686
min,1.0,30.0
25%,931.0,317.0
50%,1330.0,575.0
75%,1733.0,957.0
max,2534.0,4962.0


In [204]:
train.drop(continuous_feature_name, axis=1).describe()

Unnamed: 0,Month,DayofMonth,DayOfWeek,UniqueCarrier,Origin,Dest,dep_delayed_15min
count,100000,100000,100000,100000,100000,100000,100000
unique,12,31,7,22,289,289,2
top,c-8,c-1,c-4,WN,ATL,ATL,N
freq,8830,3399,14736,15082,5834,5795,80956


In [205]:
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [206]:
test.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


In [207]:
train_df = train.drop('dep_delayed_15min', axis=1)
y_train = train['dep_delayed_15min'].map({'Y': 1, 'N': 0})

In [208]:
SOFT_K = 25.6
THRESHOLD = 24

class CustomTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, soft_k=SOFT_K, threshold=THRESHOLD):
        self.soft_k = soft_k
        self.threshold = threshold

    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            X = X.values
            
        if isinstance(y, pd.DataFrame):
            y = y.values
            
        self.cnt = Counter()
        total_mean = np.mean(y)
        for i in range(X.shape[1]):
            small_ind = []
            small_sum = 0
            small_count = 0
            for j in np.unique(X[:, i]):
                indices = X[:, i] == j
                subset_count = indices.sum()
                y_subset_sum = y[indices].sum()

                if subset_count >= self.threshold:
                    val = (y_subset_sum + total_mean * self.soft_k) / \
                          (self.soft_k + subset_count)
                    self.cnt[(i, j)] = val
                else:
                    small_ind.append(j)
                    small_sum += y_subset_sum
                    small_count += subset_count

            val_small = (small_sum + total_mean * self.soft_k) / \
                        (self.soft_k + small_count)
            for j in small_ind:
                self.cnt[(i, j)] = val_small

        return self

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
            
        X_new = np.copy(X)
        
        for i in range(X.shape[1]):
            for j in np.unique(X[:, i]):
                indices = X[:, i] == j
                if np.sum(indices) > 0:
                    X_new[indices, i] = self.cnt[(i, j)]
        return X_new

In [298]:
class FastCustomTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, soft_k=SOFT_K, threshold=THRESHOLD):
        self.soft_k = soft_k
        self.threshold = threshold

    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            X = X.values
            
        if isinstance(y, pd.DataFrame) or isinstance(y, pd.Series):
            y = y.values
            
        self.cnt = Counter()
        self.small_cnt = Counter()
        
        total_mean = np.mean(y)
        total_sum = y.sum()
        total_count = y.sum()
        for i in range(X.shape[1]):
            cur_x = X[:, i]
            big_sum = 0
            big_count = 0
            for j, subset_count in zip(*np.unique(cur_x, return_counts=True)):
                if subset_count >= self.threshold:
                    indices = cur_x == j
                    y_subset_sum = y[indices].sum()

                    big_count += subset_count
                    big_sum += y_subset_sum
                    val = (y_subset_sum + total_mean * self.soft_k) / \
                          (self.soft_k + subset_count)
                    self.cnt[(i, j)] = val
                    
            small_sum = total_sum - big_sum
            small_count = total_count - big_count
            
            val_small = (small_sum + total_mean * self.soft_k) / \
                        (self.soft_k + small_count)
            self.small_cnt[i] = val_small

        return self

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
            
        X_new = np.copy(X)
        
        for i in range(X.shape[1]):
            cur_x = X[:, i]
            for j in np.unique(cur_x):
                key = (i, j)
                indices = cur_x == j
                X_new[indices, i] = self.cnt.get(key, self.small_cnt[i])
        return X_new
    
    def fit_transform(self, X, y):
        self.fit(X, y)
        

In [316]:
class Extract_columns(TransformerMixin, BaseEstimator):
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.columns]

In [317]:
class Pairs_features(TransformerMixin, BaseEstimator):
    def __init__(self, columns=None):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        new_X = X.copy()
        columns = X.columns
        if self.columns is not None:
            columns = columns & self.columns
        for a, b in combinations(columns, 2):
            name = '{}__{}'.format(a, b)
            new_X[name] = X[a] + '_|_' + X[b]
                
        return new_X

In [318]:
class Extract_columns(TransformerMixin, BaseEstimator):
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.columns]

In [331]:
feature_extraction_model = FeatureUnion([
    ('continuous', make_pipeline(
        Extract_columns(continuous_feature_name),
        StandardScaler()
    )),
    ('categorical', make_pipeline(
        Extract_columns(categorical_feature_names),
        FastCustomTransformer(threshold=30, soft_k=300)
    )),
    ('categorical_', make_pipeline(
        Extract_columns(['Origin', 'Dest']),
        Pairs_features(),
        FastCustomTransformer(threshold=20, soft_k=500)
    ))
])

model = Pipeline([
    ('features', feature_extraction_model),
    ('model', XGBClassifier(n_estimators=300, n_jobs=2, random_state=RAND))
])

In [332]:
%%time
cvs = cross_val_score(model, X_train, y_train, cv=3, scoring='roc_auc')
print(cvs.mean())

0.723621292872
CPU times: user 1min 37s, sys: 481 ms, total: 1min 38s
Wall time: 1min 22s


In [None]:
0.722674444204

In [222]:
0.73643498054348788

0.7364349805434879

In [197]:
submit(model, X_train, y_train, test)

submit_4
