In [1]:
import datetime

import pandas as pd
import numpy as np

np.random.seed(0)

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec

from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.metrics import f1_score,mean_squared_error

import lightgbm as lgb

from hyperopt import hp, tpe, Trials
from hyperopt.fmin import fmin

from tqdm import tqdm

In [2]:
train_df = pd.read_csv('train_df_final.csv')
test_df = pd.read_csv('test_df_final.csv')
submission_df = pd.read_csv('sample_submission.csv')

In [3]:
train_df = train_df.fillna(value=0)
test_df = test_df.fillna(value=0)

In [4]:
train_df.head()

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,pickup_date,pickup_hour,pickup_minute,drop_date,...,duration_meter_waiting_meter_waiting_fare_anomaly,duration_meter_waiting_meter_waiting_till_pickup_anomaly,duration_meter_waiting_fare_meter_waiting_till_pickup_anomaly,meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly,additional_fare_duration_meter_waiting_meter_waiting_fare_anomaly,additional_fare_duration_meter_waiting_meter_waiting_till_pickup_anomaly,additional_fare_duration_meter_waiting_fare_meter_waiting_till_pickup_anomaly,additional_fare_meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly,duration_meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly,additional_fare_duration_meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly
0,10.5,834.0,56.0,0.0,64.0,270.32,1,0,20,1,...,1,1,1,1,1,1,1,1,1,1
1,10.5,791.0,47.0,0.0,134.0,197.85,1,0,56,1,...,1,1,1,1,1,1,1,1,1,1
2,10.5,1087.0,80.0,0.0,61.0,301.64,1,1,8,1,...,1,1,1,1,1,1,1,1,1,1
3,10.5,598.0,271.0,15.6638,68.0,82.3,1,2,27,1,...,1,1,1,1,1,1,1,1,1,1
4,10.5,3407.0,182.0,0.0,112.0,1065.02,1,5,38,1,...,1,1,1,1,1,1,1,1,1,1


In [16]:
features = list(filter(lambda each: ('anomaly' not in each) and (each != 'label'), train_df.columns))
train_X = train_df[features]
test_X = test_df[features]
y = train_df['label']

In [17]:
params = {
    'objective':'binary',
    'num_iterations':1000,
    'learning_rate':0.05,
    'seed':0,
    'early_stopping_round':50,    
    'metric':'f1',
    'verbosity':False
}

In [18]:
def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat,average='micro'), True


In [19]:
test_X.shape

(8576, 99)

In [20]:
def validation_score(params,sub_preds=False):
    validation_scores = []
    folds = 3
    skf = StratifiedKFold(n_splits=folds)
    preds = np.zeros(test_X.shape[0])
    for train_index, test_index in skf.split(train_df, y):
        X_train, X_test = train_X.iloc[train_index,:], train_X.iloc[test_index,:]
        y_train, y_test = y[train_index], y[test_index]
        train = lgb.Dataset(X_train,y_train)
        valid = lgb.Dataset(X_test,y_test)
        evals_result = {}
        model = lgb.train(params, train,num_boost_round=1000,early_stopping_rounds=50, valid_sets=valid,feval=lgb_f1_score, evals_result=evals_result,verbose_eval=False)
        validation_scores.append(f1_score(y_test,np.round(model.predict(X_test)),average='micro'))
        if sub_preds:
            preds += np.round(model.predict(test_X))
                              
    return np.mean(validation_scores), np.std(validation_scores) ,preds

In [21]:
def convert_int_params(names, params):
    for int_type in names:
        #sometimes the parameters can be choices between options or numerical values. like "log2" vs "1-10"
        raw_val = params[int_type]
        if is_number(raw_val):
            params[int_type] = int(raw_val)
    return params
def is_number(s):
    if s is None:
        return False
    try:
        float(s)
        return True
    except ValueError:
        return False
def f(params):
    to_int = ["num_leaves", "subsample_for_bin", "min_data_in_leaf"]
    params = convert_int_params(to_int, params)
#     params['boosting_type'] = params['boosting_type']['boosting_type']
    tmp = {
        'objective':'binary',    
        'seed':0,
        'metric':'f1',        
    }
    for each in tmp:
        params[each] = tmp[each]    
    mean, std,_ = validation_score(params)
    return -1* mean

In [22]:
space = {
    'class_weight': hp.choice('class_weight', [None, 'balanced']),
#     'boosting_type': hp.choice('boosting_type',
#                                    [{'boosting_type': 'gbdt',
#                                      },
#                                     {'boosting_type': 'dart',
#                                      },
#                                     {'boosting_type': 'goss'}]),
    'num_leaves': hp.quniform('num_leaves', 30, 150, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 300000, 20000),
    'feature_fraction': hp.uniform('feature_fraction', 0.5, 1),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1),
    'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', 0, 6, 1),
    'lambda_l1': hp.choice('lambda_l1', [0, hp.loguniform('lambda_l1_positive', -16, 2)]),
    'lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('lambda_l2_positive', -16, 2)]),
    'verbose': -1,
    'min_child_weight': hp.loguniform('min_child_weight', -16, 5)
}

In [23]:
fmin(fn=f,space=space,algo=tpe.suggest,max_evals=1000,verbose= 1)

100%|██████████| 1000/1000 [1:16:35<00:00,  4.60s/trial, best loss: -0.9579768293237055]


{'bagging_fraction': 0.7528277381788788,
 'class_weight': 0,
 'feature_fraction': 0.5625261317624038,
 'lambda_l1': 0,
 'lambda_l2': 1,
 'lambda_l2_positive': 0.01363014388342585,
 'learning_rate': 0.11231814419878085,
 'min_child_weight': 20.69211631593017,
 'min_data_in_leaf': 5.0,
 'num_leaves': 86.0,
 'subsample_for_bin': 60000.0}

In [25]:
# all the features
params = {
    'bagging_fraction': 0.7528277381788788,
    'class_weight': None,
    'feature_fraction': 0.5625261317624038,
    'lambda_l2_positive': 0.01363014388342585,
    'learning_rate': 0.11231814419878085,
    'min_child_weight': 20.69211631593017,
    'min_data_in_leaf': 5,
    'num_leaves': 86,
    'subsample_for_bin': 60000
}
mean, std, preds = validation_score(params,True)
mean, std

(0.9562087432691735, 0.0044215047150157)

In [12]:
# catboost features
params = {
    'bagging_fraction': 0.9582184397618998,
    'class_weight': 'balanced',
    'feature_fraction': 0.6584730253641345,
    'lambda_l1_positive': 0.004131014051823846,
    'lambda_l2_positive': 2.021463074958273,
    'learning_rate': 0.1292032123987036,
    'min_child_weight': 27.329223863721854,
    'min_data_in_leaf': 3,
    'num_leaves': 120,
    'subsample_for_bin': 20000
}
mean, std, preds = validation_score(params,True)
mean, std

(0.9555603390956753, 0.00552089592647917)

In [140]:
submission_df['prediction'] = np.where(preds > 2, 1, 0)
submission_df.to_csv('submission.csv',index=False)

In [9]:
features = [
    'meter_waiting', 
    'meter_waiting_fare',
    'fare',
    'predicted_fare',
    'predicted_fare_diff',
    'predicted_fare_diff_per_fare',
    'predicted_fare_diff_per_predicted_fare', 
    'fare_per_distance',
    'predicted_fare_per_distance', 
    'predicted_fare_diff_per_distance',
    'predicted_duration',
    'predicted_duration_diff', 
    'predicted_duraton_diff_per_duraton',
    'predicted_duraton_diff_per_predicted_duration', 
    'fare_per_duration',
    'predicted_fare_per_duration', 
    'predicted_fare_per_duration_diff',
    'avg_speed', 
    'predicted_avg_speed', 
    'predicted_avg_speed_diff',
    'predicted_meter_waiting', 
    'predicted_meter_waiting_diff',
    'predicted_meter_waiting_diff_per_meter_waiting',
    'predicted_meter_waiting_diff_per_predicted_meter_waiting',
    'meter_waiting_per_duration', 
    'predicted_meter_waiting_per_duration',
    'predicted_meter_waiting_per_duration_diff',
    'predicted_meter_waiting_fare', 
    'predicted_meter_waiting_fare_diff',
    'predicted_meter_waiting_fare_diff_per_meter_waiting_fare',
    'predicted_meter_waiting_fare_diff_per_predicted_meter_waiting_fare',
    'meter_waiting_fare_per_meter_waiting',
    'predicted_meter_waiting_fare_per_meter_waiting',
    'predicted_meter_waiting_fare_per_meter_waiting_diff',
    'meter_waiting_fare_per_duration',
    'predicted_meter_waiting_fare_per_duration',
    'predicted_meter_waiting_fare_per_duration_diff',
    'predicted_additional_fare', 
    'predicted_additional_fare_diff',
    'predicted_additional_fare_diff_per_additional_fare',
    'predicted_addtional_fare_per_fare', 
    'addtional_fare_per_fare',
    'addtional_fare_per_distance', 
    'predicted_addtional_fare_per_distance',
    'addtional_fare_per_duration', 
    'predicted_addtional_fare_per_duration',
]
train_X = train_df[features]
test_X = test_df[features]

In [26]:
validation_scores = []
folds = 3
skf = StratifiedKFold(n_splits=folds)
test_preds = np.zeros(test_X.shape[0])
train_preds = np.zeros(train_X.shape[0])
for train_index, test_index in skf.split(train_df, y):
    X_train, X_test = train_X.iloc[train_index,:], train_X.iloc[test_index,:]
    y_train, y_test = y[train_index], y[test_index]
    train = lgb.Dataset(X_train,y_train)
    valid = lgb.Dataset(X_test,y_test)
    evals_result = {}
    model = lgb.train(params, train,num_boost_round=1000,early_stopping_rounds=50, valid_sets=valid,feval=lgb_f1_score, evals_result=evals_result,verbose_eval=False)
    validation_scores.append(f1_score(y_test,np.round(model.predict(X_test)),average='micro'))
    
    test_preds += model.predict(test_X)
    train_preds += model.predict(train_X)

In [14]:
stacking_train_df = pd.read_csv('stacking_train_df.csv')
stacking_test_df = pd.read_csv('stacking_test_df.csv')

In [27]:
stacking_train_df['light_gbm_base'] = train_preds
stacking_test_df['light_gbm_base'] = test_preds

In [28]:
stacking_test_df.to_csv('stacking_test_df.csv',index=False)
stacking_train_df.to_csv('stacking_train_df.csv',index=False)