# Stacking, Ensembling and Voting Techniques

To improve our competition score, this notebook develops more advanced techniques based on stacking, ensembling and voting, which combine multiple learners to boost our predictions.

In [1]:
#!pip install scikit-learn==0.20.3
#!pip install catboost

In [1]:
import pandas as pd
import numpy as np
import warnings
import pickle


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedShuffleSplit
from sklearn.metrics import f1_score, r2_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")


%load_ext autoreload
%autoreload 2

## Train/Test Splits

In [2]:
# Load the preprocessed sequence dataset (opened read-only) and show its dimensions.
data_sequence = pd.read_hdf(
    '../data/preprocessed/data_sequence_alldata.hdf',
    key='final_alldata',
    mode='r',
)
data_sequence.shape

(167559, 1183)

In [3]:
#data_sequence = data_sequence.replace([np.inf, -np.inf], np.nan)

In [4]:
# Load the pickled hash collections used to tell train rows from test rows.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('../data/preprocessed/hashs_train.pkl', 'rb') as train_fp, \
        open('../data/preprocessed/hashs_test.pkl', 'rb') as test_fp:
    hashs_train = pickle.load(train_fp)
    hashs_test = pickle.load(test_fp)

In [5]:
# Split rows by whether their `hash` appears in the train or the test hash list.
in_train = data_sequence.hash.isin(hashs_train)
in_test = data_sequence.hash.isin(hashs_test)

train = data_sequence.loc[in_train]
test = data_sequence.loc[in_test]

In [6]:
# Suffix of the window whose exit is being predicted.
window_reference = 5

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('../data/preprocessed/original_cols_sequence.pkl', 'rb') as fp:
    original_cols = pickle.load(fp)

# Base column names (without the window suffix) that must not be used as
# features for the reference window.
drop_cols = [x for x in original_cols if 'exit' in x]  # + grid_cols
drop_cols += ['lat_lon_entry', 'lat_lon_exit']
drop_cols += ['euclidean_distance', 'manhattan_distance', 'harvesine_distance',
              'center_permanency', 'crossed_city', 'velocity', 'leaving_city', 'entering_city']

# Attach the reference-window suffix to every base name.
drop_cols = [f'{col}_{window_reference}' for col in drop_cols]
#drop_cols += [col+'_'+str(i) for col in grid_cols for i in range(0, 5)]

# Also drop any remaining column describing the exit of the reference window
# (the target is one of them). The suffix follows `window_reference` instead
# of a hard-coded 5.
exit_suffix = f'exit_{window_reference}'
drop_cols += [col for col in train.columns if col.endswith(exit_suffix)]
drop_cols += ['hash', f'delta_last_center_permanency_{window_reference}', f'delta_origin_center_permanency_{window_reference}']

# Preserve the DataFrame's own column order. A plain set difference yields an
# order that can change between interpreter runs, which makes the feature
# matrix (and anything fitted on it) non-reproducible.
excluded_cols = set(drop_cols)
features = [col for col in train.columns if col not in excluded_cols]
target   = [f'is_inside_city_exit_{window_reference}']

In [7]:
# Hold out 25% of the training rows with `hour_exit_5 >= 15`.
# Note: the same two names are reassigned by a later split with a different seed.
late_rows = train[train.hour_exit_5 >= 15]
train_train, train_val = train_test_split(
    late_rows,
    test_size=0.25,
    random_state=423,
)
train_train.shape, train_val.shape

((100513, 1183), (33505, 1183))

## Defining Models

In [8]:
# Base learners combined by the ensembling / stacking sections below.

# Logistic Regression
# The L1 penalty needs a solver that supports it, so 'liblinear' is named
# explicitly instead of relying on the library default (which differs between
# scikit-learn releases). n_jobs is omitted because liblinear ignores it.
model    = LogisticRegression(random_state=20, penalty='l1', C=0.001, solver='liblinear')
imputer  = SimpleImputer(strategy='constant', fill_value=0)
lr_pipe  = Pipeline(steps=[('imputer', imputer), ('scaler', MinMaxScaler()), ('model', model)])

# LightGBM

lgbm_lowdepth = LGBMClassifier(**{'boosting_type': 'gbdt', 'colsample_bytree': 1.0, 'is_unbalance': False, 
                 'max_depth': 7,  'n_estimators': 150, 'num_leaves': 31, 
                 'objective': 'binary', 'random_state': 20, 'reg_alpha': 1, 'subsample': 0.7})

lgbm_highdepth = LGBMClassifier(n_estimators=100, max_depth=-1)

# Random Forest with low depth
# n_jobs=-1 builds the 200 trees in parallel (as the commented-out high-depth
# variant does); with a fixed random_state the fitted forest is unchanged.
rf_lowdepth     = RandomForestClassifier(max_depth=7, max_features='sqrt', n_estimators=200, n_jobs=-1, random_state=20)
imputer  = SimpleImputer(strategy='constant', fill_value=0)
rf_pipe_lowdepth  = Pipeline(steps=[('imputer', imputer), ('model', rf_lowdepth)])

# Random Forest with high depth
#rf_highdepth     = RandomForestClassifier(max_depth=None, max_features=0.75, n_estimators=200, n_jobs=-1, random_state=20)
#imputer  = SimpleImputer(strategy='constant', fill_value=0)
#rf_pipe_highdepth  = Pipeline(steps=[('imputer', imputer), ('model', rf_highdepth)])

# CatBoost
cbc = CatBoostClassifier(verbose=0)

# KNN
#knn      = KNeighborsClassifier(n_neighbors=7, n_jobs=-1)
#imputer  = SimpleImputer(strategy='constant', fill_value=0)
#knn_pipe = Pipeline(steps=[('imputer', imputer), ('model', knn)])

## Ensembling

### Simple Hold Out Scheme

In [9]:
# Hold-out split used by the rest of the notebook: 75% train / 25% validation,
# restricted to rows with `hour_exit_5 >= 15`.
holdout_mask = train.hour_exit_5 >= 15
train_train, train_val = train_test_split(
    train[holdout_mask], test_size=0.25, random_state=20
)
train_train.shape, train_val.shape

((100513, 1183), (33505, 1183))

In [10]:
# Carve train_train into three parts: C is 10% of it, B is 10% of the
# remainder, and A is everything left.
train_ab, train_c = train_test_split(
    train_train, test_size=0.1, random_state=20
)
train_a, train_b = train_test_split(
    train_ab, test_size=0.1, random_state=20
)

tuple(part.shape for part in (train_a, train_b, train_c, train_val))

((81414, 1183), (9047, 1183), (10052, 1183), (33505, 1183))

#### Fitting N diverse models on partA

In [11]:
# Fit the logistic-regression pipeline on part A, then predict labels for
# part B, part C and the validation split (in that order).
lr_pipe.fit(train_a[features].values, train_a[target])

lr_preds_b, lr_preds_c, lr_preds_val = (
    lr_pipe.predict(part[features].values)
    for part in (train_b, train_c, train_val)
)

In [12]:
# Fit the low-depth LightGBM on part A, then predict labels for part B,
# part C and the validation split (in that order).
lgbm_lowdepth.fit(train_a[features].values, train_a[target])

lgbm_lowdepth_b, lgbm_lowdepth_c, lgbm_lowdepth_val = (
    lgbm_lowdepth.predict(part[features].values)
    for part in (train_b, train_c, train_val)
)

In [13]:
# Fit the unrestricted-depth LightGBM on part A, then predict labels for
# part B, part C and the validation split (in that order).
lgbm_highdepth.fit(train_a[features].values, train_a[target])

lgbm_highdepth_b, lgbm_highdepth_c, lgbm_highdepth_val = (
    lgbm_highdepth.predict(part[features].values)
    for part in (train_b, train_c, train_val)
)

In [14]:
# Fit the low-depth random-forest pipeline on part A, then predict labels for
# part B, part C and the validation split (in that order).
# These predictions are not included in the meta-feature matrices built later.
rf_pipe_lowdepth.fit(train_a[features].values, train_a[target])

rf_pipe_lowdepth_b, rf_pipe_lowdepth_c, rf_pipe_lowdepth_val = (
    rf_pipe_lowdepth.predict(part[features].values)
    for part in (train_b, train_c, train_val)
)

KeyboardInterrupt: 

In [None]:
#rf_pipe_highdepth.fit(train_a[features].values, train_a[target])
#
#rf_pipe_highdepth_b = rf_pipe_highdepth.predict(train_b[features].values)
#rf_pipe_highdepth_c = rf_pipe_highdepth.predict(train_c[features].values)
#rf_pipe_highdepth_val = rf_pipe_highdepth.predict(train_val[features].values)

In [None]:
#knn_pipe.fit(train_a[features].values, train_a[target])
#
#knn_pipe_b = knn_pipe.predict(train_b[features].values)
#knn_pipe_c = knn_pipe.predict(train_c[features].values)
#knn_pipe_val = knn_pipe.predict(train_val[features].values)

In [15]:
# Fit CatBoost on part A, then predict labels for part B, part C and the
# validation split (in that order).
cbc.fit(train_a[features].values, train_a[target])

cbc_b, cbc_c, cbc_val = (
    cbc.predict(part[features].values)
    for part in (train_b, train_c, train_val)
)

In [16]:
# Meta-feature matrices: one column per base model, in the order
# [logistic regression, low-depth LightGBM, high-depth LightGBM, CatBoost].
part_b_meta = np.column_stack(
    (lr_preds_b, lgbm_lowdepth_b, lgbm_highdepth_b, cbc_b)
)
part_c_meta = np.column_stack(
    (lr_preds_c, lgbm_lowdepth_c, lgbm_highdepth_c, cbc_c)
)
part_val_meta = np.column_stack(
    (lr_preds_val, lgbm_lowdepth_val, lgbm_highdepth_val, cbc_val)
)

## Simple convex mix

In [17]:
# Stack parts B and C row-wise: meta-features and their matching targets.
part_bc_meta = np.vstack((part_b_meta, part_c_meta))
y_bc_meta = pd.concat([train_b[target], train_c[target]])

In [None]:
# Grid-search the convex-mix weight `alpha` between meta-feature column 0
# (logistic regression) and column 1 (low-depth LightGBM) on parts B+C,
# keeping the alpha with the highest F1 score.
# NOTE(review): the meta-features come from `.predict`, i.e. class labels
# rather than probabilities, so many alphas produce identical rounded blends.
alphas_to_try = np.linspace(0, 1, 1001)

# Start from the first candidate so the result is always defined, even when
# no alpha reaches a strictly positive F1 score.
bs_alpha = alphas_to_try[0]
max_score = 0
for alpha in alphas_to_try:
    result = np.round(alpha*part_bc_meta[:, 0] + (1-alpha)*part_bc_meta[:, 1])
    score = f1_score(y_bc_meta, result)

    # Strict comparison keeps the first alpha that reaches the best score.
    if score > max_score:
        max_score = score
        bs_alpha = alpha


best_alpha = bs_alpha
# Despite its name, this variable holds an F1 score (name kept for compatibility).
r2_train_simple_mix = max_score

print('Best alpha: %f; Corresponding F1 score on train: %f' % (best_alpha, r2_train_simple_mix))

In [None]:
# Apply the tuned weight to the validation meta-features.
# `best_alpha` was tuned on columns 0 and 1 (logistic regression and low-depth
# LightGBM), so the same two columns must be blended here. Previously columns
# 1 and 2 were used, which applied the weight to a different pair of models.
test_preds = np.round(best_alpha*part_val_meta[:, 0] + (1-best_alpha)*part_val_meta[:, 1])
f1_simple_mix = f1_score(train_val[target], test_preds)

print('Test F-1 for simple mix is %f' % f1_simple_mix)

## Stacking for HoldOut Scheme

In [18]:
# Meta-learner: fit a logistic regression on part B's meta-features and
# score its predictions on part C.
stack = LogisticRegression(C=0.001).fit(part_b_meta, train_b[target])

preds_meta_c = stack.predict(part_c_meta)
f1_score(y_true=train_c[target], y_pred=preds_meta_c)

0.8922902494331066

In [19]:
# Refit the meta-learner on parts B+C, then predict the validation split.
val_meta_prediction = stack.fit(part_bc_meta, y_bc_meta).predict(part_val_meta)

In [20]:
# F1 of the stacked predictions on the validation split.
f1_score(y_true=train_val[target], y_pred=val_meta_prediction)

0.8910835214446952

### Submission

In [21]:
# Read the raw test file and keep only rows whose `x_exit` is missing, then
# attach each row's `trajectory_id` to the preprocessed test rows via `hash`.
ids = (
    pd.read_csv('../data/raw/data_test.zip', index_col='Unnamed: 0', low_memory=True)
    .loc[lambda frame: frame.x_exit.isnull()]
)

data_test = pd.merge(test, ids[['hash', 'trajectory_id']], on='hash')
data_test.shape

(33515, 1184)

In [22]:
# Refit every base learner on the full train_train split and predict the
# submission rows. Plain arrays (`.values`) are used, matching how the same
# models are fitted and applied in the other cells of this notebook.
X_train_full = train_train[features].values
y_train_full = train_train[target]
X_submission = data_test[features].values

# Same fitting order as before: logistic regression, both LightGBMs, CatBoost.
# Disabled learners, kept for reference:
#rf_pipe_lowdepth.fit(train_train[features], train_train[target])
#rf_pipe_highdepth.fit(train_train[features], train_train[target])
#knn_pipe.fit(train_train[features], train_train[target])
for base_model in (lr_pipe, lgbm_lowdepth, lgbm_highdepth, cbc):
    base_model.fit(X_train_full, y_train_full)


lr_sub_meta       = lr_pipe.predict(X_submission)
#rf_low_sub_meta   = rf_pipe_lowdepth.predict(data_test[features])
#rf_high_sub_meta  = rf_pipe_highdepth.predict(data_test[features])
lgbm_low_sub_meta = lgbm_lowdepth.predict(X_submission)
lgbm_high_sub_meta  = lgbm_highdepth.predict(X_submission)
cbc_sub_meta = cbc.predict(X_submission)
#knn_sub_meta = knn_pipe.predict(data_test[features])

In [23]:
# Rebuild the B+C meta set, extend it with the validation split, and fit the
# final meta-learner on everything (B + C + validation).
part_bc_meta = np.vstack((part_b_meta, part_c_meta))
y_bc_meta = pd.concat([train_b[target], train_c[target]])

part_bcval_meta = np.vstack((part_b_meta, part_c_meta, part_val_meta))
y_bcval_meta = pd.concat([train_b[target], train_c[target], train_val[target]])

stack.fit(part_bcval_meta, y_bcval_meta)

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [24]:
# Assemble the submission meta-features (same column order as the training
# meta-features) and let the meta-learner produce the final labels.
sub_meta = np.column_stack(
    (lr_sub_meta, lgbm_low_sub_meta, lgbm_high_sub_meta, cbc_sub_meta)
)
yhat = stack.predict(sub_meta)

In [25]:
# Count how many submission rows were assigned to each predicted class.
yhat_counts = pd.Series(yhat).value_counts()
yhat_counts

0.0    25110
1.0     8405
dtype: int64

In [26]:
# Pair each trajectory id with its predicted label and write the submission file.
submission = pd.DataFrame({
    'id': data_test['trajectory_id'].values,
    'target': yhat,
})
submission.to_csv('../data/submission_victor_stack5.csv', index=False)

In [None]:
# submission_victor_stack1 | local: 0.7460367786937224 | public: 0.8941

### Test meta-features

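First, we fit each model on the training split (`train_train`) and collect its predictions on the held-out validation split (`train_val`), which plays the role of the test set here.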

In [28]:
# Logistic-regression pipeline: fit on train_train, score F1 on train_val.
lr_preds = (
    lr_pipe
    .fit(train_train[features].values, train_train[target])
    .predict(train_val[features].values)
)

f1_score(y_true=train_val[target], y_pred=lr_preds)

0.7180858550316678

In [30]:
# LightGBM: fit on train_train, score F1 on train_val.
# The name `lgbm` is never defined in this notebook, so the cell failed on a
# fresh kernel; the low-depth LightGBM defined earlier is used instead.
# NOTE(review): confirm low-depth (not high-depth) is the intended model.
lgbm_lowdepth.fit(train_train[features].values, train_train[target])
lgbm_preds = lgbm_lowdepth.predict(train_val[features].values)

f1_score(train_val[target], lgbm_preds)

0.7493638676844784

In [31]:
# Random forest: fit on train_train, score F1 on train_val.
# The name `rf_pipe` is never defined in this notebook, so the cell failed on
# a fresh kernel; the low-depth random-forest pipeline (the only one that is
# not commented out) is used instead.
rf_pipe_lowdepth.fit(train_train[features].values, train_train[target])
rf_preds = rf_pipe_lowdepth.predict(train_val[features].values)

f1_score(train_val[target], rf_preds)

0.7058452812352294

In [32]:
# CatBoost: fit on train_train, score F1 on train_val.
X_fit, y_fit = train_train[features].values, train_train[target]
cbc.fit(X_fit, y_fit)

cbc_preds = cbc.predict(train_val[features].values)

f1_score(y_true=train_val[target], y_pred=cbc_preds)

0.7463117382937781

In [33]:
# One column per base model: logistic regression, LightGBM, random forest, CatBoost.
test_meta_features = np.column_stack(
    (lr_preds, lgbm_preds, rf_preds, cbc_preds)
)

### Train meta-features

In [None]:
# Consecutive hour values 12 through 15 (inclusive).
hour_block = list(range(12, 16))