In [1]:
import os
import joblib

import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier

import shap
import optuna
import category_encoders as ce
import optuna.visualization as vis

import xgboost as xgb
import lightgbm as lgb
import catboost as cat

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

In [10]:
# Read the cleaned train / test tables (same relative paths as before).
train, test = map(
    pd.read_csv,
    ('../data/clean_train4.csv', '../data/clean_test4.csv'),
)

In [11]:
# `DataFrame.info()` writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
train.info()
train.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2802 entries, 0 to 2801
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  2802 non-null   int64  
 1   job_type             2802 non-null   int64  
 2   marital_status       2802 non-null   int64  
 3   education_level      2802 non-null   int64  
 4   balance_amt          2802 non-null   float64
 5   household_insurance  2802 non-null   int64  
 6   car_loan             2802 non-null   int64  
 7   communication        2802 non-null   int64  
 8   last_contact_day     2802 non-null   int64  
 9   last_contact_month   2802 non-null   int64  
 10  no_of_contacts       2802 non-null   int64  
 11  prev_attempts        2802 non-null   int64  
 12  Outcome              2802 non-null   int64  
 13  car_insurance        2802 non-null   int64  
 14  call_duration        2802 non-null   float64
 15  zero_neg_balance     2802 non-null   i

Unnamed: 0,age,job_type,marital_status,education_level,balance_amt,household_insurance,car_loan,communication,last_contact_day,last_contact_month,no_of_contacts,prev_attempts,Outcome,car_insurance,call_duration,zero_neg_balance
2797,41,4,1,1,2753.566864,1,0,0,18,0,1,0,3,1,971.583333,0
2798,30,1,2,1,315.0,0,0,0,22,6,1,0,3,1,23.45,0
2799,32,1,1,0,3136.0,1,0,0,18,9,1,0,3,1,13.666667,0
2800,31,1,1,1,0.0,0,0,1,22,5,1,0,3,0,0.6,1
2801,61,4,2,2,2.0,0,0,0,11,1,1,3,0,1,2.183333,0


In [12]:
# Column overview of the test frame (the stored output lists 15 columns and no
# `car_insurance` target).
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935 entries, 0 to 934
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  935 non-null    int64  
 1   job_type             935 non-null    int64  
 2   marital_status       935 non-null    int64  
 3   education_level      935 non-null    int64  
 4   balance_amt          935 non-null    float64
 5   household_insurance  935 non-null    int64  
 6   car_loan             935 non-null    int64  
 7   communication        935 non-null    int64  
 8   last_contact_day     935 non-null    int64  
 9   last_contact_month   935 non-null    int64  
 10  no_of_contacts       935 non-null    int64  
 11  prev_attempts        935 non-null    int64  
 12  Outcome              935 non-null    int64  
 13  call_duration        935 non-null    float64
 14  zero_neg_balance     935 non-null    int64  
dtypes: float64(2), int64(13)
memory usage: 1

In [13]:
# Target first, then every remaining column as the feature matrix.
y = train['car_insurance']
X = train.drop(columns=['car_insurance'])

In [93]:
def cross_validate(model, X, y, n_folds=10, use_eval_set=False):
    """Return the mean K-fold accuracy of `model` on (X, y).

    The same estimator object is re-fitted on every fold of an unshuffled
    `KFold(n_splits=n_folds)` split of `X`.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with `.iloc`.
    y : pandas.Series or array-like
        Labels aligned row-for-row with `X`.
    n_folds : int, default 10
        Number of folds.
    use_eval_set : bool, default False
        When True, the fold's validation split is forwarded to `fit` as
        `eval_set=[(valid_x, valid_y)]` (for estimators configured with early
        stopping). The score of that fold is then optimistic, because the
        same rows drive the stopping decision.

    Returns
    -------
    float
        Mean accuracy over the folds.
    """
    scores = []
    kf = KFold(n_splits=n_folds)

    # KFold yields integer *positions*. Plain `y[positions]` on a Series is a
    # label lookup, which only coincides with positions for a default
    # RangeIndex; go through `.iloc` (or a numpy view) instead.
    y_at = y.iloc if hasattr(y, 'iloc') else np.asarray(y)

    for train_index, valid_index in tqdm(kf.split(X), total=n_folds):
        train_x, valid_x = X.iloc[train_index], X.iloc[valid_index]
        train_y, valid_y = y_at[train_index], y_at[valid_index]

        fit_kwargs = {'eval_set': [(valid_x, valid_y)]} if use_eval_set else {}
        model.fit(train_x, train_y, **fit_kwargs)

        scores.append(accuracy_score(valid_y, model.predict(valid_x)))

    return np.mean(scores)

In [101]:
def get_kfold_prediction(model, X, y, test_data, n_folds=10, use_eval_set=False):
    """Fit `model` on each K-fold split and average its test-set probabilities.

    For every fold of an unshuffled `KFold(n_splits=n_folds)` split the model
    is re-fitted on the training part, scored (accuracy) on the validation
    part, and asked for `predict_proba(test_data)`. Column 1 of those
    probabilities is averaged over the folds. Per-fold and mean accuracy are
    printed.

    Parameters
    ----------
    model : estimator exposing `fit`, `predict` and `predict_proba`.
    X : pandas.DataFrame
        Training features; rows are selected positionally with `.iloc`.
    y : pandas.Series or array-like
        Training labels aligned row-for-row with `X`.
    test_data : array-like with a `.shape`
        Rows to predict; must be accepted by `model.predict_proba`.
    n_folds : int, default 10
        Number of folds.
    use_eval_set : bool, default False
        When True, the fold's validation split is forwarded to `fit` as
        `eval_set=[(valid_x, valid_y)]` (for estimators configured with early
        stopping).

    Returns
    -------
    numpy.ndarray
        One fold-averaged probability per row of `test_data`. Note these are
        test-set averages, not out-of-fold predictions for `X`.
    """
    scores = []
    test_probs = np.zeros(test_data.shape[0])
    kf = KFold(n_splits=n_folds)

    # KFold yields integer *positions*. Plain `y[positions]` on a Series is a
    # label lookup, which only coincides with positions for a default
    # RangeIndex; go through `.iloc` (or a numpy view) instead.
    y_at = y.iloc if hasattr(y, 'iloc') else np.asarray(y)

    for fold, (train_index, valid_index) in tqdm(enumerate(kf.split(X)), total=n_folds):
        train_x, valid_x = X.iloc[train_index], X.iloc[valid_index]
        train_y, valid_y = y_at[train_index], y_at[valid_index]

        fit_kwargs = {'eval_set': [(valid_x, valid_y)]} if use_eval_set else {}
        model.fit(train_x, train_y, **fit_kwargs)

        # change metric according to requirement
        score = accuracy_score(valid_y, model.predict(valid_x))
        scores.append(score)

        # Each fold contributes 1/n_folds of the final averaged probability.
        test_probs += model.predict_proba(test_data)[:, 1] / n_folds

        print(f"Fold {fold + 1}: {score:.6f}")

    print(f'Mean Score: {np.mean(scores)}')

    return test_probs

In [16]:
# Keyword arguments for cat.CatBoostClassifier.
# Commented-out alternatives kept for reference: reg_lambda=2,
# bagging_temperature=0.5, random_strength=2.
cat_params = dict(
    eval_metric='Accuracy',
    n_estimators=2000,
    learning_rate=0.005,
    max_depth=5,
    # Column positions in X; per the train.info() listing these are job_type,
    # marital_status, education_level, last_contact_day, last_contact_month
    # and Outcome.
    cat_features=[1, 2, 3, 8, 9, 12],
    early_stopping_rounds=300,
    subsample=0.8,
    colsample_bylevel=0.8,
    one_hot_max_size=4,
    use_best_model=True,
    verbose=0,
    random_state=1234,
    thread_count=-1,
)

In [17]:
# 10-fold accuracy for CatBoost.
# NOTE(review): cat_params enables use_best_model / early_stopping_rounds, but
# cross_validate calls fit() without an eval_set - confirm CatBoost accepts
# that. The stored per-fold lines below also do not match the current
# cross_validate (its print calls are commented out), so this output looks
# like it came from an earlier version of the helper.
cat_model = cat.CatBoostClassifier(**cat_params)
cross_validate(cat_model, X, y, 10)

  0%|          | 0/10 [00:00<?, ?it/s]

Fold 1: 0.882562
Fold 2: 0.857651
Fold 3: 0.885714
Fold 4: 0.842857
Fold 5: 0.885714
Fold 6: 0.896429
Fold 7: 0.875000
Fold 8: 0.892857
Fold 9: 0.857143
Fold 10: 0.864286
Mean Score: 0.8740213523131672


In [29]:
# Fold-averaged positive-class probabilities for the test rows (CatBoost).
cat_probs = get_kfold_prediction(cat_model, X, y, test)

  0%|          | 0/10 [00:00<?, ?it/s]

Fold 1: 0.882562
Fold 2: 0.857651
Fold 3: 0.885714
Fold 4: 0.842857
Fold 5: 0.885714
Fold 6: 0.896429
Fold 7: 0.875000
Fold 8: 0.892857
Fold 9: 0.857143
Fold 10: 0.864286
Mean Score: 0.8740213523131672


In [32]:
# Decision threshold for CatBoost (hand-picked; slightly above 0.5).
CAT_THRESHOLD = 0.52

# Single comparison instead of two chained in-place masks whose second step
# (`< 1`) only worked because the first had already written the 1s.
# Kept as float so np.unique / astype(int) downstream behave as before.
cat_preds = (cat_probs > CAT_THRESHOLD).astype(float)

In [33]:
# Sanity check: one prediction per test row, and the class balance of the labels.
print(cat_preds.shape)
np.unique(cat_preds, return_counts=True)

(935,)


(array([0., 1.]), array([405, 530]))

In [34]:
# Submission frame: a single integer `prediction` column from the CatBoost labels.
sub = pd.DataFrame(cat_preds.astype(int), columns=['prediction'])
print(sub.shape)
sub.head()

(935, 1)


Unnamed: 0,prediction
0,1
1,0
2,1
3,1
4,0


In [35]:
# Write the current `sub` frame as the CatBoost submission (no index column).
sub.to_csv('../submissions/catboost4.csv', index=False)

In [75]:
# Candidate decision thresholds for an accuracy-vs-threshold sweep.
threshold = np.linspace(0, 1, 100)

# TODO: the sweep itself was never written. The cell ended in a dangling
# `acc =` and an empty `for th in threshold:` loop, which is a SyntaxError and
# stops Restart & Run All. Scoring each threshold needs probabilities for rows
# with known labels (e.g. out-of-fold predictions on `train`); the `*_probs`
# arrays in this notebook cover `test`, which has no `car_insurance` column.

In [74]:
# NOTE(review): the stored output shows 50 points on [0, 0.5], which does not
# match `np.linspace(0, 1, 100)` - stale output from an earlier run; re-execute.
threshold

array([0.        , 0.01020408, 0.02040816, 0.03061224, 0.04081633,
       0.05102041, 0.06122449, 0.07142857, 0.08163265, 0.09183673,
       0.10204082, 0.1122449 , 0.12244898, 0.13265306, 0.14285714,
       0.15306122, 0.16326531, 0.17346939, 0.18367347, 0.19387755,
       0.20408163, 0.21428571, 0.2244898 , 0.23469388, 0.24489796,
       0.25510204, 0.26530612, 0.2755102 , 0.28571429, 0.29591837,
       0.30612245, 0.31632653, 0.32653061, 0.33673469, 0.34693878,
       0.35714286, 0.36734694, 0.37755102, 0.3877551 , 0.39795918,
       0.40816327, 0.41836735, 0.42857143, 0.43877551, 0.44897959,
       0.45918367, 0.46938776, 0.47959184, 0.48979592, 0.5       ])

In [69]:
# change the objective
#
# Keyword arguments for the xgboost sklearn wrapper. Options that are
# commented out / left at library defaults: booster, gamma, min_child_weight,
# colsample_bylevel, colsample_bynode, scale_pos_weight, importance_type,
# max_delta_step, base_score, tree_method, missing, num_parallel_tree,
# monotone_constraints, interaction_constraints.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=2000,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.7,
    colsample_bytree=0.8,
    reg_alpha=1,
    reg_lambda=2,
    early_stopping_rounds=100,
    n_jobs=-1,
    random_state=1234,
    verbosity=0,
)

In [70]:
# NOTE(review): XGBRFClassifier is xgboost's random-forest wrapper, while
# xgb_params (learning_rate, 2000 estimators, early_stopping_rounds) reads like
# a boosted-tree configuration - confirm xgb.XGBClassifier was not intended.
xgb_model = xgb.XGBRFClassifier(**xgb_params)

In [71]:
# 10-fold accuracy for the XGBoost model.
cross_validate(xgb_model, X, y, 10)

  0%|          | 0/10 [00:00<?, ?it/s]

Fold 1: 0.868327
Fold 2: 0.807829
Fold 3: 0.860714
Fold 4: 0.832143
Fold 5: 0.835714
Fold 6: 0.853571
Fold 7: 0.860714
Fold 8: 0.875000
Fold 9: 0.832143
Fold 10: 0.835714
Mean Score: 0.8461870869344178


In [72]:
# Fold-averaged positive-class probabilities for the test rows (XGBoost).
xgb_probs = get_kfold_prediction(xgb_model, X, y, test)

  0%|          | 0/10 [00:00<?, ?it/s]

Fold 1: 0.868327
Fold 2: 0.807829
Fold 3: 0.860714
Fold 4: 0.832143
Fold 5: 0.835714
Fold 6: 0.853571
Fold 7: 0.860714
Fold 8: 0.875000
Fold 9: 0.832143
Fold 10: 0.835714
Mean Score: 0.8461870869344178


In [74]:
# Binarise at 0.5 with one comparison. The previous pair of in-place masks
# (`> 0.5` then `< 0.5`) left a probability of exactly 0.5 untouched, so the
# array was not guaranteed to be strictly 0/1. Ties now map to 0, which is what
# the later `.astype(int)` produced anyway. Kept as float for np.unique.
xgb_preds = (xgb_probs > 0.5).astype(float)

np.unique(xgb_preds, return_counts=True)

(array([0., 1.]), array([384, 551]))

In [75]:
# Submission frame: a single integer `prediction` column from the XGBoost labels.
sub = pd.DataFrame(xgb_preds.astype(int), columns=['prediction'])
print(sub.shape)
sub.head()

(935, 1)


Unnamed: 0,prediction
0,1
1,1
2,1
3,1
4,0


In [76]:
# Write the current `sub` frame as the XGBoost submission (no index column).
sub.to_csv('../submissions/xgb1.csv', index=False)

In [36]:
# change the objective and metric
#
# Keyword arguments for lgb.LGBMClassifier. Commented-out alternatives kept
# for reference: eval_metric='logloss', cat_smooth=10.
lgb_params = dict(
    objective='binary',
    boosting_type='gbdt',
    n_estimators=10000,
    learning_rate=0.001,
    max_depth=7,
    num_leaves=64,
    max_bin=512,
    min_child_samples=32,
    subsample=0.75,
    reg_alpha=0.01,
    reg_lambda=0.5,
    early_stopping_round=300,
    # Column positions in X; per the train.info() listing these are job_type,
    # marital_status, education_level, last_contact_day, last_contact_month
    # and Outcome.
    categorical_feature=[1, 2, 3, 8, 9, 12],
    min_child_weight=0.01,
    colsample_bytree=0.8,
    colsample_bynode=0.8,
    max_cat_to_onehot=4,
    verbose=-1,
    seed=1234,
    n_jobs=-1,
)

In [37]:
# NOTE(review): lgb_params sets early_stopping_round, but the CV helpers call
# fit() without an eval_set - confirm LightGBM accepts that configuration.
lgb_model = lgb.LGBMClassifier(**lgb_params)

In [38]:
# Mean accuracy for LightGBM with the helper's default of 10 folds.
cross_validate(lgb_model, X, y)

  0%|          | 0/10 [00:00<?, ?it/s]

Fold 1: 0.868327
Fold 2: 0.836299
Fold 3: 0.875000
Fold 4: 0.835714
Fold 5: 0.875000
Fold 6: 0.878571
Fold 7: 0.864286
Fold 8: 0.896429
Fold 9: 0.846429
Fold 10: 0.850000
Mean Score: 0.8626054905948145


In [39]:
# Fold-averaged positive-class probabilities for the test rows (LightGBM).
lgb_probs = get_kfold_prediction(lgb_model, X, y, test)

  0%|          | 0/10 [00:00<?, ?it/s]

Fold 1: 0.868327
Fold 2: 0.836299
Fold 3: 0.875000
Fold 4: 0.835714
Fold 5: 0.875000
Fold 6: 0.878571
Fold 7: 0.864286
Fold 8: 0.896429
Fold 9: 0.846429
Fold 10: 0.850000
Mean Score: 0.8626054905948145


In [40]:
# Binarise at 0.5 with one comparison. The previous pair of in-place masks
# (`> 0.5` then `< 0.5`) left a probability of exactly 0.5 untouched, so the
# array was not guaranteed to be strictly 0/1. Ties now map to 0, which is what
# the later `.astype(int)` produced anyway. Kept as float for np.unique.
lgb_preds = (lgb_probs > 0.5).astype(float)

np.unique(lgb_preds, return_counts=True)

(array([0., 1.]), array([380, 555]))

In [41]:
# Submission frame: a single integer `prediction` column from the LightGBM labels.
sub = pd.DataFrame(lgb_preds.astype(int), columns=['prediction'])
print(sub.shape)
sub.head()

(935, 1)


Unnamed: 0,prediction
0,1
1,0
2,1
3,1
4,0


In [42]:
# Write the current `sub` frame as the LightGBM submission (no index column).
sub.to_csv('../submissions/lgb2.csv', index=False)

In [49]:
# Baseline sklearn gradient boosting with hand-picked settings (no tuning yet).
gbc = GradientBoostingClassifier(n_estimators=300, max_depth=6, random_state=1234)
# cross_validate(gbc, X, y)

In [50]:
# Fold-averaged positive-class probabilities for the test rows (baseline GBC).
gbc_probs = get_kfold_prediction(gbc, X, y, test)

  0%|          | 0/10 [00:00<?, ?it/s]

Fold 1: 0.864769
Fold 2: 0.850534
Fold 3: 0.871429
Fold 4: 0.832143
Fold 5: 0.875000
Fold 6: 0.878571
Fold 7: 0.885714
Fold 8: 0.889286
Fold 9: 0.839286
Fold 10: 0.864286
Mean Score: 0.8651016776817488


In [51]:
# Binarise at 0.5 with one comparison. The previous pair of in-place masks
# (`> 0.5` then `< 0.5`) left a probability of exactly 0.5 untouched, so the
# array was not guaranteed to be strictly 0/1. Ties now map to 0, which is what
# the later `.astype(int)` produced anyway. Kept as float for np.unique.
gbc_preds = (gbc_probs > 0.5).astype(float)

np.unique(gbc_preds, return_counts=True)

(array([0., 1.]), array([379, 556]))

In [52]:
# Submission frame for the baseline GradientBoostingClassifier.
# Fix: this cell previously reused `lgb_preds` (copy-paste from the LightGBM
# section), so the file written as gbc1.csv contained LightGBM predictions.
sub = pd.DataFrame({
    'prediction': gbc_preds.astype(int)
})
print(sub.shape)
sub.head()

(935, 1)


Unnamed: 0,prediction
0,1
1,0
2,1
3,1
4,0


In [54]:
# Write the current `sub` frame to the baseline-GBC submission file (no index column).
sub.to_csv('../submissions/gbc1.csv', index=False)

In [94]:
def objective(trial):
    """Optuna objective: mean CV accuracy of a GradientBoostingClassifier.

    Samples `n_estimators`, `max_depth`, `learning_rate`, `subsample` and
    `min_samples_split` from the trial, builds a scikit-learn
    `GradientBoostingClassifier` with a fixed `random_state`, and scores it
    with `cross_validate` on the notebook-level `X` / `y`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyper-parameters.

    Returns
    -------
    float
        Mean cross-validated accuracy (the study maximises this).
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 600, 1200),
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        # suggest_float is the current API for a uniform float range;
        # suggest_uniform is deprecated in Optuna.
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 0.06),
        'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 0.9, 1]),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 32),
        'verbose': 0,
        'random_state': 1234,
    }
    model = GradientBoostingClassifier(**param)

    return cross_validate(model, X, y)

In [95]:
# TPE is Optuna's default sampler; seeding it makes the 15-trial search (and
# therefore the best parameters used further down) repeatable across reruns.
gbc_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=1234),
)
gbc_study.optimize(objective, n_trials=15)

[32m[I 2021-04-25 16:04:18,391][0m A new study created in memory with name: no-name-f24d3608-171a-4745-b184-eae8b86c494f[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-04-25 16:05:17,997][0m Trial 0 finished with value: 0.8658185053380784 and parameters: {'n_estimators': 636, 'max_depth': 7, 'learning_rate': 0.04551108716002941, 'subsample': 1, 'min_samples_split': 6}. Best is trial 0 with value: 0.8658185053380784.[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-04-25 16:05:59,181][0m Trial 1 finished with value: 0.8626054905948145 and parameters: {'n_estimators': 659, 'max_depth': 6, 'learning_rate': 0.05409623116886003, 'subsample': 0.7, 'min_samples_split': 10}. Best is trial 0 with value: 0.8658185053380784.[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-04-25 16:07:35,385][0m Trial 2 finished with value: 0.8672458057956278 and parameters: {'n_estimators': 1001, 'max_depth': 8, 'learning_rate': 0.024469093487761818, 'subsample': 0.9, 'min_samples_split': 15}. Best is trial 2 with value: 0.8672458057956278.[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-04-25 16:08:53,671][0m Trial 3 finished with value: 0.8590327910523641 and parameters: {'n_estimators': 857, 'max_depth': 7, 'learning_rate': 0.038969290960244575, 'subsample': 1, 'min_samples_split': 8}. Best is trial 2 with value: 0.8672458057956278.[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-04-25 16:11:06,899][0m Trial 4 finished with value: 0.866166751398068 and parameters: {'n_estimators': 1163, 'max_depth': 12, 'learning_rate': 0.03575007033014354, 'subsample': 0.7, 'min_samples_split': 21}. Best is trial 2 with value: 0.8672458057956278.[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-04-25 16:12:21,914][0m Trial 5 finished with value: 0.8654588205388917 and parameters: {'n_estimators': 806, 'max_depth': 8, 'learning_rate': 0.05328489200164839, 'subsample': 0.7, 'min_samples_split': 4}. Best is trial 2 with value: 0.8672458057956278.[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-04-25 16:13:05,202][0m Trial 6 finished with value: 0.8643848500254194 and parameters: {'n_estimators': 792, 'max_depth': 6, 'learning_rate': 0.051197880069927884, 'subsample': 0.6, 'min_samples_split': 11}. Best is trial 2 with value: 0.8672458057956278.[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-04-25 16:15:00,668][0m Trial 7 finished with value: 0.8661731062531773 and parameters: {'n_estimators': 1145, 'max_depth': 9, 'learning_rate': 0.04876910065319573, 'subsample': 0.8, 'min_samples_split': 16}. Best is trial 2 with value: 0.8672458057956278.[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-04-25 16:15:46,485][0m Trial 8 finished with value: 0.8611718352821555 and parameters: {'n_estimators': 1007, 'max_depth': 5, 'learning_rate': 0.04197547591789275, 'subsample': 0.6, 'min_samples_split': 9}. Best is trial 2 with value: 0.8672458057956278.[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-04-25 16:17:21,000][0m Trial 9 finished with value: 0.8654524656837823 and parameters: {'n_estimators': 1046, 'max_depth': 8, 'learning_rate': 0.03645269822647203, 'subsample': 0.9, 'min_samples_split': 28}. Best is trial 2 with value: 0.8672458057956278.[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-04-25 16:19:26,592][0m Trial 10 finished with value: 0.8675965937976613 and parameters: {'n_estimators': 1001, 'max_depth': 11, 'learning_rate': 0.02301617857377725, 'subsample': 0.9, 'min_samples_split': 24}. Best is trial 10 with value: 0.8675965937976613.[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-04-25 16:21:29,298][0m Trial 11 finished with value: 0.8675978647686833 and parameters: {'n_estimators': 981, 'max_depth': 11, 'learning_rate': 0.021063159035295535, 'subsample': 0.9, 'min_samples_split': 25}. Best is trial 11 with value: 0.8675978647686833.[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-04-25 16:23:31,885][0m Trial 12 finished with value: 0.8668848500254193 and parameters: {'n_estimators': 938, 'max_depth': 12, 'learning_rate': 0.020702545099426428, 'subsample': 0.9, 'min_samples_split': 28}. Best is trial 11 with value: 0.8675978647686833.[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-04-25 16:25:37,716][0m Trial 13 finished with value: 0.868666751398068 and parameters: {'n_estimators': 1085, 'max_depth': 10, 'learning_rate': 0.028050703085566647, 'subsample': 0.9, 'min_samples_split': 22}. Best is trial 13 with value: 0.868666751398068.[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-04-25 16:27:47,744][0m Trial 14 finished with value: 0.867243263853584 and parameters: {'n_estimators': 1121, 'max_depth': 10, 'learning_rate': 0.02887093554431039, 'subsample': 0.9, 'min_samples_split': 22}. Best is trial 13 with value: 0.868666751398068.[0m


In [96]:
# Parallel-coordinate view of the sampled hyper-parameters against the objective value.
vis.plot_parallel_coordinate(gbc_study)

In [90]:
# Objective value per trial for the GBC study.
vis.plot_optimization_history(gbc_study)

In [91]:
# Estimated importance of each searched hyper-parameter for the GBC study.
vis.plot_param_importances(gbc_study)

In [98]:
# Fixed settings first, then the study's best hyper-parameters layered on top.
BEST_PARAMS = {'verbose': 0, 'random_state': 1234, **gbc_study.best_params}
BEST_PARAMS

{'verbose': 0,
 'random_state': 1234,
 'n_estimators': 1085,
 'max_depth': 10,
 'learning_rate': 0.028050703085566647,
 'subsample': 0.9,
 'min_samples_split': 22}

In [102]:
# Refit with the tuned settings; this overwrites the baseline `gbc_probs`.
tuned_gbc = GradientBoostingClassifier(**BEST_PARAMS)
gbc_probs = get_kfold_prediction(tuned_gbc, X, y, test)

  0%|          | 0/10 [00:00<?, ?it/s]

Fold 1: 0.886121
Fold 2: 0.846975
Fold 3: 0.878571
Fold 4: 0.821429
Fold 5: 0.892857
Fold 6: 0.882143
Fold 7: 0.878571
Fold 8: 0.900000
Fold 9: 0.835714
Fold 10: 0.864286
Mean Score: 0.868666751398068


In [103]:
# Binarise the tuned-GBC probabilities at 0.5 with one comparison. The previous
# pair of in-place masks (`> 0.5` then `< 0.5`) left a probability of exactly
# 0.5 untouched, so the array was not guaranteed to be strictly 0/1. Ties now
# map to 0, which is what the later `.astype(int)` produced anyway.
gbc_preds = (gbc_probs > 0.5).astype(float)

np.unique(gbc_preds, return_counts=True)

(array([0., 1.]), array([371, 564]))

In [108]:
# Submission frame: a single integer `prediction` column from the GBC labels.
sub = pd.DataFrame(gbc_preds.astype(int), columns=['prediction'])
print(sub.shape)
sub.head()

(935, 1)


Unnamed: 0,prediction
0,1
1,0
2,1
3,1
4,0


In [109]:
# Write the current `sub` frame as the tuned-GBC submission (no index column).
sub.to_csv('../submissions/gbc2.csv', index=False)

In [106]:
# Weighted blend of the two probability vectors (weights sum to 1).
LGB_WEIGHT, GBC_WEIGHT = 0.7, 0.3
ensemble_probs = LGB_WEIGHT * lgb_probs + GBC_WEIGHT * gbc_probs

In [107]:
# Binarise the blended probabilities at 0.5 with one comparison. The previous
# pair of in-place masks (`> 0.5` then `< 0.5`) left a probability of exactly
# 0.5 untouched, so the array was not guaranteed to be strictly 0/1. Ties now
# map to 0, which is what the later `.astype(int)` produced anyway.
ensemble_preds = (ensemble_probs > 0.5).astype(float)

np.unique(ensemble_preds, return_counts=True)

(array([0., 1.]), array([378, 557]))

In [112]:
# Submission frame: a single integer `prediction` column from the ensemble labels.
sub = pd.DataFrame(ensemble_preds.astype(int), columns=['prediction'])
print(sub.shape)
sub.head()

(935, 1)


Unnamed: 0,prediction
0,1
1,0
2,1
3,1
4,0


In [113]:
# Write the current `sub` frame as the ensemble submission (no index column).
sub.to_csv('../submissions/ensemble1.csv', index=False)

In [115]:
# Row count per `Outcome` code in the test set (every column repeats the same
# count because the frame has no missing values).
test.groupby('Outcome').count()

Unnamed: 0_level_0,age,job_type,marital_status,education_level,balance_amt,household_insurance,car_loan,communication,last_contact_day,last_contact_month,no_of_contacts,prev_attempts,call_duration,zero_neg_balance
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,96,96,96,96,96,96,96,96,96,96,96,96,96,96
1,32,32,32,32,32,32,32,32,32,32,32,32,32,32
2,112,112,112,112,112,112,112,112,112,112,112,112,112,112
3,695,695,695,695,695,695,695,695,695,695,695,695,695,695


In [132]:
# Copy of the test frame with the LightGBM labels attached as `prediction`.
test2 = test.assign(prediction=lgb_preds.astype(int))

In [133]:
# Inspect the test frame together with its attached `prediction` column.
test2

Unnamed: 0,age,job_type,marital_status,education_level,balance_amt,household_insurance,car_loan,communication,last_contact_day,last_contact_month,no_of_contacts,prev_attempts,Outcome,call_duration,zero_neg_balance,prediction
0,65,5,1,1,20806.0,0,0,1,7,0,2,0,3,8.716667,0,1
1,36,4,1,2,900.0,1,0,0,14,8,2,0,3,7.916667,0,0
2,37,4,1,2,6771.0,0,0,0,6,11,1,1,2,5.616667,0,1
3,30,4,2,2,0.0,0,0,0,13,8,2,0,3,974.450000,1,1
4,41,4,1,2,328.0,1,0,0,28,5,12,0,3,1.083333,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
930,31,1,1,1,152.0,1,1,0,7,5,1,0,3,13.850000,0,1
931,30,9,2,1,53.0,1,0,0,17,5,5,0,3,5.866667,0,0
932,42,4,1,2,36.0,0,0,0,20,9,3,0,3,11.066667,0,1
933,46,7,1,1,2948.0,0,0,0,20,9,1,0,3,14.883333,0,1


In [134]:
# How the predicted labels are distributed within each `Outcome` code.
test2.groupby(['Outcome', 'prediction'])['age'].count()

Outcome  prediction
0        0              38
         1              58
1        0              13
         1              19
2        0               2
         1             110
3        0             327
         1             368
Name: age, dtype: int64

In [135]:
# Manual override: rows with Outcome == 2 that were predicted 0 are forced to 1.
mask = test2['Outcome'].eq(2) & test2['prediction'].eq(0)
test2.loc[mask, 'prediction'] = 1

In [137]:
# One-column frame holding the manually adjusted predictions.
tuned_preds = test2.loc[:, ['prediction']]
tuned_preds.head()

Unnamed: 0,prediction
0,1
1,0
2,1
3,1
4,0


In [138]:
# Write the manually adjusted predictions as a submission (no index column).
tuned_preds.to_csv('../submissions/manual_1.csv', index=False)