Importing libraries and data files

In [53]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import LabelEncoder


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import warnings 
warnings.filterwarnings('ignore')
pd.set_option('max_columns', 20)

/kaggle/input/janatahack-crosssell-prediction/sample_submission.csv
/kaggle/input/janatahack-crosssell-prediction/test.csv
/kaggle/input/janatahack-crosssell-prediction/train.csv


In [54]:
train = pd.read_csv('/kaggle/input/janatahack-crosssell-prediction/train.csv',index_col=0)
test = pd.read_csv('/kaggle/input/janatahack-crosssell-prediction/test.csv',index_col=0)
sample_submission = pd.read_csv('/kaggle/input/janatahack-crosssell-prediction/sample_submission.csv',index_col=0)

In [55]:
train[['Driving_License','Previously_Insured','Policy_Sales_Channel']] = train[['Driving_License','Previously_Insured','Policy_Sales_Channel']].astype('object')
train['Response'] = train['Response'].astype('object')

test[['Driving_License','Previously_Insured','Policy_Sales_Channel']] = test[['Driving_License','Previously_Insured','Policy_Sales_Channel']].astype('object')

Data pre-processing

In [56]:
X = train.drop('Response',axis=1)
y = train['Response'].values

In [57]:
X = pd.get_dummies(X,drop_first=True)
test = pd.get_dummies(test,drop_first=True)

In [58]:
le = LabelEncoder()
y = le.fit_transform(y)

Training the model

In [47]:
clf = LGBMClassifier(n_estimators=550,
                     learning_rate=0.03,
                     min_child_samples=40,
                     random_state=1,
                     colsample_bytree=0.5,
                     reg_alpha=2,
                     reg_lambda=2)

clf.fit(X, y, verbose=50,eval_metric = 'auc')

LGBMClassifier(colsample_bytree=0.5, learning_rate=0.03, min_child_samples=40,
               n_estimators=550, random_state=1, reg_alpha=2, reg_lambda=2)

In [48]:
lgb_pred = clf.predict_proba(X)[:,1]
roc_auc_score(y,lgb_pred)

0.8648823717319468

Pre-processing test data and predicting probabilities

In [51]:
missing_cols

{'Policy_Sales_Channel_104.0',
 'Policy_Sales_Channel_143.0',
 'Policy_Sales_Channel_144.0',
 'Policy_Sales_Channel_149.0',
 'Policy_Sales_Channel_27.0',
 'Policy_Sales_Channel_28.0',
 'Policy_Sales_Channel_41.0',
 'Policy_Sales_Channel_50.0',
 'Policy_Sales_Channel_67.0',
 'Policy_Sales_Channel_68.0',
 'Policy_Sales_Channel_75.0',
 'Policy_Sales_Channel_84.0'}

In [59]:
print(test.shape,X.shape)

missing_cols = set(X.columns) - set(test.columns)
for c in missing_cols:
    test[c] = 0

print(test.shape,X.shape)

#keeping the order of columns same for X and test
test = test[X.columns]

(127037, 154) (381109, 164)
(127037, 166) (381109, 164)


In [60]:
test_pred = clf.predict_proba(test)[:,1]

In [61]:
sample_submission['Response'] = test_pred
sample_submission.to_csv('Submission_v1.csv')

Hyperparameter tuning 

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2020, stratify=y)

In [66]:
#Preparing learning rate shrinkage
def learning_rate_010_decay_power_099(current_iter):
    base_learning_rate = 0.1
    lr = base_learning_rate  * np.power(.99, current_iter)
    return lr if lr > 1e-3 else 1e-3

def learning_rate_010_decay_power_0995(current_iter):
    base_learning_rate = 0.1
    lr = base_learning_rate  * np.power(.995, current_iter)
    return lr if lr > 1e-3 else 1e-3

def learning_rate_005_decay_power_099(current_iter):
    base_learning_rate = 0.05
    lr = base_learning_rate  * np.power(.99, current_iter)
    return lr if lr > 1e-3 else 1e-3

In [67]:
#Using test subset for early stopping
import lightgbm as lgb
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_test,y_test)],
            'eval_names': ['valid'],
            #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': 100,
            'categorical_feature': 'auto'}

In [69]:
#Setup hyperparameter search
from scipy.stats import uniform as sp_uniform
from scipy.stats import randint as sp_randint
param_test ={'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

In [70]:
#This parameter defines the number of HP points to be tested
n_HP_points_to_test = 100

import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

#n_estimators is set to a "large value". The actual number of trees build will depend on early stopping and 5000 define only the absolute maximum
clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=5000)
gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test, 
    n_iter=n_HP_points_to_test,
    scoring='roc_auc',
    cv=3,
    refit=True,
    random_state=314,
    verbose=True)

In [71]:
gs.fit(X_train, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.857045
Early stopping, best iteration is:
[97]	valid's auc: 0.857101
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.857103
Early stopping, best iteration is:
[103]	valid's auc: 0.857104
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.856945
Early stopping, best iteration is:
[133]	valid's auc: 0.857077
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.856571
Early stopping, best iteration is:
[135]	valid's auc: 0.856633
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.856815
Early stopping, best iteration is:
[85]	valid's auc: 0.856887
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.856703
Early stopping, best iteration is:
[106]	valid's auc: 0.856746
Training until validation scores don't improve for 30 rounds
Early stopping, best iteratio

Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.857339
Early stopping, best iteration is:
[119]	valid's auc: 0.857399
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.855354
Early stopping, best iteration is:
[166]	valid's auc: 0.855607
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.855609
Early stopping, best iteration is:
[145]	valid's auc: 0.855807
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.855468
Early stopping, best iteration is:
[153]	valid's auc: 0.855626
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.857489
Early stopping, best iteration is:
[81]	valid's auc: 0.85763
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[40]	valid's auc: 0.857287
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.857501
Early stopping, best iteration

[100]	valid's auc: 0.856624
Early stopping, best iteration is:
[95]	valid's auc: 0.85663
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.856813
Early stopping, best iteration is:
[73]	valid's auc: 0.856903
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.856741
Early stopping, best iteration is:
[83]	valid's auc: 0.856836
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.857408
[200]	valid's auc: 0.857504
Early stopping, best iteration is:
[173]	valid's auc: 0.85762
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.857558
[200]	valid's auc: 0.85793
Early stopping, best iteration is:
[200]	valid's auc: 0.85793
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.857598
Early stopping, best iteration is:
[133]	valid's auc: 0.857719
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[6]	

[100]	valid's auc: 0.855597
Early stopping, best iteration is:
[130]	valid's auc: 0.85575
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.855735
Early stopping, best iteration is:
[135]	valid's auc: 0.85583
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.856423
Early stopping, best iteration is:
[78]	valid's auc: 0.856551
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.856647
Early stopping, best iteration is:
[106]	valid's auc: 0.856697
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.856422
Early stopping, best iteration is:
[80]	valid's auc: 0.856544
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.852773
[200]	valid's auc: 0.853362
[300]	valid's auc: 0.853649
Early stopping, best iteration is:
[315]	valid's auc: 0.853663
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.853216
[200]	va

[500]	valid's auc: 0.857155
Early stopping, best iteration is:
[525]	valid's auc: 0.857184
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.855636
[200]	valid's auc: 0.856667
[300]	valid's auc: 0.857085
[400]	valid's auc: 0.857256
Early stopping, best iteration is:
[465]	valid's auc: 0.857399
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.855759
[200]	valid's auc: 0.856806
[300]	valid's auc: 0.857128
Early stopping, best iteration is:
[335]	valid's auc: 0.857177
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.855195
[200]	valid's auc: 0.856223
[300]	valid's auc: 0.856618
[400]	valid's auc: 0.856999
Early stopping, best iteration is:
[419]	valid's auc: 0.857022
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.855534
[200]	valid's auc: 0.856531
[300]	valid's auc: 0.856995
[400]	valid's auc: 0.857197
Early stopping, best iteration is:
[404]	valid's auc: 0.

Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.856588
[200]	valid's auc: 0.857196
Early stopping, best iteration is:
[221]	valid's auc: 0.857241
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.856584
Early stopping, best iteration is:
[167]	valid's auc: 0.857015
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.856905
Early stopping, best iteration is:
[92]	valid's auc: 0.856992
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.85717
Early stopping, best iteration is:
[91]	valid's auc: 0.85725
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.857183
Early stopping, best iteration is:
[99]	valid's auc: 0.85719
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.857171
Early stopping, best iteration is:
[113]	valid's auc: 0.857339
Training until validation scores don't improve for 30 rounds
[100]

[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed: 22.4min finished


Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.857973
Early stopping, best iteration is:
[159]	valid's auc: 0.858295
Best score reached: 0.8573948889237052 with params: {'colsample_bytree': 0.5284213741879101, 'min_child_samples': 125, 'min_child_weight': 10.0, 'num_leaves': 22, 'reg_alpha': 0.1, 'reg_lambda': 20, 'subsample': 0.3080033455431848} 


In [72]:
opt_parameters = {'colsample_bytree': 0.5284213741879101, 'min_child_samples': 125, 'min_child_weight': 10.0, 'num_leaves': 22, 'reg_alpha': 0.1, 'reg_lambda': 20, 'subsample': 0.3080033455431848} 

In [76]:
best_params = {'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.5284213741879101,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 125,
 'min_child_weight': 10.0,
 'min_split_gain': 0.0,
 'n_estimators': 5000,
 'n_jobs': 4,
 'num_leaves': 22,
 'objective': None,
 'random_state': 314,
 'reg_alpha': 0.1,
 'reg_lambda': 20,
 'silent': True,
 'subsample': 0.3080033455431848,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'metric': 'None'}

In [77]:
clf_final = lgb.LGBMClassifier(**gs.best_estimator_.get_params())

In [78]:
clf_final.fit(X_train, y_train, **fit_params, callbacks=[lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_0995)])

Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.858007
[200]	valid's auc: 0.85834
[300]	valid's auc: 0.858407
Early stopping, best iteration is:
[336]	valid's auc: 0.858414


LGBMClassifier(colsample_bytree=0.5284213741879101, metric='None',
               min_child_samples=125, min_child_weight=10.0, n_estimators=5000,
               n_jobs=4, num_leaves=22, random_state=314, reg_alpha=0.1,
               reg_lambda=20, subsample=0.3080033455431848)

In [None]:
probabilities = clf_final.predict_proba(test)[:,1]
sample_submission['Response'] = probabilities
sample_submission.to_csv('Submission_v2.csv')