Importing libraries and data files

In [591]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from lightgbm import plot_importance
import regex as re



import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import warnings 
warnings.filterwarnings('ignore')
pd.set_option('max_columns', 20)

/kaggle/input/janatahack-crosssell-prediction/test.csv
/kaggle/input/janatahack-crosssell-prediction/sample_submission.csv
/kaggle/input/janatahack-crosssell-prediction/train.csv


In [592]:
train = pd.read_csv('/kaggle/input/janatahack-crosssell-prediction/train.csv',index_col=0)
test = pd.read_csv('/kaggle/input/janatahack-crosssell-prediction/test.csv',index_col=0)
sample_submission = pd.read_csv('/kaggle/input/janatahack-crosssell-prediction/sample_submission.csv',index_col=0)

In [593]:
train[['Driving_License','Previously_Insured','Policy_Sales_Channel','Region_Code']] = train[['Driving_License','Previously_Insured','Policy_Sales_Channel','Region_Code']].astype('object')
train['Response'] = train['Response'].astype('object')

test[['Driving_License','Previously_Insured','Policy_Sales_Channel','Region_Code']] = test[['Driving_License','Previously_Insured','Policy_Sales_Channel','Region_Code']].astype('object')

Treating outliers

In [594]:
# premium_median = train['Annual_Premium'].median()
# train['Annual_Premium'] = np.where(train['Annual_Premium']> 1/00000.000000, premium_median, train['Annual_Premium'])

# premium_median_test = test['Annual_Premium'].median()
# test['Annual_Premium'] = np.where(test['Annual_Premium']> 100000.000000, premium_median_test, test['Annual_Premium'])

Feature engineering

In [595]:
# train['Vehicle_Age_Damage'] = train['Vehicle_Age'] + '_' + train['Vehicle_Damage']
# test['Vehicle_Age_Damage'] = test['Vehicle_Age'] + '_' + test['Vehicle_Damage']

In [596]:
train['Premium_sqrt'] = np.sqrt(train['Annual_Premium'])
test['Premium_sqrt'] = np.sqrt(test['Annual_Premium'])

In [597]:
train['Region_Count'] = train.groupby('Region_Code')['Region_Code'].transform('count')
test['Region_Count'] = test.groupby('Region_Code')['Region_Code'].transform('count')

In [598]:
train['Channel_Count'] = train.groupby('Policy_Sales_Channel')['Policy_Sales_Channel'].transform('count')
test['Channel_Count'] = test.groupby('Policy_Sales_Channel')['Policy_Sales_Channel'].transform('count')

In [599]:
train['Premium_mean'] = train.groupby('Annual_Premium')['Annual_Premium'].transform('mean')
test['Premium_mean'] = test.groupby('Annual_Premium')['Annual_Premium'].transform('mean')

In [600]:
train.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Premium_sqrt,Region_Count,Channel_Count,Premium_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,Male,44,1,28,0,> 2 Years,Yes,40454.0,26,217,1,201.131798,106415,79700,40454.0
2,Male,76,1,3,0,1-2 Year,No,33536.0,26,183,0,183.12837,9251,79700,33536.0
3,Male,47,1,28,0,> 2 Years,Yes,38294.0,26,27,1,195.688528,106415,79700,38294.0
4,Male,21,1,11,1,< 1 Year,No,28619.0,152,203,0,169.171511,9232,134784,28619.0
5,Female,29,1,41,1,< 1 Year,No,27496.0,152,39,0,165.819179,18263,134784,27496.0


In [601]:
test.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Premium_sqrt,Region_Count,Channel_Count,Premium_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
381110,Male,25,1,11,1,< 1 Year,No,35786.0,152,53,189.17188,3096,44739,35786.0
381111,Male,40,1,28,0,1-2 Year,Yes,33762.0,7,111,183.744388,35522,559,33762.0
381112,Male,47,1,28,0,1-2 Year,Yes,40050.0,124,199,200.124961,35522,24304,40050.0
381113,Male,24,1,27,1,< 1 Year,Yes,37356.0,152,187,193.277003,888,44739,37356.0
381114,Male,27,1,28,1,< 1 Year,No,59097.0,152,297,243.098745,35522,44739,59097.0


Data pre-processing

In [602]:
X = train.drop('Response',axis=1)
# imp_cols = ['Gender','Age', 'Driving_License', 'Previously_Insured','Vehicle_Damage','Annual_Premium',
#        'Vintage','Vehicle_Age_Damage','Policy_Sales_Channel'] 
# X = train[imp_cols]
# test = test[imp_cols]
y = train['Response'].values

In [603]:
#Scaling numeric variables

sc = StandardScaler()
X[X.select_dtypes(exclude='object').columns.to_list()] = sc.fit_transform(X.select_dtypes(exclude='object'))

In [604]:
test[test.select_dtypes(exclude='object').columns.to_list()] = sc.transform(test.select_dtypes(exclude='object'))

In [605]:
X.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Premium_sqrt,Region_Count,Channel_Count,Premium_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,Male,0.333777,1,28,0,> 2 Years,Yes,0.574539,26,0.748795,0.624516,1.57643,-0.023951,0.574539
2,Male,2.396751,1,3,0,1-2 Year,No,0.172636,26,0.342443,0.313807,-0.667688,-0.023951,0.172636
3,Male,0.527181,1,28,0,> 2 Years,Yes,0.449053,26,-1.521998,0.530574,1.57643,-0.023951,0.449053
4,Male,-1.148985,1,11,1,< 1 Year,No,-0.113018,152,0.581474,0.072936,-0.668126,1.1212,-0.113018
5,Female,-0.633242,1,41,1,< 1 Year,No,-0.178259,152,-1.37858,0.01508,-0.459545,1.1212,-0.178259


In [606]:
test.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Premium_sqrt,Region_Count,Channel_Count,Premium_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
381110,Male,-0.891113,1,11,1,< 1 Year,No,0.30335,152,-1.211258,0.418108,-0.809845,-0.750761,0.30335
381111,Male,0.075906,1,28,0,1-2 Year,Yes,0.185766,7,-0.518068,0.324439,-0.060928,-1.669226,0.185766
381112,Male,0.527181,1,28,0,1-2 Year,Yes,0.551068,124,0.533668,0.607139,-0.060928,-1.175588,0.551068
381113,Male,-0.955581,1,27,1,< 1 Year,Yes,0.39456,152,0.390249,0.488955,-0.860841,-0.750761,0.39456
381114,Male,-0.762177,1,28,1,< 1 Year,No,1.657607,152,1.704919,1.348794,-0.060928,-0.750761,1.657607


In [607]:
#Encoding categorical variables

X = pd.get_dummies(X,drop_first=True)
test = pd.get_dummies(test,drop_first=True)

In [608]:
le = LabelEncoder()
y = le.fit_transform(y)

Training the model

In [589]:
# regex = re.compile(r"\[|\]|<", re.IGNORECASE)
# X.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X.columns.values]

# xgb = XGBClassifier()
# xgb.fit(X,y)
# xgb_pred = xgb.predict_proba(X)[:,1]
# roc_auc_score(y,xgb_pred)

In [609]:
clf = LGBMClassifier(n_estimators=550,
                     learning_rate=0.03,
                     min_child_samples=40,
                     random_state=1,
                     colsample_bytree=0.5,
                     reg_alpha=2,
                     reg_lambda=2)

clf.fit(X, y, verbose=50,eval_metric = 'auc')
lgb_pred = clf.predict_proba(X)[:,1]
roc_auc_score(y,lgb_pred)

0.8658209389232592

In [611]:
# plot_importance(clf,max_num_features=20);

Pre-processing test data and predicting probabilities

In [612]:
print(test.shape,X.shape)

missing_cols = set(X.columns) - set(test.columns)
for c in missing_cols:
    test[c] = 0

print(test.shape,X.shape)

#keeping the order of columns same for X and test
test = test[X.columns]

(127037, 209) (381109, 219)
(127037, 221) (381109, 219)


In [613]:
preds = 0
for seed_val in [1,3,10,15,20,33,333,1997,2020,2021]:
    print (seed_val)
    m=LGBMClassifier(n_estimators=450,learning_rate=0.03,random_state=seed_val,colsample_bytree=0.5,reg_alpha=2,reg_lambda=2)
    m.fit(X,y)
    predict=m.predict_proba(test)[:,1]
    preds += predict
preds = preds/10

1
3
10
15
20
33
333
1997
2020
2021


In [487]:
sample_submission['Response'] = preds
sample_submission.to_csv('Submission_v7.csv')