In [1]:
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier,BaggingClassifier,AdaBoostClassifier,ExtraTreesClassifier,StackingClassifier, HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer, RobustScaler
from sklearn.model_selection import KFold,StratifiedKFold, GroupKFold,train_test_split
import gc
import datetime
from tqdm.notebook import tqdm ,tnrange
import warnings
# Notebook-wide configuration: silence all warnings, seed NumPy's global RNG,
# and let pandas render up to 500 columns when displaying a frame.
warnings.simplefilter("ignore")
np.random.seed(0)
pd.options.display.max_columns = 500

In [2]:
## Load the train and test data
df_train = pd.read_csv("train_s3TEQDk.csv")
df_test = pd.read_csv("test_mSzZ8RL.csv")

## Some rows are identical in every column except ID (the target Is_Lead included);
## keep a single copy of each.
non_id_cols = [name for name in df_train.columns if 'ID' not in name]
df_train = df_train.drop_duplicates(subset=non_id_cols)

## Some observations share every feature value but disagree on the target;
## keep=False marks all members of such a group, so every one of them is dropped.
feature_only_cols = [name for name in non_id_cols if 'Is_Lead' not in name]
has_conflicting_label = df_train.duplicated(subset=feature_only_cols, keep=False)
df_train = (
    df_train[~has_conflicting_label]
    .sort_values(list(df_train.columns))
    .reset_index(drop=True)
)

## Stack train and test so that preprocessing is applied to both at once;
## test rows end up with a missing Is_Lead.
df_total = pd.concat([df_train, df_test], ignore_index=True, sort=False)
print("Shape of datasets: ", df_train.shape, df_test.shape, df_total.shape)

Shape of datasets:  (245700, 11) (105312, 10) (351012, 11)


In [3]:
# ## Fill null values in the Credit_Product column with a placeholder string for now;
# ## later, let's see whether this column can be predicted from the other features.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selection: the in-place form is chained
# assignment, which stops modifying df_total under pandas Copy-on-Write, and the
# accompanying warning is suppressed by the notebook's global warning filter.
df_total['Credit_Product'] = df_total['Credit_Product'].fillna("Missing")

# ## One idea that just struck me is converting this into a text classification problem,
# ## just so that we can use large BERT architectures. Let's see if I work on that.

# ## The statement would be: will a 23 year old, financially active, self employed female living in RG282,
# ## who has been associated with the company for 14 years and already has a credit product with 1005068
# ## bank balance be interested in buying a credit card if contacted through the X2 channel?

In [6]:
# df_total['Vintage'] = np.log(1+df_total['Vintage'])
# df_total['Avg_Account_Balance'] = np.log(1+df_total['Avg_Account_Balance'])
# df_total['Age'] = np.log(1+df_total['Age'])

In [7]:
# Show the combined train + test frame as a sanity check; rows that came from
# the test file have no Is_Lead value.
df_total

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,222A8XWS,Male,66,RG280,Other,X2,50,No,819609,No,0.0
1,222HSZEH,Male,49,RG268,Self_Employed,X3,69,Yes,679666,No,0.0
2,222TDSNN,Male,32,RG262,Salaried,X1,32,No,761982,No,0.0
3,224FPNSD,Female,39,RG276,Self_Employed,X2,26,Missing,583519,Yes,1.0
4,224VSEND,Male,29,RG261,Salaried,X1,13,No,736866,Yes,0.0
...,...,...,...,...,...,...,...,...,...,...,...
351007,DBENJOYI,Male,52,RG268,Salaried,X2,86,Yes,4242558,Yes,
351008,CWQ72DWS,Male,55,RG277,Other,X2,86,Yes,1159153,No,
351009,HDESC8GU,Male,35,RG254,Salaried,X4,15,No,1703727,No,
351010,2PW4SFCA,Male,53,RG254,Other,X3,93,No,737178,Yes,


In [8]:
## Rows with a known target are training rows; rows whose target is missing are test rows.
has_target = df_total['Is_Lead'].notnull()

## Training part: pull the target out as its own Series and drop the ID column,
## since IDs are all unique and carry no signal.
train_x_full = df_total[has_target].reset_index(drop=True)
train_y_full = train_x_full.pop('Is_Lead')
train_x_full = train_x_full.drop(columns='ID')

## Test part: keep the IDs aside for the submission file, then drop the empty target column.
test_x_full = df_total[~has_target].reset_index(drop=True)
test_id = test_x_full.pop('ID')
test_x_full = test_x_full.drop(columns='Is_Lead')

print(train_x_full.shape, train_y_full.shape, test_x_full.shape)

(245700, 9) (245700,) (105312, 9)


In [9]:
# Columns stored with object dtype are treated as the categorical features.
cat_features = [name for name, dtype in train_x_full.dtypes.items() if dtype == 'O']
cat_features

['Gender',
 'Region_Code',
 'Occupation',
 'Channel_Code',
 'Credit_Product',
 'Is_Active']

### CATBOOST

In [10]:
# Final ROC-AUC = 0.8744
# Stratified K-fold cross-validation with CatBoost. Each fold:
#   * trains on the in-fold rows with early stopping,
#   * stores out-of-fold probabilities (for the overall ROC-AUC and train_pred_all),
#   * stores test-set probabilities (averaged over folds for the submission),
#   * records the model's feature importances.
train_x = train_x_full.copy()
train_y = train_y_full.copy()
test_x = test_x_full.copy()

NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=18121995)
pred_test = []  # one list of test-set probabilities per fold
pred_x = []     # out-of-fold probabilities, appended fold after fold
pred_y = []     # targets in the same (fold) order as pred_x
fold_importances = []  # per-fold importance frames, concatenated once after the loop
for fold, (train_ids, test_ids) in enumerate(folds.split(train_x, train_y)):
    print('● Fold :', fold+1)
    # Slice each fold's frames once and reuse them for fitting, evaluation and prediction.
    fold_train_x, fold_train_y = train_x.loc[train_ids], train_y.loc[train_ids]
    fold_valid_x, fold_valid_y = train_x.loc[test_ids], train_y.loc[test_ids]

    # NOTE(review): the model seed (1812195) differs from the fold seed (18121995);
    # possibly a typo, but it is left as-is so previously reported scores stay reproducible.
    model = CatBoostClassifier(n_estimators=20000,random_state=1812195,learning_rate=0.03,eval_metric='AUC',
                              cat_features =cat_features)
    # Two eval sets are passed (in-fold first, held-out second); in the training log
    # the reported `best` value follows the held-out set (`test1`).
    model.fit(fold_train_x, fold_train_y,
              eval_set=[(fold_train_x, fold_train_y), (fold_valid_x, fold_valid_y)],
              verbose=500,
              early_stopping_rounds=200)

    # Last column of predict_proba is the probability of the positive class.
    pred_fold = model.predict_proba(fold_valid_x)[:,-1]
    pred_x.extend(pred_fold.tolist())
    pred_y.extend(fold_valid_y.values.tolist())
    pred_fold_test = model.predict_proba(test_x)[:,-1]
    pred_test.append(pred_fold_test.tolist())
    print('\n')

    # Label the importances with the same 1-based fold number that is printed above
    # (the previous counter-based label was shifted by one, i.e. 2..NFOLDS+1).
    fold_importance_df = pd.DataFrame({
        "feature": train_x.columns,
        "importance": model.feature_importances_,
        "fold": fold + 1,
    })
    fold_importances.append(fold_importance_df)

# Build the importance table in one go instead of re-concatenating inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)

print("Final ROC-AUC Score:", roc_auc_score(pred_y, pred_x))

# One column of test predictions per fold; their row-wise mean is the final score.
final_prediction = pd.DataFrame(pred_test).T
final_prediction.columns = [("FOLD_"+str(i)) for i in range(final_prediction.shape[1])]

vote = final_prediction.mean(axis=1)
final_prediction['Is_Lead'] = vote
final_prediction['ID'] = test_id.values
print(final_prediction.shape)

# Out-of-fold predictions next to their targets. Rows are in fold order
# (fold 1's held-out rows, then fold 2's, ...), not in the original train order.
train_pred_all = pd.DataFrame()
train_pred_all['Is_Lead'] = pred_y
train_pred_all['cb_pred'] = pred_x
final_prediction_cb = final_prediction.copy()

final_prediction.head()

● Fold : 1
0:	test: 0.8654373	test1: 0.8606467	best: 0.8606467 (0)	total: 190ms	remaining: 1h 3m 13s
500:	test: 0.8770102	test1: 0.8709221	best: 0.8709369 (498)	total: 1m 13s	remaining: 47m 23s
1000:	test: 0.8789633	test1: 0.8711379	best: 0.8711658 (892)	total: 2m 27s	remaining: 46m 44s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8711658313
bestIteration = 892

Shrink model to first 893 iterations.


● Fold : 2
0:	test: 0.8645943	test1: 0.8658580	best: 0.8658580 (0)	total: 135ms	remaining: 45m 4s
500:	test: 0.8760418	test1: 0.8751661	best: 0.8752054 (463)	total: 1m 15s	remaining: 48m 52s
1000:	test: 0.8779842	test1: 0.8753507	best: 0.8753530 (994)	total: 2m 57s	remaining: 56m 2s
1500:	test: 0.8792899	test1: 0.8753662	best: 0.8753841 (1390)	total: 4m 23s	remaining: 54m 11s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.875384108
bestIteration = 1390

Shrink model to first 1391 iterations.


● Fold : 3
0:	test: 0.8650985	test1: 0.8637284	bes

Unnamed: 0,FOLD_0,FOLD_1,FOLD_2,FOLD_3,FOLD_4,Is_Lead,ID
0,0.053367,0.049848,0.047747,0.049892,0.048808,0.049932,VBENBARO
1,0.865028,0.870687,0.871377,0.863397,0.877021,0.869502,CCMEWNKY
2,0.049734,0.064214,0.05872,0.055306,0.060793,0.057754,VK3KGA9M
3,0.023982,0.025493,0.026267,0.02657,0.024196,0.025301,TT8RPZVC
4,0.023913,0.023497,0.024482,0.02413,0.022598,0.023724,SHQZEYTZ


In [11]:
# Keep only the identifier and the fold-averaged probability, then write the submission file.
submission_columns = ['ID', 'Is_Lead']
df_submission = final_prediction.loc[:, submission_columns]
df_submission.to_csv("cb_v1.csv", index=False)
df_submission.head()

Unnamed: 0,ID,Is_Lead
0,VBENBARO,0.049932
1,CCMEWNKY,0.869502
2,VK3KGA9M,0.057754
3,TT8RPZVC,0.025301
4,SHQZEYTZ,0.023724


In [12]:
# Average each feature's importance over the folds and rank from most to least important.
all_features = (
    feature_importance_df
    .groupby("feature")[["importance"]]
    .mean()
    .sort_values(by="importance", ascending=False)
    .reset_index()
)
# Features whose mean importance is non-zero.
is_used = all_features['importance'].ne(0)
important_features = all_features.loc[is_used, 'feature'].tolist()
all_features

Unnamed: 0,feature,importance
0,Credit_Product,42.00705
1,Occupation,20.41232
2,Vintage,13.816159
3,Age,12.671497
4,Channel_Code,4.253038
5,Is_Active,2.729471
6,Region_Code,1.943427
7,Avg_Account_Balance,1.785278
8,Gender,0.38176
