# MachineHack Insurance Churn Prediction - Problem Statement


Insurance companies around the world operate in a very competitive environment. With various aspects of data collected from millions of customers, it is painstakingly hard to analyze and understand the reason for a customer’s decision to switch to a different insurance provider.

In an industry where customer acquisition and retention are equally important, and the former is the more expensive process, insurance companies rely on data to understand customer behavior and prevent churn. Knowing beforehand whether a customer is likely to switch gives insurance companies an opportunity to come up with strategies to prevent it from actually happening.

Given 16 distinguishing factors that can help in understanding customer churn, your objective as a data scientist is to build a Machine Learning model that uses these factors to predict whether the insurance company will lose a customer or not.

You are provided with 16 anonymized factors (feature_0 to feature_15) that influence the churn of customers in the insurance industry.

https://www.machinehack.com/course/insurance-churn-prediction-weekend-hackathon-2/

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectFromModel
%matplotlib inline
# Classification
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier

import xgboost as xgb
import lightgbm as lgb
import catboost as cat

# Preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score

# Reading Data

In [None]:
# Load the test and train CSV files from the relative ../input directory.
test = pd.read_csv('../input/insurance-churn-prediction/Test.csv')
train = pd.read_csv('../input/insurance-churn-prediction/Train.csv')

In [None]:
train.head()

In [None]:
sns.heatmap(train.isnull())

In [None]:
# Stack train and test into one frame so each feature transformation is applied
# to both at once; later cells split them apart again on whether 'labels' is
# missing. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
df = pd.concat([train, test], ignore_index=True)

I generally use pandas-profiling to get an overall look at the data before deep-diving!

In [None]:
# #pd.plotting.scatter_matrix(df, alpha=0.2, figsize=(10, 10))
# import pandas_profiling
# prof = pandas_profiling.ProfileReport(df)
# prof.to_file(output_file='output.html')

In [None]:
original_cols = test.columns
original_cols

In [None]:
df.info()

In [None]:
df['labels'].value_counts()

# Feature Engineering

For anonymised data, I tried to look into the transformations applied to each variable. First I try to find the shift and coefficient used for every variable, and then check whether any transformation such as an exponent or log function was applied.

Feature 3

In [None]:
#df['feature_3'].head()
#plt.hist(df['feature_3'])
sns.distplot(df['feature_3'])

In [None]:
x3=df['feature_3']
diff = np.diff(np.sort(x3))
diff
np.unique(diff)

In [None]:
# Rescale feature_3 (divide by the observed step, subtract the offset, shift by
# 260, round), then keep the natural log of the result as a new column.
x3_shift = x3.div(0.00388311868).sub(0.8369211).add(260).round()
x3_shift_log = np.log(x3_shift)
df['feature_3_shift_log'] = x3_shift_log
sns.distplot(x3_shift_log)

Feature 2

In [None]:
x2=df['feature_2']
np.diff(np.sort(x2.unique()))


In [None]:
# Rescale feature_2 (divide by the step, subtract the offset, shift by 15) and
# round; the distinct rounded values are displayed for inspection.
x2_shift = df['feature_2'].div(0.12015788).sub(0.193581).add(15).round()
np.sort(x2_shift.unique())

In [None]:
# Plot the rescaled feature_2, then store it together with a 0/1 flag marking
# values strictly greater than 15.
sns.distplot(x2_shift)
df['feature_2_shift'] = x2_shift
x2_shift_bin = (x2_shift > 15).astype('int64')
df['feature_2_shift_bin'] = x2_shift_bin

In [None]:
x2_shift.value_counts()

Feature 0

In [None]:
x0 = df['feature_0']
# Pass the series by keyword: seaborn 0.11 deprecated positional data
# arguments, and newer releases treat the first positional argument as `data`
# rather than `x`.
sns.boxplot(x=x0)

In [None]:
diff = np.diff(np.sort(x0))
np.unique(diff)

In [None]:
# Rescale feature_0 (divide by the step, subtract the offset, shift by 54,
# round), plot the log of the result and store that log as a new column.
x0_shift = x0.div(0.09417398).sub(0.063788).add(54).round()
sns.distplot(np.log(x0_shift))
df['feature_0_shift_log'] = np.log(x0_shift)

Feature 1

In [None]:
x1 = df['feature_1']
# Pass the series by keyword: seaborn 0.11 deprecated positional data
# arguments, and newer releases treat the first positional argument as `data`
# rather than `x`.
sns.boxplot(x=x1)

In [None]:
diff = np.diff(np.sort(x1))
np.unique(diff)

In [None]:
# Rescale feature_1 (divide by the step, subtract the offset, shift by 10000,
# round), plot the log of the result and store that log as a new column.
x1_shift = x1.div(0.000328436115).sub(0.727934).add(10000).round()
x1_shift_log = np.log(x1_shift)
sns.distplot(x1_shift_log)
df['feature1_shift_log'] = x1_shift_log

In [None]:
x1_shift.value_counts()

Feature 14

In [None]:
x14 = df['feature_14']
#sns.boxplot(x14)
sns.distplot(x14)

In [None]:
x14.value_counts()

Feature 4

In [None]:
x4 = df['feature_4']

In [None]:
sns.distplot(x4)

In [None]:
diff = np.diff(np.sort(x4))
np.unique(diff)

In [None]:
# Keep the rescaled per-row series in x4_shift, not its value_counts() table:
# later cells take np.log(x4_shift) and store it as a column of df, which only
# makes sense when x4_shift is aligned row-for-row with df.
x4_shift = (x4/0.64558058 - 0.11808 + 2).round()
x4_shift.value_counts()

In [None]:
sns.distplot(np.log(x4_shift))
df['x4_shift_log'] = np.log(x4_shift)

Feature 5

In [None]:
x5 = df['feature_5']
x5.value_counts()

In [None]:
diff = np.diff(np.sort(x5))
np.unique(diff)

In [None]:
x5_shift = (x5/0.00998725 - 0.802206 +43).round()
x5_shift.value_counts()

In [None]:
# Store the log of the rescaled feature_5, plus a 0/1 flag for rows whose log
# value is exactly 0.
df['x5_shift_log'] = np.log(x5_shift)
df['x5_cat'] = (df['x5_shift_log'] == 0).astype('int64')

Feature 6

In [None]:
x6 = df['feature_6']
x6.value_counts()

In [None]:
diff = np.diff(np.sort(x6))
np.unique(diff)

In [None]:
# Rescale feature_6 (divide by the step, subtract the offset, shift by 2,
# round); store the rescaled value, its log, and a 0/1 flag for rows whose log
# value is exactly 0.
x6_shift = x6.div(0.4341379).sub(0.419677).add(2).round()
df['x6_shift'] = x6_shift
df['x6_shift_log'] = np.log(x6_shift)
df['x6_cat'] = (df['x6_shift_log'] == 0).astype('int64')


Feature 7

In [None]:
x7 = df['feature_7']
x7.value_counts()

In [None]:
# Remove the raw columns that were replaced by engineered versions, echoing
# each name as it is dropped.
for i in (0, 1, 2, 3, 4, 5, 6, 14):
    col = f'feature_{i}'
    print(col)
    df.drop(columns=col, inplace=True)


In [None]:

new_col = ['feature_3_shift_log',
       'feature_2_shift', 'feature_2_shift_bin', 'feature_0_shift_log',
       'feature1_shift_log', 'x4_shift_log', 'x5_shift_log', 'x5_cat',
       'x6_shift', 'x6_shift_log', 'x6_cat']
new_col

I tried making new features from these features with simple mathematical functions, but they didn't turn out to be too useful.

In [None]:
# for i in range(len(new_col)):
#     if(new_col[i]=='labels') :
        
#         continue
    
#     else :
        
#         for j in range(len(new_col)) :
            
#             if(new_col[j]=='labels') :
#                 continue
#             elif i<j :
# #                print(new_col[i],new_col[j])
#                 colm = new_col[i]+"_mul_"+new_col[j]
#                 cols = new_col[i]+"_sum_"+new_col[j]
#                 cold = new_col[i]+"_diff_"+new_col[j]
#                 coldi = new_col[i]+"_div_"+new_col[j]
#                 #print(col)
#                 df[colm] = df[new_col[i]]*df[new_col[j]]
#                 df[cols] = df[new_col[i]]+df[new_col[j]]
#                 #df[cold] = df[new_col[i]]-df[new_col[j]]
#                 #df[coldi] = df[new_col[i]]/df[new_col[j]]
#             else :
#                 continue

In [None]:
# Save the target before the column-wise dropna: 'labels' is NaN for some rows
# (presumably the appended test rows), so dropna(axis=1) removes it along with
# every other column that contains a NaN.
labels = df['labels']
df = df.dropna(axis=1)

In [None]:
df['labels']= labels

In [None]:
sns.heatmap(df.isnull())

In [None]:
# Replace +/-inf values (e.g. produced by np.log of zero) with 0.
df = df.replace([np.inf, -np.inf], 0)

In [None]:
# Every column except the target is treated as a model feature.
feat = df.columns.drop('labels')

In [None]:
feat

In [None]:
target = 'labels'

In [None]:
(train[target].value_counts() / train.shape[0])*100

In [None]:
# Rows with a known label form the training frame.
df_train = df.loc[df['labels'].notnull()].copy()

I also checked for class imbalance, but it didn't play a very big role.

In [None]:
# from imblearn.over_sampling import SMOTE
# sm = SMOTE(random_state=42, sampling_strategy='all')
# X_train_ovr, y_train_ovr = sm.fit_sample(df_train[feat], df_train[target])

# print("After Oversampling : {} --> {}".format(X_train_ovr.shape, y_train_ovr.shape))

In [None]:
# train_ovr = pd.DataFrame(X_train_ovr, columns=df_train.columns.tolist())
# train_ovr[target] = y_train_ovr

# train_ovr.shape

In [None]:
# (train_ovr[target].value_counts() / train_ovr.shape[0])*100

# Modelling

Checking a baseline score with all models, then finalised on LGBM for submission.

In [None]:
def baseliner(X, y, cv=3, metric='f1_macro'):
    """Cross-validate a fixed set of default-configured classifiers.

    Each model is scored with ``cross_val_score`` and its mean score is
    printed as one row of a small table.

    Args:
        X: Feature matrix, forwarded unchanged to ``cross_val_score``.
        y: Target values, forwarded unchanged to ``cross_val_score``.
        cv: Cross-validation setting forwarded as ``cv``.
        metric: Scoring name forwarded as ``scoring``.

    Returns:
        dict: ``{model_name: {'cv': mean_score}}`` for every model evaluated,
        so the scores can be compared programmatically instead of being
        discarded after printing.
    """
    print("Baseliner Models\n")
    eval_dict = {}
    models = [lgb.LGBMClassifier(), xgb.XGBClassifier(),
              #GradientBoostingClassifier(),
                  LogisticRegression(), GaussianNB(), RandomForestClassifier(), DecisionTreeClassifier(),
                  ExtraTreeClassifier(), AdaBoostClassifier(), BaggingClassifier(),
              #ExtraTreesClassifier(),
              #SVC(probability=True), KNeighborsClassifier() 
                 ]
    print("Model Name \t |   f1")
    print("--" * 50)

    for model in models:
        # The estimator's repr starts with its class name followed by "(".
        model_name = str(model).split("(")[0]

        results = cross_val_score(model, X, y, cv=cv, scoring=metric)
        eval_dict[model_name] = {'cv': results.mean()}

        # Names are truncated to 12 characters to keep the table aligned.
        print("%s \t | %.4f \t" % (
            model_name[:12], eval_dict[model_name]['cv']))

    return eval_dict

In [None]:
# Split the combined frame back apart: labelled rows are training data,
# unlabelled rows are the test set (which does not keep the target column).
df_train = df.loc[df['labels'].notnull()].copy()
df_test = df.loc[df['labels'].isnull()].drop(columns='labels')

print(df_train.shape, df_test.shape)

In [None]:
# Separate the target from the feature columns.
y = df_train['labels']
x = df_train.drop(columns='labels')

In [None]:
baseliner(x, y)

In [None]:
from sklearn.metrics import f1_score

def lgb_f1_score(y_hat, data):
    """Custom evaluation function that reports F1 on rounded predictions.

    Args:
        y_hat: Model outputs; rounded with ``np.round`` before scoring because
            ``f1_score`` expects class labels rather than probabilities.
        data: Object exposing ``get_label()``, which supplies the true labels.

    Returns:
        tuple: ``('f1', score, True)`` -- the metric name, its value, and a
        flag that is always ``True`` (LightGBM's "higher is better" marker
        for custom eval functions).
    """
    predicted_labels = np.round(y_hat)
    score = f1_score(data.get_label(), predicted_labels)
    return 'f1', score, True

In [None]:
def lgb_model(train, features, target, ts=False, plot=True):
    """Train a binary LightGBM booster with early stopping on a hold-out split.

    The data is split without shuffling, so the final 20% of the rows of
    ``train`` form the validation set. Training runs for at most 2000 rounds
    and stops early after 250 rounds without improvement on the validation
    set. ``lgb_f1_score`` is evaluated as an additional metric.

    Early stopping, logging and metric recording are configured through
    callbacks rather than the ``early_stopping_rounds`` / ``verbose_eval`` /
    ``evals_result`` keyword arguments, which were removed from ``lgb.train``
    in LightGBM 4.0.

    Args:
        train: DataFrame holding both the feature columns and the target.
        features: Column labels to use as model inputs.
        target: Name of the target column.
        ts: Currently unused; kept so existing callers keep working.
        plot: When true, plot feature importance and the recorded 'f1' curve.

    Returns:
        tuple: ``(booster, booster.best_score)``.
    """
    evals_result = {}
    # shuffle=False preserves row order, so random_state has no effect here.
    trainX, validX, trainY, validY = train_test_split(
        train[features], train[target],
        shuffle=False, test_size=0.2, random_state=13,
    )
    print("LGB Model")
    lgb_train_set = lgb.Dataset(trainX, label=trainY)
    lgb_valid_set = lgb.Dataset(validX, label=validY)

    MAX_ROUNDS = 2000
    lgb_params = {
        "boosting": 'gbdt',
        "learning_rate": 0.1,
        "nthread": -1,
        "seed": 13,
        "objective": "binary",
    }

    # log_evaluation was introduced in LightGBM 3.3 as the new name for
    # print_evaluation; fall back to the old name on earlier releases.
    log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation

    booster = lgb.train(
        lgb_params,
        train_set=lgb_train_set,
        num_boost_round=MAX_ROUNDS,
        valid_sets=[lgb_train_set, lgb_valid_set],
        feval=lgb_f1_score,  # extra F1 metric reported alongside the default
        callbacks=[
            lgb.early_stopping(250),
            log_evaluation(100),
            lgb.record_evaluation(evals_result),
        ],
    )
    if plot:
        lgb.plot_importance(booster, figsize=(24, 24))
        lgb.plot_metric(evals_result, metric='f1')

    return booster, booster.best_score

In [None]:
lgbM, score = lgb_model(df_train, feat, target, True, True)

In [None]:
y_preds = lgbM.predict(df_test[feat])
y_preds

In [None]:
# Convert predicted probabilities to hard labels: 1 when strictly above 0.5,
# otherwise 0.
df_lgb = pd.DataFrame({'labels': (y_preds > 0.5).astype('int64')})

In [None]:
df_lgb['labels'].value_counts()

In [None]:
import time

# Timestamp the file name so repeated runs do not overwrite earlier submissions.
times = time.strftime("%Y%m%d-%H%M%S")
df_lgb.to_excel(f'submission-lgb_{times}.xlsx', index=False)