**BASELINE MODEL: CHURN PREDICTION**

**Import libraries**

In [1]:
import pandas as pd
import pandas_profiling as pp
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from collections import defaultdict

import lightgbm as lgb


# Work with data

**Load data**

In [2]:
data = pd.read_csv("../data/train.csv", engine='c', sep=',', low_memory=False)
cat_columns = data.select_dtypes(exclude=[np.number]).columns
object_cols = {}
for col in cat_columns:
        object_cols[col] = 'str'
object_cols['zip']='str'

data = pd.read_csv("../data/train.csv", engine='c', sep=',', dtype=object_cols)
responders = data["Responders"].copy()
data.drop("Responders", axis=1, inplace=True)
data.drop("UCIC_ID", axis=1, inplace=True)


**Make profile about data**

In [4]:
#profile=pp.ProfileReport(data)
#profile.to_file(outputfile="../data/profile.html")

**Deal with cathegorical features. Encoding**


In [18]:
d = defaultdict(LabelEncoder)

cat_columns = data.select_dtypes(exclude=[np.number]).columns

# fit and encode data
a = data[cat_columns].fillna('').apply(lambda x: d[x.name].fit(x))

# transform encodings to data
data[cat_columns] = data[cat_columns].fillna('').apply(lambda x: d[x.name].transform(x))


# Model

**Introduce a model**

In [19]:

params = {
    'application': 'binary',
    'num_leaves':256,
    'feature_fraction': 0.9,
    'sub_row': 0.9,
    #'min_data_in_leaf': 50,
    'bagging_freq': 1,
    'max_bin': 256,
    #'metric': 'auc',
    'metric': ('l1', 'l2'),
    'num_threads': 4,
    'verbose': 0,
    'seed': 42,
    'keep_training_booster': False
}



** Cross-validation of the model**

In [42]:
def metric_lift(df, percentage=0.2):
    return df.sort_values('prediction', ascending=False).iloc[:int(len(df)*percentage)]["Responders"].sum()/df["Responders"].sum()

num_folds = 5
subset_size = int(len(data)/num_folds)
lifts=[]
for i in range(num_folds):
    #take fold
    x_test  = data[i*subset_size:][:subset_size]
    x_train = pd.concat([data[:i*subset_size],data[(i+1)*subset_size:]])
    y_test  = responders[i*subset_size:][:subset_size]
    y_train = pd.concat([responders[:i*subset_size],responders[(i+1)*subset_size:]])
    
    # train using training_this_round
    lgb_train = lgb.Dataset(x_train, y_train)
    
    print('Start training number {} ...'.format(i+1))
    model = lgb.train(params, lgb_train, num_boost_round=100)
    print('Training number {} finished.'.format(i+1))
    # predict test
    validation=pd.DataFrame()
    validation["Responders"]=y_test
    validation["prediction"]=model.predict(x_test)
    # calculate lift
    lift=metric_lift(validation)
    print('Lift meric = ', lift)

    # save accuracy
    lifts.append(lift)

#find mean accuracy over all rounds
mean_lifts=np.mean(lifts)
print("Mean of lifts = ", mean_lifts)


Start training number 1 ...
Training number 1 finished.
Lift meric =  0.6733133288161843
Start training number 2 ...
Training number 2 finished.
Lift meric =  0.6788955020001951
Start training number 3 ...
Training number 3 finished.
Lift meric =  0.6700784388750718
Start training number 4 ...
Training number 4 finished.
Lift meric =  0.6706575263107077
Start training number 5 ...
Training number 5 finished.
Lift meric =  0.6681544731224529
Mean of lifts =  0.672219853825
