**MODEL: CHURN PREDICTION**

**Import libraries**

In [1]:
import pandas as pd
import pandas_profiling as pp
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from collections import defaultdict

import lightgbm as lgb


# Work with data

### Load data

In [2]:
# First pass: let pandas infer dtypes so the non-numeric columns can be listed.
data = pd.read_csv("../data/train.csv", engine='c', sep=',', low_memory=False)
cat_columns = data.select_dtypes(exclude=[np.number]).columns

# Map every non-numeric column (plus 'zip') to the 'str' dtype for the re-read.
object_cols = {col: 'str' for col in cat_columns}
object_cols['zip'] = 'str'

# Second pass: reload the same file with the explicit dtype map.
data = pd.read_csv("../data/train.csv", engine='c', sep=',', dtype=object_cols)

# Keep the "Responders" column aside and remove it, together with "UCIC_ID",
# from the feature frame.
responders = data["Responders"].copy()
data = data.drop(columns=["Responders", "UCIC_ID"])


### Work with features (Optional)

**Create features**

In [12]:
# Optional, currently disabled: prefix features built from the first two and
# the first three characters of the `zip` column.
#data['zip1']=data['zip'].str[:2]
#data['zip2']=data['zip'].str[:3]


**Delete features**

*Delete constant features*

In [3]:
# Split the columns into numeric and non-numeric groups.
num_cols = data.select_dtypes(include=[np.number]).columns
cat_cols = data.select_dtypes(exclude=[np.number]).columns

# A column is constant when it holds exactly one distinct value once missing
# entries have been replaced by a placeholder (so NaN counts as a value).
# Numerical columns:
stats_num = data[num_cols].fillna(-999.99).nunique()
const_num = stats_num.index[stats_num == 1].tolist()
# Categorical columns:
stats_cat = data[cat_cols].fillna('is_nan').nunique()
const_cat = stats_cat.index[stats_cat == 1].tolist()
# Everything that has to go:
const_cols = const_num + const_cat

print("Number of constant numerical features ", len(const_num))
print("Number of constant categorical features ", len(const_cat))

print("Number of columns before dropping: ", data.shape[1])
data = data.drop(columns=const_cols)
print("And after: ", data.shape[1])


Number of constant numerical features  46
Number of constant categorical features  0
Number of columns before dropping:  375
And after:  329


*Delete categorical features with too many unique values*

In [4]:
# Re-derive the categorical columns from the current frame: columns removed by
# an earlier step would otherwise make `data[cat_cols]` raise a KeyError.
cat_cols = data.select_dtypes(exclude=[np.number]).columns

# Number of unique values for each categorical feature (NaN counts as a value).
stats_cat = data[cat_cols].fillna('is_nan').nunique()
#print(stats_cat)

# Select features to drop.
# Bucket 1: more than 1000 unique values.
feat_drop1 = stats_cat[stats_cat > 1000].index.tolist()
# Bucket 2: more than 10 and up to 1000 unique values. The upper bound is
# inclusive so that a feature with exactly 1000 values is not silently kept.
feat_drop2 = stats_cat[(stats_cat > 10) & (stats_cat <= 1000)].index.tolist()

print("Features to drop 1: ", feat_drop1)
print("Features to drop 2: ", feat_drop2)

data.drop(feat_drop1, axis=1, inplace=True)
data.drop(feat_drop2, axis=1, inplace=True)


Features to drop 1:  ['city', 'zip']
Features to drop 2:  ['Req_Resolved_PrevQ1', 'Query_Resolved_PrevQ1', 'Complaint_Resolved_PrevQ1']


### Deal with categorical features: encoding


In [5]:
# One LabelEncoder per categorical column, stored in `d` under the column name.
d = defaultdict(LabelEncoder)

cat_columns = data.select_dtypes(exclude=[np.number]).columns

# Fit each encoder and replace the column with its integer codes in a single
# pass; missing values are encoded as the empty string. Looping per column
# avoids a second fillna/apply pass and is a no-op when there are no
# categorical columns.
for col in cat_columns:
    data[col] = d[col].fit_transform(data[col].fillna(''))


# Model

**Introduce a model**

In [6]:

# LightGBM booster parameters passed to lgb.train() for every fold.
# `keep_training_booster` is intentionally not listed here: it is a keyword
# argument of lgb.train(), not a booster parameter, and its default is already
# False.
params = {
    'application': 'binary',      # alias of `objective`
    'num_leaves': 256,
    'feature_fraction': 0.9,
    'sub_row': 0.9,               # alias of `bagging_fraction`
    #'min_data_in_leaf': 50,
    'bagging_freq': 1,
    'max_bin': 256,
    #'metric': 'auc',
    # NOTE(review): l1/l2 are regression-style metrics on a binary objective;
    # confirm whether 'auc' (commented out above) was intended.
    'metric': ('l1', 'l2'),
    'num_threads': 4,
    'verbose': 0,
    'seed': 42,
}



**Cross-validation of the model**

In [7]:
def metric_lift(df, percentage=0.2):
    """Share of all responders found among the highest-scored rows.

    Parameters
    ----------
    df : DataFrame with a "prediction" column (score) and a "Responders"
        column (summed as the target).
    percentage : fraction of rows, taken from the top of the ranking, to keep.

    Returns
    -------
    Sum of "Responders" in the top `int(len(df) * percentage)` rows divided by
    the sum of "Responders" over the whole frame.
    """
    top_n = int(len(df) * percentage)
    ranked = df.sort_values('prediction', ascending=False)
    captured = ranked.iloc[:top_n]["Responders"].sum()
    total = df["Responders"].sum()
    return captured / total

# Manual k-fold cross-validation over contiguous, positional slices of `data`.
# NOTE(review): folds are neither shuffled nor stratified; StratifiedKFold is
# imported at the top of the notebook but not used here — confirm whether the
# row order of the file is safe to split on.
num_folds = 5
subset_size = int(len(data)/num_folds)
lifts = []
for i in range(num_folds):
    # Positional bounds of the held-out fold. The last fold runs to the end of
    # the frame so the trailing `len(data) % num_folds` rows are evaluated too
    # instead of only ever being used for training.
    start = i * subset_size
    stop = (i + 1) * subset_size if i < num_folds - 1 else len(data)

    x_test = data.iloc[start:stop]
    x_train = pd.concat([data.iloc[:start], data.iloc[stop:]])
    y_test = responders.iloc[start:stop]
    y_train = pd.concat([responders.iloc[:start], responders.iloc[stop:]])

    # Train on everything outside the held-out fold.
    lgb_train = lgb.Dataset(x_train, y_train)

    print('Start training number {} ...'.format(i+1))
    model = lgb.train(params, lgb_train, num_boost_round=100)
    print('Training number {} finished.'.format(i+1))

    # Score the held-out fold; the frame keeps the index of y_test.
    validation = pd.DataFrame({
        "Responders": y_test,
        "prediction": model.predict(x_test),
    })

    # Lift on the held-out fold.
    lift = metric_lift(validation)
    print('Lift meric = ', lift)

    # Keep the per-fold lift.
    lifts.append(lift)

# Mean and standard deviation of the lift over all folds.
mean_lifts = np.mean(lifts)
std_lifts = np.std(lifts)
print("Mean of lifts \t \t= ", mean_lifts)
print("Variation of lifts \t= ", std_lifts)


Start training number 1 ...
Training number 1 finished.
Lift meric =  0.6773787629464718
Start training number 2 ...
Training number 2 finished.
Lift meric =  0.6785052200214655
Start training number 3 ...
Training number 3 finished.
Lift meric =  0.6735220968050507
Start training number 4 ...
Training number 4 finished.
Lift meric =  0.6720092690933668
Start training number 5 ...
Training number 5 finished.
Lift meric =  0.6680574422666408
Mean of lifts 	 	=  0.673894558227
Variation of lifts 	=  0.00377260785328


**Save results of cross-validation of the model to a file (Optional)**

In [8]:
# Append the cross-validation summary to the shared comparison log.
summary_lines = [
    "\n#",
    "\nMean of lifts \t \t= {}".format(mean_lifts),
    "\nVariation of lifts \t= {}".format(std_lifts),
    "\n",
]
with open("compare_models.txt", "a") as report:
    report.writelines(summary_lines)
