# Train Telecom Customer Churn Prediction with XGBoost

This tutorial is based on [this Kaggle notebook](https://www.kaggle.com/pavanraj159/telecom-customer-churn-prediction/comments#6.-Model-Performances) and [this Feast notebook](https://github.com/gojek/feast/tree/master/examples/feast-xgboost-churn-prediction-tutorial).

In [14]:
import numpy as np
import pandas as pd
from hops import featurestore, hdfs
from hops import numpy_helper as numpy
from hops import pandas_helper as pandas
import os
import itertools
import warnings
warnings.filterwarnings("ignore")
import io
import statsmodels, yellowbrick
import sklearn # Tested with 0.22.1
import imblearn
from slugify import slugify

### 1.1 Data

In [15]:
# Fetch the churn feature group from the feature store as a pandas DataFrame,
# then preview its first rows.
telecom_df = featurestore.get_featuregroup(
    "telcom_featuregroup",
    dataframe_type="pandas",
)
telecom_df.head()

Running sql: use churn_featurestore against offline feature store
SQL string for the query created successfully
Running sql: SELECT * FROM telcom_featuregroup_1 against offline feature store
   churn  dependents  ...    tenure  total_charges
0      0           0  ...  0.512486       0.005713
1      1           0  ... -1.239504      -0.924663
2      1           0  ... -1.158016      -0.892103
3      0           0  ... -1.239504      -0.990400
4      1           0  ... -1.158016      -0.875360

[5 rows x 47 columns]

### 1.6 Data Preparation for Training

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,scorer
from sklearn.metrics import f1_score
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score
from yellowbrick.classifier import DiscriminationThreshold

# Identifier and label columns; both are kept out of the feature matrix.
Id_col = ['customer_id']
target_col = ["churn"]

# Hold out 25% of the rows for evaluation. The fixed random_state makes the
# split reproducible across runs.
train, test = train_test_split(telecom_df, test_size=.25, random_state=111)

# Separating dependent and independent variables: every column that is
# neither an identifier nor the label is treated as a feature.
cols = [
    column
    for column in telecom_df.columns
    if column not in Id_col and column not in target_col
]
# Indexing with the list `target_col` keeps the labels as one-column DataFrames.
training_x, training_y = train[cols], train[target_col]
testing_x, testing_y = test[cols], test[target_col]

### 1.7 Training

In [22]:
from xgboost import XGBClassifier

# Every hyperparameter is spelled out explicitly, one per line and grouped by
# purpose, so the training configuration is visible at a glance.
xgb_model = XGBClassifier(
    # Learning task
    objective='binary:logistic',
    base_score=0.5,
    scale_pos_weight=1,
    # Booster and tree shape
    booster='gbtree',
    n_estimators=100,
    max_depth=7,
    learning_rate=0.9,
    min_child_weight=1,
    max_delta_step=0,
    gamma=0,
    # Sampling
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    # Regularization
    reg_alpha=0,
    reg_lambda=1,
    # Runtime and reproducibility
    n_jobs=1,
    nthread=None,
    random_state=0,
    seed=None,
    missing=None,
    silent=True,
)

# Train model
xgb_model.fit(training_x, training_y)

# Score the held-out set: predicted labels and predicted probabilities.
predictions = xgb_model.predict(testing_x)
probabilities = xgb_model.predict_proba(testing_x)

### 1.8 Analysis

In [23]:
# Pair each feature name with its importance score from the fitted model.
# Both frames carry a default positional index, so an index-on-index left
# merge lines them up row by row.
coefficients = pd.DataFrame(xgb_model.feature_importances_)
column_df = pd.DataFrame(cols)
coef_sumry = coefficients.merge(
    column_df, how="left", left_index=True, right_index=True
)
coef_sumry.columns = ["coefficients", "features"]
# Most important features first.
coef_sumry = coef_sumry.sort_values(by="coefficients", ascending=False)

# Report the model configuration and its scores on the held-out test set.
acc = accuracy_score(testing_y, predictions)
print(xgb_model)
print(
    "\n Classification report : \n",
    classification_report(testing_y, predictions),
)
print("Accuracy   Score : ", acc)


XGBClassifier(learning_rate=0.9, max_depth=7, silent=True)

 Classification report : 
               precision    recall  f1-score   support

           0       0.82      0.87      0.85      1282
           1       0.59      0.49      0.54       476

    accuracy                           0.77      1758
   macro avg       0.70      0.68      0.69      1758
weighted avg       0.76      0.77      0.76      1758

Accuracy   Score :  0.7690557451649602

In [24]:
from hops import model
import pickle
# Name under which the model is registered by model.export below.
MODEL_NAME = "CClassifier"

# Local pickle file and the HDFS directory it is copied into.
file_name = "xgb_reg.pkl"
hdfs_path = "Resources/xgboost_model"

# save
# A context manager guarantees the pickle is flushed and closed before the
# file is copied to HDFS.
with open(file_name, "wb") as model_file:
    pickle.dump(xgb_model, model_file)
hdfs.mkdir(hdfs_path)
hdfs.copy_to_hdfs(file_name, hdfs_path, overwrite=True)

# load
# NOTE: unpickling executes arbitrary code; this is safe here only because the
# file was written by this same cell a few lines above.
with open(file_name, "rb") as model_file:
    xgb_model_loaded = pickle.load(model_file)

# test
# The reloaded model must reproduce the in-memory model's predictions on the
# whole test set; fail loudly before exporting if the round trip changed it.
assert np.array_equal(
    xgb_model_loaded.predict(testing_x), xgb_model.predict(testing_x)
), "Reloaded model predictions differ from the original model"

model.export(hdfs_path, MODEL_NAME, metrics={'accuracy': acc})

Started copying local path xgb_reg.pkl to hdfs path hdfs://rpc.namenode.service.consul:8020/Projects/churn/Resources/xgboost_model/xgb_reg.pkl

Finished copying

Exported model CClassifier as version 1 successfully.
Polling CClassifier version 1 for model availability.
Model now available.