In [79]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import statsmodels.formula.api as smf
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix,roc_auc_score,roc_curve

In [80]:
tele = pd.read_csv('telecommunications_churn.csv')
tele

Unnamed: 0,account_length,voice_mail_plan,voice_mail_messages,day_mins,evening_mins,night_mins,international_mins,customer_service_calls,international_plan,day_calls,day_charge,evening_calls,evening_charge,night_calls,night_charge,international_calls,international_charge,total_charge,churn
0,128,1,25,265.1,197.4,244.7,10.0,1,0,110,45.07,99,16.78,91,11.01,3,2.70,75.56,0
1,107,1,26,161.6,195.5,254.4,13.7,1,0,123,27.47,103,16.62,103,11.45,3,3.70,59.24,0
2,137,0,0,243.4,121.2,162.6,12.2,0,0,114,41.38,110,10.30,104,7.32,5,3.29,62.29,0
3,84,0,0,299.4,61.9,196.9,6.6,2,1,71,50.90,88,5.26,89,8.86,7,1.78,66.80,0
4,75,0,0,166.7,148.3,186.9,10.1,3,1,113,28.34,122,12.61,121,8.41,3,2.73,52.09,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,192,1,36,156.2,215.5,279.1,9.9,2,0,77,26.55,126,18.32,83,12.56,6,2.67,60.10,0
3329,68,0,0,231.1,153.4,191.3,9.6,3,0,57,39.29,55,13.04,123,8.61,4,2.59,63.53,0
3330,28,0,0,180.8,288.8,191.9,14.1,2,0,109,30.74,58,24.55,91,8.64,6,3.81,67.74,0
3331,184,0,0,213.8,159.6,139.2,5.0,2,1,105,36.35,84,13.57,137,6.26,10,1.35,57.53,0


In [81]:
drop = ['day_charge','evening_charge','night_charge','international_charge','day_calls','evening_calls','night_calls','voice_mail_messages','day_mins','evening_mins','night_mins','account_length']
tele1 = tele.drop(columns=drop)
x = tele1.drop("churn", axis=1)
y = tele1.churn
tele1.shape

(3333, 7)

In [82]:
tele1

Unnamed: 0,voice_mail_plan,international_mins,customer_service_calls,international_plan,international_calls,total_charge,churn
0,1,10.0,1,0,3,75.56,0
1,1,13.7,1,0,3,59.24,0
2,0,12.2,0,0,5,62.29,0
3,0,6.6,2,1,7,66.80,0
4,0,10.1,3,1,3,52.09,0
...,...,...,...,...,...,...,...
3328,1,9.9,2,0,6,60.10,0
3329,0,9.6,3,0,4,63.53,0
3330,0,14.1,2,0,6,67.74,0
3331,0,5.0,2,1,10,57.53,0


In [83]:
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=42)

max_depth = range(1, 11, 1)
print(max_depth)

params={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "n_estimators"     : [50, 100, 200, 150, 250, 300]  
}


param_grid = dict(max_depth=max_depth)
params.update(param_grid)

In [84]:
xmodel = XGBClassifier()
xmodel.fit(train_x,train_y)
y_pred1  = xmodel.predict(train_x)
y_pred2  = xmodel.predict(test_x)

In [85]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost
from xgboost.sklearn import XGBClassifier



In [86]:
classifier=xgboost.XGBClassifier()

In [87]:
random_search=RandomizedSearchCV(classifier,param_distributions=params,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)
random_search.fit(train_x,train_y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [88]:
random_search.best_estimator_

In [89]:
random_search.best_params_

{'n_estimators': 150, 'max_depth': 2, 'learning_rate': 0.25}

In [90]:
xgb2 = XGBClassifier(n_estimators=150,learning_rate=0.25,max_depth=2)
xgb2.fit(train_x,train_y)
y_pred4  = xgb2.predict(test_x)

In [91]:
print(classification_report(test_y, y_pred4))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       566
           1       1.00      0.87      0.93       101

    accuracy                           0.98       667
   macro avg       0.99      0.94      0.96       667
weighted avg       0.98      0.98      0.98       667



In [93]:
confusion_matrix(test_y, y_pred4)

array([[566,   0],
       [ 13,  88]], dtype=int64)

In [94]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(classifier,train_x,train_y,cv=10)

In [95]:
score

array([0.98876404, 0.97752809, 0.9588015 , 0.98876404, 0.98127341,
       0.97752809, 0.96240602, 0.98120301, 0.97744361, 0.98120301])

In [96]:
score.mean()

0.9774914815127707

### Pipeline

In [54]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn_pandas import DataFrameMapper

In [55]:
tele1

Unnamed: 0,voice_mail_plan,international_mins,customer_service_calls,international_plan,international_calls,total_charge,churn
0,1,10.0,1,0,3,75.56,0
1,1,13.7,1,0,3,59.24,0
2,0,12.2,0,0,5,62.29,0
3,0,6.6,2,1,7,66.80,0
4,0,10.1,3,1,3,52.09,0
...,...,...,...,...,...,...,...
3328,1,9.9,2,0,6,60.10,0
3329,0,9.6,3,0,4,63.53,0
3330,0,14.1,2,0,6,67.74,0
3331,0,5.0,2,1,10,57.53,0


In [68]:
tele1['voice_mail_plan'].replace({0:"No",1:'Yes'}, inplace=True)
tele1['international_plan'].replace({0:"No",1:'Yes'}, inplace=True)
#tele1.replace({'churn': {0: "No", 1: "Yes"}}, inplace=True)


In [69]:
tele1

Unnamed: 0,voice_mail_plan,international_mins,customer_service_calls,international_plan,international_calls,total_charge,churn
0,Yes,10.0,1,No,3,75.56,0
1,Yes,13.7,1,No,3,59.24,0
2,No,12.2,0,No,5,62.29,0
3,No,6.6,2,Yes,7,66.80,0
4,No,10.1,3,Yes,3,52.09,0
...,...,...,...,...,...,...,...
3328,Yes,9.9,2,No,6,60.10,0
3329,No,9.6,3,No,4,63.53,0
3330,No,14.1,2,No,6,67.74,0
3331,No,5.0,2,Yes,10,57.53,0


In [70]:
X = tele1.drop("churn", axis=1)
y = tele1.churn

In [72]:
y.value_counts()

0    2850
1     483
Name: churn, dtype: int64

In [73]:
m = DataFrameMapper([(["voice_mail_plan",LabelEncoder()]),(["international_plan",LabelEncoder()])])

In [74]:
classifier = []
classifier.append(("mapper",m))
classifier.append(("model", XGBClassifier(n_estimators=150, max_depth=5,learning_rate= 0.1)))

In [75]:
classifier

[('mapper',
  DataFrameMapper(drop_cols=[],
                  features=[['voice_mail_plan', LabelEncoder()],
                            ['international_plan', LabelEncoder()]])),
 ('model',
  XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=0.1, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=5, max_leaves=None,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                n_estimators=150, n_jobs=None, num_parallel_tree=None,
                predictor=None, random_state=None, ...))]

In [76]:
model= Pipeline(classifier)

In [77]:
model.fit(X,y)

In [78]:
import pickle
with open(file="Tele_Final_M.pkl", mode="wb") as f:
    pickle.dump(model, f)