In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTENC
from sklearn.model_selection import StratifiedKFold

import imblearn

import warnings
warnings.filterwarnings("ignore")

In [23]:
# Keep the path in one place so the log line always names the file that was
# actually read (the old message said "telecom_users.csv" while loading the
# "_eda" file).
DATA_PATH = "../Data/telecom_users_eda.csv"
df = pd.read_csv(DATA_PATH)
print(f'{DATA_PATH.rsplit("/", 1)[-1]} -> shape: {df.shape}')

telecom_users.csv -> shape: (5986, 19)


In [24]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,Churn
0,Male,No,Yes,Yes,72,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),24.1,No
1,Female,No,No,No,44,Yes,No,Fiber optic,No,Yes,Yes,No,Yes,No,Month-to-month,Yes,Credit card (automatic),88.15,No
2,Female,Yes,Yes,No,38,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Bank transfer (automatic),74.95,Yes
3,Male,No,No,No,4,Yes,No,DSL,No,No,No,No,No,Yes,Month-to-month,Yes,Electronic check,55.9,No
4,Male,No,No,No,2,Yes,No,DSL,Yes,No,Yes,No,No,No,Month-to-month,No,Electronic check,53.45,No


### Train / Test Split

In [25]:
# Features are every column except the target; the target is encoded as
# 0 ("No") / 1 ("Yes").
X = df.drop(columns=["Churn"])
y = df["Churn"].map({"No": 0, "Yes": 1})

In [26]:
# Hold out 20% of the rows for testing; the split is stratified on the target
# and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
(X_train.shape, X_test.shape)

((4788, 18), (1198, 18))

In [27]:
# Split the feature names by dtype: numeric columns get scaled, object
# (string) columns get one-hot encoded further down.
num_cols = list(X_train.select_dtypes(include=[np.number]).columns)
cat_cols = list(X_train.select_dtypes(include=["object"]).columns)

In [28]:
# Display the numeric feature names selected above.
num_cols

['tenure', 'MonthlyCharges']

In [29]:
# Display the object-dtype (categorical) feature names selected above.
cat_cols

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [30]:
# Categorical branch: one-hot encode; categories unseen during fit are ignored
# at transform time instead of raising.
categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# Numeric branch: standardise each column.
numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

# Route each group of columns through its own branch.
preprocessor_new = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

In [31]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score


# Baseline random forest with every hyperparameter spelled out explicitly.
# max_features was 'auto', which scikit-learn deprecated in 1.1 and removed in
# 1.3; for classifiers 'auto' meant 'sqrt', so 'sqrt' keeps the same model
# while running on current releases.
clf_rf = RandomForestClassifier(bootstrap= True,
                                ccp_alpha= 0.0,
                                class_weight= None,
                                criterion= 'gini',
                                max_depth= None,
                                max_features= 'sqrt',
                                max_leaf_nodes= None,
                                max_samples= None,
                                min_impurity_decrease= 0.0,
                                min_samples_leaf= 1,
                                min_samples_split= 2,
                                min_weight_fraction_leaf= 0.0,
                                n_estimators= 100,
                                n_jobs= -1,          # use all available cores
                                oob_score= False,
                                random_state= 13,    # fixed seed for repeatable forests
                                verbose= 0,
                                warm_start= False)

In [32]:
# Chain preprocessing and the random forest so both are fitted together and
# applied consistently at prediction time.
pipeline_rf = Pipeline(
    steps=[
        ("pre_process", preprocessor_new),
        ("model", clf_rf),
    ]
)
pipeline_rf

In [33]:
# Fit on the training split, then score the held-out split.
y_pred = pipeline_rf.fit(X_train, y_train).predict(X_test)

# Class labels as learned by the fitted forest; used to label the matrix axes.
class_labels = pipeline_rf.named_steps['model'].classes_

print(classification_report(y_test, y_pred))

# Confusion matrix as a labelled frame (rows: true class, columns: predicted).
pd.DataFrame(
    data=confusion_matrix(y_test, y_pred),
    index=class_labels,
    columns=class_labels,
)

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       880
           1       0.58      0.47      0.52       318

    accuracy                           0.77      1198
   macro avg       0.70      0.67      0.68      1198
weighted avg       0.76      0.77      0.76      1198



Unnamed: 0,0,1
0,770,110
1,168,150


In [35]:
def predict_from_array(values):
    """Predict with pipeline_rf from the plain arrays KernelExplainer supplies.

    KernelExplainer passes NumPy arrays to the model function, but the
    pipeline's ColumnTransformer selects columns by name and so needs a
    DataFrame. This rebuilds a frame with the training column names and dtypes
    before delegating to the pipeline.
    """
    frame = pd.DataFrame(values, columns=X_train.columns)
    frame = frame.astype(X_train.dtypes.to_dict())
    return pipeline_rf.predict(frame)


# Kernel SHAP evaluates the model against every background row for each
# sampled coalition, so the full training set is impractically slow. A small
# seeded sample keeps it tractable and repeatable.
shap_background = X_train.sample(n=min(100, len(X_train)), random_state=42)
explainer = shap.KernelExplainer(predict_from_array, data=shap_background)
# shap_values = explainer.shap_values(X_test)

In [None]:
# Debugging aid: display the Python type of pipeline_rf.
type(pipeline_rf)


### SMOTE

In [36]:
# Show the class balance that motivates oversampling. A bare expression in the
# middle of a cell is discarded, so it has to be displayed explicitly.
display(y_train.value_counts(normalize=True))

# Column groups for the SMOTE pipeline. Numeric selection uses np.number, the
# same rule used for num_cols, so both preprocessors see the same columns.
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()

categorical_features, numeric_features

(['gender',
  'SeniorCitizen',
  'Partner',
  'Dependents',
  'PhoneService',
  'MultipleLines',
  'InternetService',
  'OnlineSecurity',
  'OnlineBackup',
  'DeviceProtection',
  'TechSupport',
  'StreamingTV',
  'StreamingMovies',
  'Contract',
  'PaperlessBilling',
  'PaymentMethod'],
 ['tenure', 'MonthlyCharges'])

In [37]:
# SMOTENC takes the positional indices of the categorical columns.
categorical_feature_idxs = np.where(X_train.dtypes == "object")[0]

# Seeded so the synthetic minority rows, and every metric computed downstream,
# are identical on each Restart & Run All.
smtnc = SMOTENC(categorical_features=categorical_feature_idxs, random_state=42)

# Oversample the training split only; the test split is left untouched.
X_train_smote, y_train_smote = smtnc.fit_resample(X_train, y_train)

In [38]:
# Categorical branch: one-hot encode; categories unseen during fit are ignored
# at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
                                          ("onehot", OneHotEncoder(handle_unknown="ignore"))
                                          ])
# Numeric branch: standardise each column.
numeric_transformer = Pipeline(steps=[
                                      ("scaler", StandardScaler())
                                      ])
# Preprocessor for the SMOTE experiment. It was previously defined twice in
# this cell with identical arguments; one definition is enough.
preprocessor = ColumnTransformer(transformers=[
                                               ("num", numeric_transformer, numeric_features),
                                               ("cat", categorical_transformer, categorical_features)
                                               ])

# Logistic regression to be trained on the SMOTE-resampled data.
lr_model_smote = LogisticRegression(random_state=42, solver='liblinear')

In [39]:
# Preprocessing followed by logistic regression, to be fitted on the
# oversampled training data.
pipeline_lr_smote = Pipeline(
    steps=[
        ("col_transformer", preprocessor),
        ("estimator", lr_model_smote),
    ]
)

In [40]:
# Train on the SMOTE-resampled training set, then predict the original
# (un-resampled) test split.
y_pred_smote = (
    pipeline_lr_smote
    .fit(X_train_smote, y_train_smote)
    .predict(X_test)
)

In [41]:
# Label both axes with the classes of the model that produced these
# predictions, rather than reusing the random forest's class_labels and
# leaving the columns unlabelled (rows: true class, columns: predicted).
smote_labels = pipeline_lr_smote.named_steps["estimator"].classes_
pd.DataFrame(confusion_matrix(y_test, y_pred_smote, labels=smote_labels),
             columns=smote_labels, index=smote_labels)

Unnamed: 0,0,1
0,663,217
1,84,234


In [42]:
# Per-class precision / recall / F1 for the SMOTE-trained logistic regression.
print(classification_report(y_true=y_test, y_pred=y_pred_smote))

              precision    recall  f1-score   support

           0       0.89      0.75      0.81       880
           1       0.52      0.74      0.61       318

    accuracy                           0.75      1198
   macro avg       0.70      0.74      0.71      1198
weighted avg       0.79      0.75      0.76      1198

