# Data description

In [None]:
# necessary download for running hyperparameter tuning
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.0.5-py3-none-any.whl (348 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.5/348.5 KB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0
  Downloading alembic-1.9.1-py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.4/210.4 KB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting cliff
  Downloading cliff-4.1.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.0/81.0 KB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting importlib-metadata<5.0.0
  Downloading importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (7

In [None]:
# libraries for data preparation
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# libraries for modeling
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# libraries for displaying performance metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# library for hyperparameter tuning
import optuna
%matplotlib inline

In [None]:
data = pd.read_csv("/content/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [None]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# Data Preparation

## 1. Checking missing values

In [None]:
data.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

## 2. Removing empty cells

In [None]:
for columns in data.columns.tolist():
  print(f'{columns} - ',len(data[data[columns] == ' ']) )

customerID -  0
gender -  0
SeniorCitizen -  0
Partner -  0
Dependents -  0
tenure -  0
PhoneService -  0
MultipleLines -  0
InternetService -  0
OnlineSecurity -  0
OnlineBackup -  0
DeviceProtection -  0
TechSupport -  0
StreamingTV -  0
StreamingMovies -  0
Contract -  0
PaperlessBilling -  0
PaymentMethod -  0
MonthlyCharges -  0
TotalCharges -  11
Churn -  0


In [None]:
new_data = data.drop( # because 'TotalCharges' contains empty cells
    labels=data.loc[data['TotalCharges']==' ',].index.values.tolist(),
    axis=0,
    inplace=False
)

new_data['TotalCharges']=new_data['TotalCharges'].astype(float) # to avoid TypeError: no numeric data to plot

#### Before removing empty cells, we had 7043 rows. We dropped 11 rows due to empty cells. 

In [None]:
new_data.shape

(7032, 21)

In [None]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


## 3. Dropping customerID

In [None]:
new_data = new_data.drop('customerID', axis = 1)

## 4. Dropping duplicates

In [None]:
new_data = new_data.drop_duplicates(keep='first') 
new_data.shape

(7010, 20)

## 4. Encoding categorical features

### Encoding output feature using LabelEncoder

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(new_data.drop(columns=['Churn']), new_data['Churn'], test_size=0.2, random_state=45)
le = LabelEncoder()
le.fit(Y_train)
Y_train_le = le.transform(Y_train)
Y_test_le = le.transform(Y_test)

In [None]:
print(Y_train_le.shape)
print(Y_test_le.shape)

(5608,)
(1402,)


### Encoding input features using ColumnTransformer

In [None]:
transformer = ColumnTransformer(transformers=[
    # applying one hot coder because all columns are nominal cat variables
    ('tnf1', OneHotEncoder(sparse=False), 
     ['gender', 
      'Partner', 
      'Dependents', 
      'PhoneService', 
      'MultipleLines', 
      'InternetService', 
      'OnlineSecurity', 
      'OnlineBackup', 
      'DeviceProtection', 
      'TechSupport', 
      'StreamingTV', 
      'StreamingMovies', 
      'Contract', 
      'PaperlessBilling', 
      'PaymentMethod'])
],remainder='passthrough')

X_train_ohe = transformer.fit_transform(X_train)
X_test_ohe = transformer.transform(X_test)

In [None]:
print(X_train_ohe.shape)
print(X_test_ohe.shape)

(5608, 45)
(1402, 45)


## 5. Removing class imbalance

In [None]:
# concatenating split train and test columns for smote
X = np.concatenate((X_train_ohe,X_test_ohe), axis=0)
Y = np.concatenate((Y_train_le,Y_test_le), axis=0)

In [None]:
smote = SMOTE()
# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(X, Y)
y_smote = pd.Series(y_smote)
y_smote.value_counts()

0    5153
1    5153
dtype: int64

# Modeling

## 1. Random Forest

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(x_smote, y_smote, test_size=0.3, random_state=100)
classifier= RandomForestClassifier() 
classifier.fit(X_train, Y_train)  

RandomForestClassifier()

In [None]:
y_pred= classifier.predict(X_test)

In [None]:
print("Accuracy is: ", accuracy_score(Y_test, y_pred)*100)

Accuracy is:  85.25226390685641


In [None]:
# confusion matrix
confusion_matrix(Y_test, y_pred)

array([[1358,  171],
       [ 285, 1278]])

In [None]:
# confusion matrix metrics
matrix = classification_report(Y_test, y_pred)
print(matrix)

              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1529
           1       0.88      0.82      0.85      1563

    accuracy                           0.85      3092
   macro avg       0.85      0.85      0.85      3092
weighted avg       0.85      0.85      0.85      3092



### Hyperparameter tuning



In [None]:
import sklearn.model_selection
X_train, X_test, Y_train, Y_test = train_test_split(x_smote, y_smote, test_size=0.3, random_state=110)
def objective(trial):
    n_estimators = trial.suggest_categorical('n_estimators', [200, 400, 600, 800, 1000])
    max_depth = trial.suggest_int('max_depth', 2, 20)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt','log2', None])
    min_samples_leaf = trial.suggest_categorical('min_samples_leaf', [1,2,4])
    min_samples_split = trial.suggest_categorical('min_samples_split', [2,5,10])
    clf = RandomForestClassifier(max_depth=max_depth, 
                                 n_estimators=n_estimators,
                                 max_features = max_features,
                                 min_samples_leaf=min_samples_leaf,
                                 min_samples_split=min_samples_split
                                 )
    cv = KFold(n_splits=12, shuffle=True, random_state=43)
    return sklearn.model_selection.cross_val_score(clf, X_train, Y_train, 
       n_jobs=-1, cv=cv).mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

[32m[I 2023-01-06 21:55:53,780][0m A new study created in memory with name: no-name-d6c0702e-9a96-4684-804d-d20f412f0df0[0m
[32m[I 2023-01-06 21:57:44,013][0m Trial 0 finished with value: 0.8500134972535623 and parameters: {'n_estimators': 400, 'max_depth': 7, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}. Best is trial 0 with value: 0.8500134972535623.[0m
[32m[I 2023-01-06 21:58:47,484][0m Trial 1 finished with value: 0.8622135403710685 and parameters: {'n_estimators': 800, 'max_depth': 19, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2}. Best is trial 1 with value: 0.8622135403710685.[0m
[32m[I 2023-01-06 21:59:28,199][0m Trial 2 finished with value: 0.8606894562956166 and parameters: {'n_estimators': 600, 'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 10}. Best is trial 1 with value: 0.8622135403710685.[0m
[32m[I 2023-01-06 22:00:43,163][0m Trial 3 finished with value: 0.860689225967057

In [None]:
trial = study.best_trial
print('Accuracy: {}'.format(trial.value))

Accuracy: 0.8641547494670196


In [None]:
print("Best hyperparameters: {}".format(trial.params))

Best hyperparameters: {'n_estimators': 400, 'max_depth': 17, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 10}


In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
optuna.visualization.plot_slice(study)

## 2. Gradient Boosted Trees

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(x_smote, y_smote, test_size=0.3, random_state=111)
classifier = GradientBoostingClassifier()
classifier = classifier.fit(X_train,Y_train)

In [None]:
y_pred= classifier.predict(X_test)

In [None]:
print("Accuracy is: ", accuracy_score(Y_test, y_pred)*100)

Accuracy is:  86.51358344113842


In [None]:
# confusion matrix
confusion_matrix(Y_test, y_pred)

array([[1311,  194],
       [ 223, 1364]])

In [None]:
# confusion matrix metrics
matrix = classification_report(Y_test, y_pred)
print(matrix)

              precision    recall  f1-score   support

           0       0.85      0.87      0.86      1505
           1       0.88      0.86      0.87      1587

    accuracy                           0.87      3092
   macro avg       0.87      0.87      0.87      3092
weighted avg       0.87      0.87      0.87      3092



### Hyperparameter tuning

In [None]:
import sklearn.model_selection
X_train, X_test, Y_train, Y_test = train_test_split(x_smote, y_smote, test_size=0.3, random_state=100)
def objective(trial):
    n_estimators = trial.suggest_categorical('n_estimators', [200, 400, 600, 800, 1000])
    max_depth = trial.suggest_int('max_depth', 2, 15)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt','log2'])
    learning_rate = trial.suggest_categorical('learning_rate', [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ])
    min_samples_leaf = trial.suggest_categorical('min_samples_leaf', [1,2,4])
    min_samples_split = trial.suggest_categorical('min_samples_split', [2,5,10])

    clf = GradientBoostingClassifier(max_depth=max_depth, 
                                 n_estimators=n_estimators,
                                 max_features = max_features,
                                 learning_rate = learning_rate,
                                 min_samples_leaf = min_samples_leaf,
                                 min_samples_split = min_samples_split)
    cv = KFold(n_splits=12, shuffle=True, random_state=42)
    return sklearn.model_selection.cross_val_score(clf, X_train, Y_train, 
       n_jobs=-1, cv=cv).mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

[32m[I 2023-01-07 12:02:01,326][0m A new study created in memory with name: no-name-75cc1b81-55c9-4b6b-bb8d-20a89d0a60b5[0m
[32m[I 2023-01-07 12:04:59,831][0m Trial 0 finished with value: 0.8422477395555211 and parameters: {'n_estimators': 400, 'max_depth': 14, 'max_features': 'log2', 'learning_rate': 0.1, 'min_samples_leaf': 2, 'min_samples_split': 5}. Best is trial 0 with value: 0.8422477395555211.[0m
[32m[I 2023-01-07 12:06:54,924][0m Trial 1 finished with value: 0.8583278975793389 and parameters: {'n_estimators': 200, 'max_depth': 10, 'max_features': 'auto', 'learning_rate': 0.05, 'min_samples_leaf': 1, 'min_samples_split': 10}. Best is trial 1 with value: 0.8583278975793389.[0m
[32m[I 2023-01-07 12:07:41,840][0m Trial 2 finished with value: 0.8441894093085905 and parameters: {'n_estimators': 800, 'max_depth': 5, 'max_features': 'log2', 'learning_rate': 0.2, 'min_samples_leaf': 4, 'min_samples_split': 10}. Best is trial 1 with value: 0.8583278975793389.[0m
[32m[I 2023-

In [None]:
trial = study.best_trial
print('Accuracy: {}'.format(trial.value))

Accuracy: 0.8691411324426067


In [None]:
print("Best hyperparameters: {}".format(trial.params))

Best hyperparameters: {'n_estimators': 200, 'max_depth': 5, 'max_features': 'sqrt', 'learning_rate': 0.05, 'min_samples_leaf': 1, 'min_samples_split': 5}


In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
optuna.visualization.plot_slice(study)

## 3. XGBoost

In [None]:
xgboost = XGBClassifier()

In [None]:
xgb = XGBClassifier()
xgb = xgb.fit(X_train, Y_train)

In [None]:
# generating predictions
y_pred = xgb.predict(X_test)

In [None]:
print("Accuracy is: ", accuracy_score(Y_test, y_pred)*100)

Accuracy is:  87.1927554980595


In [None]:
# confusion matrix
confusion_matrix(Y_test, y_pred)

array([[1370,  192],
       [ 204, 1326]])

In [None]:
# confusion matrix metrics
matrix = classification_report(Y_test, y_pred)
print(matrix)

              precision    recall  f1-score   support

           0       0.87      0.88      0.87      1562
           1       0.87      0.87      0.87      1530

    accuracy                           0.87      3092
   macro avg       0.87      0.87      0.87      3092
weighted avg       0.87      0.87      0.87      3092



### Hyperparameter tuning


In [None]:
import sklearn.model_selection
X_train, X_test, Y_train, Y_test = train_test_split(x_smote, y_smote, test_size=0.3, random_state=120)
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 230, 260)
    # max_depth = trial.suggest_int('max_depth', 2, 10)
    # min_child_weight = trial.suggest_int('min_child_weight', 6, 8)
    learning_rate = trial.suggest_categorical('learning_rate', [0.05, 0.10])
    # gamma = trial.suggest_categorical('gamma', [0.3, 0.4])
    # colsample_bytree = trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5 , 0.7])
    clf = XGBClassifier(max_depth=2, 
                        n_estimators=n_estimators,
                        min_child_weight=6,
                        learning_rate=learning_rate,
                        gamma=0.3,
                        colsample_bytree=0.5)
    cv = KFold(n_splits=12, shuffle=True, random_state=44)
    return sklearn.model_selection.cross_val_score(clf, X_train, Y_train, 
       n_jobs=-1, cv=cv).mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[32m[I 2023-01-06 18:46:09,022][0m A new study created in memory with name: no-name-09a83468-9ee1-47b1-a3b1-3cc90673e871[0m
[32m[I 2023-01-06 18:46:26,279][0m Trial 0 finished with value: 0.8534822453533515 and parameters: {'n_estimators': 237, 'learning_rate': 0.05}. Best is trial 0 with value: 0.8534822453533515.[0m
[32m[I 2023-01-06 18:46:37,245][0m Trial 1 finished with value: 0.8538979884024965 and parameters: {'n_estimators': 244, 'learning_rate': 0.05}. Best is trial 1 with value: 0.8538979884024965.[0m
[32m[I 2023-01-06 18:46:50,298][0m Trial 2 finished with value: 0.8616589092007966 and parameters: {'n_estimators': 254, 'learning_rate': 0.1}. Best is trial 2 with value: 0.8616589092007966.[0m
[32m[I 2023-01-06 18:47:01,188][0m Trial 3 finished with value: 0.8536206728173604 and parameters: {'n_estimators': 241, 'learning_rate': 0.05}. Best is trial 2 with value: 0.8616589092007966.[0m
[32m[I 2023-01-06 18:47:12,578][0m Trial 4 finished with value: 0.8547290138

In [None]:
trial = study.best_trial
print('Accuracy: {}'.format(trial.value))

Accuracy: 0.8619362247859327


In [None]:
print("Best hyperparameters: {}".format(trial.params))

Best hyperparameters: {'n_estimators': 239, 'learning_rate': 0.1}


In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
optuna.visualization.plot_slice(study)