In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [2]:
url = 'DATA_Customer-Churn.csv'
df = pd.read_csv(url)

In [3]:
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

In [4]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [5]:
# For rows with NaN in TotalCharges, calculate or replace based on the strategy

df.loc[(df['TotalCharges'].isna()) & (df['tenure'] > 1), 'TotalCharges'] = df['MonthlyCharges'] * df['tenure']
df.loc[(df['TotalCharges'].isna()) & (df['tenure'] <= 1), 'TotalCharges'] = df['MonthlyCharges']

In [6]:
def null_check(data_frame):
    print(f'Total null values per row: \n{data_frame.isnull().sum(axis=1)}\n')
    print(f'Total null values per column: \n{data_frame.isnull().sum()}\n')

In [7]:
null_check(df)

Total null values per row: 
0       0
1       0
2       0
3       0
4       0
       ..
7038    0
7039    0
7040    0
7041    0
7042    0
Length: 7043, dtype: int64

Total null values per column: 
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64



In [8]:
RAND_STATE = 42 # for reproducible shuffling
TT_RATIO = 0.25 # test/train

In [9]:
# X,y
y = df['Churn']
X = df.drop(['Churn'], axis=1)
# split the data by type
numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes(object)

In [10]:
# onehot encoding (needed for SMOTE and sci-kit learn's random forest)
encoder = OneHotEncoder(drop='if_binary').fit(categoricalX) #  drop the first category in each feature to reduce frame size
encoded_categorical = pd.DataFrame(encoder.transform(categoricalX).toarray()) # encode
X = pd.concat([numericalX, encoded_categorical], axis = 1) # rejoin
X.head(3)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,0,1,2,3,4,5,...,15,16,17,18,19,20,21,22,23,24
0,0,1,29.85,29.85,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0,34,56.95,1889.5,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0,2,53.85,108.15,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## Split Data

In [11]:
# #PowerTransform X_train
# pt1 = PowerTransformer()
# X_train = pt1.fit_transform(X_train)
# #PowerTransform X_test
# X_test = pt1.transform(X_test)

In [12]:
# category_0 = df[df['Churn'] == 0] # negative class (majority)
# category_1 = df[df['Churn'] == 1] # positive class (minority)

In [13]:
# print(category_0.shape)
# print(category_1.shape)

## Scaling

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TT_RATIO, random_state=RAND_STATE)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [15]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [16]:
# again we need to drop the null (we use the same shuffling)
na_idx = X_train[X_train.isna().any(axis=1)].index
X_train = pd.DataFrame(X_train).drop(na_idx)
y_train = pd.DataFrame(y_train).drop(na_idx)

In [17]:
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

In [18]:
X_train.columns = X_train.columns.astype(str)
y_train.columns = y_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)
y_test.columns = y_test.columns.astype(str)

## Upsampling using SMOTE

In [19]:
smote = SMOTE()
X_train, y_train = smote.fit_resample(X_train, y_train)
y_train.value_counts()

Churn
0        3892
1        3892
Name: count, dtype: int64

## Fit a Random forest Classifier

In [20]:
param_grid = {
    'n_estimators': [50, 100],
    'min_samples_split': [80],
    'min_samples_leaf' : [50],
    # 'max_features': ['sqrt'],
    ##'max_samples' : ['None', 0.5],
   'max_depth':[3,5]
    ## 'bootstrap':[True,False]
    }
clf = RandomForestClassifier(random_state=RAND_STATE)

In [21]:
grid_search = GridSearchCV(clf, param_grid, cv=5,return_train_score=True,n_jobs=-1,)

In [22]:
grid_search.fit(X_train,y_train.values.ravel())

In [23]:
best_params = grid_search.best_params_ #To check the best set of parameters returned
best_params

{'max_depth': 5,
 'min_samples_leaf': 50,
 'min_samples_split': 80,
 'n_estimators': 100}

In [24]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.304513,0.008944,0.025611,0.009721,3,50,80,50,"{'max_depth': 3, 'min_samples_leaf': 50, 'min_...",0.726397,...,0.783405,0.039324,4,0.817087,0.812269,0.782078,0.783363,0.78131,0.795221,0.015973
1,0.634283,0.036199,0.03371,0.002221,3,50,80,100,"{'max_depth': 3, 'min_samples_leaf': 50, 'min_...",0.725112,...,0.787389,0.043626,3,0.81532,0.811466,0.786253,0.788662,0.786127,0.797566,0.013012
2,0.387164,0.033212,0.021144,0.002394,5,50,80,50,"{'max_depth': 5, 'min_samples_leaf': 50, 'min_...",0.723828,...,0.803963,0.05455,2,0.840533,0.833628,0.803437,0.805685,0.806198,0.817896,0.015843
3,0.647615,0.049696,0.026526,0.003064,5,50,80,100,"{'max_depth': 5, 'min_samples_leaf': 50, 'min_...",0.722543,...,0.805762,0.058133,1,0.842139,0.831219,0.806327,0.810181,0.808606,0.819695,0.014344


<b> Please check RandomSearch as another algorithm comparable to GridSearch

In [25]:
clf = RandomForestClassifier(random_state=RAND_STATE, **best_params)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=5)
print(np.mean(cross_val_scores))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


0.8057616898887684


## Feature Importance

In [26]:
clf.fit(X_train, y_train)

  clf.fit(X_train, y_train)


In [27]:
len(X_train.columns)

29

In [28]:
feature_names = X_train.columns
feature_names = list(feature_names)

In [29]:
df = pd.DataFrame(list(zip(feature_names, clf.feature_importances_)))
df.columns = ['columns_name', 'score_feature_importance']
df.sort_values(by=['score_feature_importance'], ascending = False)

Unnamed: 0,columns_name,score_feature_importance
26,22,0.213842
8,4,0.134268
17,13,0.102491
28,24,0.072086
1,tenure,0.062585
11,7,0.059439
19,15,0.048398
14,10,0.040101
5,1,0.036824
6,2,0.036188


## Models Comparison (3 classification estimators)

In [30]:
model1 = DecisionTreeClassifier(max_depth=best_params['max_depth'],
                                min_samples_leaf=best_params['min_samples_leaf'],
                                min_samples_split=best_params['min_samples_split'])
model2 = LogisticRegression()
model3 = KNeighborsClassifier()
model4 = RandomForestClassifier(max_depth=best_params['max_depth'],
                                min_samples_leaf=best_params['min_samples_leaf'],
                                min_samples_split=best_params['min_samples_split'],
                                n_estimators=best_params['n_estimators'])

model_pipeline = [model1, model2, model3, model4]
model_names = ['Classification Tree', 'Logistic Regression', 'KNN', 'Random Forest']
scores = {}

# Assuming X_sm and y_sm are your oversampled training set
# Evaluate each model using cross-validation and store their mean recall scores
for i, model in enumerate(model_pipeline):
    mean_score = np.mean(cross_val_score(model, X_test, y_test, cv=5, scoring='recall'))
    scores[model_names[i]] = mean_score

print(scores)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown 

{'Classification Tree': 0.5323684210526316, 'Logistic Regression': 0.5989912280701754, 'KNN': 0.42370614035087717, 'Random Forest': 0.43624999999999997}
