In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the training and testing CSV files, then stack them (training rows first)
# into a single frame with a fresh consecutive index.
training_dataset = pd.read_csv('dataset/customer_churn_dataset-training-master.csv')
testing_dataset = pd.read_csv('dataset/customer_churn_dataset-testing-master.csv')
df = pd.concat(
    [training_dataset, testing_dataset],
    ignore_index=True,
)

# Preview the first 50 rows of the combined frame
df.head(n=50)

In [None]:
# Display the label column (Churn) on its own
df.loc[:, 'Churn']

After looking through the dataset, we found that row 199295 has missing values in every column, so this row can be removed.

In [None]:
# Drop rows in which every column is missing: they carry no information, and the
# stratified splits and the LogisticRegression fitted further down do not accept NaN.
# Reassigning (instead of mutating in place) keeps the cell idempotent on re-runs.
df = df.dropna(how='all')

#Recheck for null values: number of missing entries per column
nan_count = df.isnull().sum(axis=0)
nan_count

In [None]:
# Remove the CustomerID column before modelling (presumably a pure row identifier).
# Reassigning with errors="ignore" instead of inplace=True keeps the cell idempotent:
# running it a second time no longer raises KeyError for the already-removed column.
df = df.drop(columns=["CustomerID"], errors="ignore")

In [None]:
# sns.lineplot(x=df['Age'], y=df['Churn'])
# sns.lineplot(x=training_dataset['Age'], y=training_dataset['Churn'])

In [None]:
#Identifying correlations with the label
# Correlation is only defined for numeric columns. Selecting them explicitly avoids the
# ValueError that pandas >= 2.0 raises when corr() encounters a non-numeric column, and
# matches what older pandas versions did implicitly.
corr_matrix = df.select_dtypes(include='number').corr().round(5)
corrs = corr_matrix['Churn']
# Strongest positive correlation first (Churn itself is at the top with 1.0)
corrs_sorted = corrs.sort_values(ascending=False)
corrs_sorted



In [None]:
# #Visualize the top two correlated features
# df_sample = df.sample(n=30000)
# top_two_corr = list(corrs_sorted[2:4].index)
# df_corrs_sample = df_sample[top_two_corr].copy()
# df_corrs_sample['Churn'] = df_sample['Churn']
# sns.pairplot(data=df_corrs_sample, kind='kde', corner=True)
# #ASK TA about this

In [None]:
# filter_df = df[df['Age']>50]

# filter_df
# filter_df.hist(column='Churn')

# training_dataset[training_dataset['Age']>50].hist(column='Churn')

In [None]:
# Min-max normalization (scaling values between 0 and 1) is currently disabled;
# the draft below is kept commented out and does not run.
# scaler = MinMaxScaler()
# df = scaler.fit_transform(df)
# df_norm_test = pd.DataFrame(df_norm_test, columns=df_testing.columns)
# The only live statement: show the (rows, columns) dimensions of the combined frame.
df.shape

Splitting the concatenated data into 80/20 training and testing sets


In [None]:
# Split the concatenated data into 80% training / 20% testing (not stratified)
from sklearn.model_selection import train_test_split

# Features are every column except the Churn label
X = df.drop(columns='Churn')
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

In [None]:
# Fit a logistic-regression baseline on the training split
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Class probabilities feed the log loss; hard labels feed the other metrics
probabilities = model.predict_proba(X_test)
predictions = model.predict(X_test)

acc_score = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
lg_loss = log_loss(y_test, probabilities)

print(f'Log Loss: {lg_loss}, accuracy score: {acc_score}, precision: {precision}, recall: {recall}')
# F1 is the harmonic mean of precision and recall
print('F1 Score: ' + str((2*precision*recall)/(precision+recall)))


In [None]:
# Tabulate every feature next to the weight the fitted logistic regression assigned to it
feature_names = X_train.columns
coefficients = model.coef_[0]  # first row of the fitted coefficient matrix

coef_df = pd.DataFrame(
    {
        'Feature': feature_names,
        'Coefficient': coefficients,
    }
)

print(coef_df)

In [None]:
#Histograms of churn and no churn in the newly split training dataset
y_train_named = y_train.rename('Churn')
training_data = pd.concat([X_train, y_train_named], axis=1)

features = training_data.drop(columns=['Churn']).columns

# One figure per feature, overlaying the density-normalised histograms of both classes.
# The figure is created inside the loop only: a plt.figure() call before the loop is
# never drawn on and just adds a stray empty-figure output to the cell.
for feature in features:
    plt.figure(figsize=(6, 3))

    for churn_value, color, label in [(0, 'blue', 'No Churn'), (1, 'red', 'Churn')]:
        class_values = training_data.loc[training_data['Churn'] == churn_value, feature]
        sns.histplot(class_values, color=color, label=label, kde=False, stat="density", bins=30, alpha=0.5)

    plt.title(f'Distribution of {feature} for Churn vs No Churn')
    plt.xlabel(feature)
    plt.ylabel('Density')
    plt.legend()

    plt.show()

Stratifying the 80/20 training and testing split

In [None]:
from sklearn.model_selection import train_test_split

# Same 80/20 split as before, but stratified on the label so that both parts
# keep the class proportions of the full data.
X = df.drop(columns='Churn')
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1234,
)

In [None]:
# Check class imbalance: share of each Churn value among the training labels
# (normalize=True returns fractions instead of raw counts).
y_train.value_counts(normalize=True)



In [None]:
# Refit the logistic regression, this time on the stratified training split
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Class probabilities feed the log loss; hard labels feed the other metrics
probabilities = model.predict_proba(X_test)
predictions = model.predict(X_test)

acc_score = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
lg_loss = log_loss(y_test, probabilities)

print(f'Log Loss: {lg_loss}, accuracy score: {acc_score}, precision: {precision}, recall: {recall}')
# F1 is the harmonic mean of precision and recall
print('F1 Score: ' + str((2*precision*recall)/(precision+recall)))


In [None]:
# Pair each feature with the coefficient learned by the refitted logistic regression
feature_names = X_train.columns
coefficients = model.coef_[0]  # first row of the fitted coefficient matrix

coef_df = pd.DataFrame(
    {
        'Feature': feature_names,
        'Coefficient': coefficients,
    }
)

print(coef_df)

In [None]:
#Histograms of churn and no churn in the newly split training dataset
y_train_named = y_train.rename('Churn')
training_data = pd.concat([X_train, y_train_named], axis=1)

features = training_data.drop(columns=['Churn']).columns

# One figure per feature, overlaying the density-normalised histograms of both classes.
# The figure is created inside the loop only: a plt.figure() call before the loop is
# never drawn on and just adds a stray empty-figure output to the cell.
for feature in features:
    plt.figure(figsize=(6, 3))

    for churn_value, color, label in [(0, 'blue', 'No Churn'), (1, 'red', 'Churn')]:
        class_values = training_data.loc[training_data['Churn'] == churn_value, feature]
        sns.histplot(class_values, color=color, label=label, kde=False, stat="density", bins=30, alpha=0.5)

    plt.title(f'Distribution of {feature} for Churn vs No Churn')
    plt.xlabel(feature)
    plt.ylabel('Density')
    plt.legend()

    plt.show()

In [None]:
#Histograms of churn and no churn in the newly split testing dataset
y_test_named = y_test.rename('Churn')
testing_data = pd.concat([X_test, y_test_named], axis=1)

features = testing_data.drop(columns=['Churn']).columns

# One figure per feature, overlaying the density-normalised histograms of both classes.
# The figure is created inside the loop only: a plt.figure() call before the loop is
# never drawn on and just adds a stray empty-figure output to the cell.
for feature in features:
    plt.figure(figsize=(6, 3))

    for churn_value, color, label in [(0, 'blue', 'No Churn'), (1, 'red', 'Churn')]:
        class_values = testing_data.loc[testing_data['Churn'] == churn_value, feature]
        sns.histplot(class_values, color=color, label=label, kde=False, stat="density", bins=30, alpha=0.5)

    plt.title(f'Distribution of {feature} for Churn vs No Churn')
    plt.xlabel(feature)
    plt.ylabel('Density')
    plt.legend()

    plt.show()

Creating a decision tree model.

In [None]:
def get_stats(y_test, y_pred, probabilities=None):
    """Print evaluation metrics for a binary classifier.

    Parameters
    ----------
    y_test : array-like
        True labels of the evaluation set.
    y_pred : array-like
        Hard class labels predicted by the model.
    probabilities : array-like, optional
        Predicted class probabilities, forwarded to ``log_loss``. When omitted
        (for models that only produce hard labels) the log loss is left out of
        the report instead of the call failing.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    acc_score = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    summary = f'accuracy score: {acc_score}, precision: {precision}, recall: {recall}'
    if probabilities is not None:
        lg_loss = log_loss(y_test, probabilities)
        # Same line layout as before whenever probabilities are supplied
        summary = f'Log Loss: {lg_loss}, ' + summary
    print(summary)

    # F1 is the harmonic mean of precision and recall; report 0.0 rather than dividing
    # by zero when the model produced no true positives (precision == recall == 0).
    denominator = precision + recall
    f1 = (2*precision*recall)/denominator if denominator else 0.0
    print('F1 Score: ' + str(f1))

In [None]:
from sklearn.tree import DecisionTreeClassifier

X = df.drop(columns='Churn')
y = df['Churn']

# 60/40 split, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=1234,
)

# Depth- and leaf-size-limited tree, fitted on the training part
dt = DecisionTreeClassifier(
    criterion='log_loss',
    max_depth=7,
    min_samples_leaf=4,
    random_state=1234,
).fit(X_train, y_train)

probabilities = dt.predict_proba(X_test)
y_pred = dt.predict(X_test)

get_stats(y_test, y_pred, probabilities)




In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values: tree depth and minimum number of samples per leaf
param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [2, 4],
}

# Exhaustive search over the grid, scored by accuracy with 5-fold cross-validation
# on the training split
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
).fit(X_train, y_train)

# Winning parameter combination and the estimator refitted with it
best_params, best_model = grid_search.best_params_, grid_search.best_estimator_

print(f'{best_params}, {best_model}')

In [None]:
# Column labels of the feature matrix X; used by the following cells to label
# the text and graphical exports of the decision tree.
feature_names = X.columns
feature_names

In [None]:
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Print the fitted decision tree as indented text rules.
# feature_names is a pandas Index; older scikit-learn releases truth-test this argument
# inside export_text, which raises "truth value of an Index is ambiguous". A plain list
# of column names is accepted by every release.
text_representation = tree.export_text(dt, feature_names=list(feature_names))
print(text_representation)

In [None]:
# Draw the fitted tree on a very wide canvas (presumably so the individual nodes stay legible)
fig = plt.figure(figsize=(200, 50))
_ = tree.plot_tree(
    dt,
    filled=True,
    class_names=['No Churn', 'Churn'],
    feature_names=feature_names,
)

Trying a Random forest model now.

In [None]:
from sklearn.ensemble import RandomForestClassifier

y = df['Churn']
X = df.drop(columns='Churn')

# 60/40 split, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,stratify=y,random_state=1234)

# A random forest bootstraps rows and samples features at random, so without a fixed
# random_state the reported metrics change on every run. Use the same seed as the
# splits and the decision tree so the cell is reproducible.
rf = RandomForestClassifier(criterion='log_loss', n_estimators=20, random_state=1234)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
probabilities = rf.predict_proba(X_test)

get_stats(y_test, y_pred, probabilities)

Hist Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

y = df['Churn']
X = df.drop(columns='Churn')

# 60/40 split, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,stratify=y,random_state=1234)

# random_state pins the estimator's internal randomness (e.g. the validation split it
# holds out when early stopping is active), so repeated runs report the same metrics.
# The seed matches the one used for the splits and the other models.
hgbc = HistGradientBoostingClassifier(loss='log_loss', max_iter=100, random_state=1234)
hgbc.fit(X_train, y_train)

y_pred = hgbc.predict(X_test)
probabilities = hgbc.predict_proba(X_test)

get_stats(y_test, y_pred, probabilities)

In [None]:
import xgboost as xgb

y = df['Churn']
X = df.drop(columns='Churn')

# 60/40 split, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,stratify=y,random_state=1234)

xgb_clf = xgb.XGBClassifier(
    objective='binary:hinge',
    eval_metric='auc',
    # Alternative settings kept for later tuning:
    # 'objective': 'binary:logistic',
    # 'eval_metric': 'auc',
    # 'eta': 0.1,
    # 'max_depth': 6,
    # 'min_child_weight': 1,
    # 'subsample': 0.8,
    # 'colsample_bytree': 0.8,
    # 'lambda': 1.0,
    # 'alpha': 0.0,
    # 'n_estimators': 500
)

xgb_clf.fit(X_train, y_train)

y_pred = xgb_clf.predict(X_test)

# The binary:hinge objective predicts hard 0/1 labels rather than probabilities, so a
# log loss is not meaningful here. Report only the label-based metrics, computed
# directly so the cell does not fail on a missing probabilities argument.
acc_score = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f'accuracy score: {acc_score}, precision: {precision}, recall: {recall}')

# F1 is the harmonic mean of precision and recall; 0.0 when both are zero
pr_sum = precision + recall
print('F1 Score: ' + str((2*precision*recall)/pr_sum if pr_sum else 0.0))