In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score,precision_score, f1_score
from imblearn.over_sampling import SMOTE


In [None]:
file_url = "https://github.com/Yihe-Harry/DSA3101-Group-Project/tree/Subgroup-B/GrpB_models/B5_Marcus/Churn_Modelling.csv"

df = pd.read_csv(file_url)
df = df.drop(columns=['RowNumber','CustomerId','Surname'])
#print(df.head())

df= pd.get_dummies(df,columns=["Geography","Gender"],drop_first=True)
#print(df.head())

label_counts = df['Exited'].value_counts()
total_samples = len(df)
class_percentages = (label_counts / total_samples) * 100
print("\nClass Percentages:\n", class_percentages)


x = df.drop(columns='Exited')
y = df['Exited']


x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=3101)

Mounted at /content/drive

Class Percentages:
 Exited
0    79.63
1    20.37
Name: count, dtype: float64


# Log regression

In [15]:
param_grid = {
    'C': [ 0.01, 0.1, 1, 10, 100],  # inverse of regularization strength
    'penalty': ['l1', 'l2'],  # regularization types
    'solver': ['liblinear'],
    'max_iter': [500, 1000, 2000]  #iterations for convergence
}

log_reg = LogisticRegression(random_state=3101)


grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=10, scoring='recall', verbose=1, n_jobs=-1)


grid_search.fit(x_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(best_params)
print(f"Best cross-validation accuracy: {best_score:.4f}")


best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)


accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)



print(accuracy)
print(recall)
print(f1)

Fitting 10 folds for each of 30 candidates, totalling 300 fits
{'C': 10, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validation accuracy: 0.2124
0.8185
0.22305764411027568
0.3290203327171904


# Log regression with SMOTE

In [16]:
smote = SMOTE(sampling_strategy=0.5, random_state=3101)
x_resampled, y_resampled = smote.fit_resample(x_train, y_train)

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # inverse of regularization strength
    'penalty': ['l1', 'l2'],  # regularization types
    'solver': ['liblinear'],  # solvers
    'max_iter': [500, 1000, 2000]  #iterations for convergence
}

log_reg = LogisticRegression(random_state=3101)


grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=10, scoring='recall', verbose=1, n_jobs=-1)


grid_search.fit(x_resampled, y_resampled)


best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(best_params)
print(f"Best cross-validation accuracy: {best_score:.4f}")


best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)


accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(accuracy)
print(recall)
print(f1)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
{'C': 10, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validation accuracy: 0.5464
0.794
0.42857142857142855
0.4535809018567639


# Random forest

In [None]:
rf = RandomForestClassifier(class_weight='balanced', random_state=3101)

param_grid = {
    'n_estimators': [50, 100, 200],  #number of trees
    'max_depth': [None, 10, 20],  #tree depth
    'min_samples_split': [2, 5, 10],  #min samples to split a node
    'min_samples_leaf': [1, 2, 5]  #min samples per leaf
}

grid_search = GridSearchCV(rf, param_grid, cv=10, scoring='recall', n_jobs=-1)
grid_search.fit(x_train, y_train)

best_rf = grid_search.best_estimator_

cv_scores = cross_val_score(best_rf, x_train, y_train, cv=10, scoring='recall')
#print(cv_scores.mean())

y_pred = best_rf.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
recall = recall_score(y_test, y_pred)
print(recall)


0.8365
0.6741854636591479


# Random forest with SMOTE

In [None]:
smote = SMOTE(sampling_strategy=0.5, random_state=3101)
x_resampled, y_resampled = smote.fit_resample(x_train, y_train)


rf = RandomForestClassifier(class_weight={0:1, 1:4}, random_state=3101)

param_grid = {
    'n_estimators': [500, 700],  #number of trees
    'max_depth': [10, 20],  #tree depth
    'min_samples_split': [10, 15],  #min samples to split a node
    'min_samples_leaf': [1, 2, 5],  #min samples per leaf
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(rf, param_grid, cv=10, scoring='recall', n_jobs=-1)
grid_search.fit(x_resampled, y_resampled)

best_rf = grid_search.best_estimator_

cv_scores = cross_val_score(best_rf, x_resampled, y_resampled, cv=10, scoring='recall')
#print(cv_scores.mean())

y_pred = best_rf.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
recall = recall_score(y_test, y_pred)
print(recall)

0.7495
0.7819548872180451


# Most important features

In [None]:
print(grid_search.best_params_)
importances = best_rf.feature_importances_
features = x_resampled.columns
feature_importances = pd.DataFrame({"Feature": features, "Importance": importances})
print(feature_importances.sort_values(by="Importance", ascending=False))


{'bootstrap': False, 'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 15, 'n_estimators': 700}
              Feature  Importance
1                 Age    0.314612
4       NumOfProducts    0.196255
6      IsActiveMember    0.131309
3             Balance    0.114633
8   Geography_Germany    0.072872
7     EstimatedSalary    0.053858
0         CreditScore    0.046049
2              Tenure    0.033580
9     Geography_Spain    0.013899
5           HasCrCard    0.012520
10        Gender_Male    0.010414


In [3]:
# Define a function to add interaction features to any dataset
def create_interaction_features(df):
    df['Age_Balance'] = df['Age'] * df['Balance']
    df['Age_NumOfProducts'] = df['Age'] * df['NumOfProducts']
    df['Age_IsActiveMember'] = df['Age'] * df['IsActiveMember']
    df['Balance_NumOfProducts'] = df['Balance'] * df['NumOfProducts']
    df['Balance_IsActiveMember'] = df['Balance'] * df['IsActiveMember']
    df['NumOfProducts_IsActiveMember'] = df['NumOfProducts'] * df['IsActiveMember']
    return df

# Create interaction features for both the training and test data
x_train = create_interaction_features(x_train)
x_test = create_interaction_features(x_test)


# Now, apply SMOTE to the training data (x_train)
smote = SMOTE(sampling_strategy=0.5, random_state=3101)
x_resampled, y_resampled = smote.fit_resample(x_train, y_train)




# Random forest with interaction terms

In [4]:
# Initialize the RandomForest model
rf = RandomForestClassifier(class_weight={0:1, 1:5}, random_state=3101)

# Set up the parameter grid for grid search
param_grid = {
    'n_estimators': [700, 1000],
    'max_depth': [5, 10],
    'min_samples_split': [10, 15],
    'min_samples_leaf': [2, 5],
    'bootstrap': [True, False]
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(rf, param_grid, cv=10, scoring='recall', n_jobs=-1)

grid_search.fit(x_resampled, y_resampled)

# Best model from grid search
best_rf = grid_search.best_estimator_

# Evaluate the model on the test set
y_pred = best_rf.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# Print the best parameters found by the grid search
print("Best Parameters:", grid_search.best_params_)


precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Precision:", precision)
print("F1-Score:", f1)




Accuracy: 0.7415
Recall: 0.8095238095238095
Best Parameters: {'bootstrap': False, 'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 15, 'n_estimators': 1000}
Precision: 0.42277486910994766
F1-Score: 0.5554600171969045


In [5]:
feature_importance = best_rf.feature_importances_
features = x_train.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print(importance_df)

                         Feature  Importance
1                            Age    0.185405
12             Age_NumOfProducts    0.145824
4                  NumOfProducts    0.120734
11                   Age_Balance    0.081033
16  NumOfProducts_IsActiveMember    0.077977
8              Geography_Germany    0.062555
13            Age_IsActiveMember    0.056956
14         Balance_NumOfProducts    0.054286
3                        Balance    0.046738
6                 IsActiveMember    0.033626
7                EstimatedSalary    0.031782
0                    CreditScore    0.029307
15        Balance_IsActiveMember    0.028210
2                         Tenure    0.023254
5                      HasCrCard    0.008884
9                Geography_Spain    0.007648
10                   Gender_Male    0.005781


# Getting key insights

In [7]:
def churn_rate_comparison(df, feature, threshold=None, binary=False):
    if binary:
        group_1 = df[df[feature] == 1]
        group_0 = df[df[feature] == 0]
    else:
        threshold = df[feature].median() if threshold is None else threshold
        group_1 = df[df[feature] > threshold]
        group_0 = df[df[feature] <= threshold]

    churn_rate_1 = group_1['Exited'].mean()
    churn_rate_0 = group_0['Exited'].mean()

    print(f"\nChurn Rate for {feature} > {threshold}: {churn_rate_1:.2%}")
    print(f"Churn Rate for {feature} <= {threshold}: {churn_rate_0:.2%}")


churn_rate_comparison(df, 'Age')
churn_rate_comparison(df, 'NumOfProducts')



Churn Rate for Age > 37.0: 32.06%
Churn Rate for Age <= 37.0: 9.08%

Churn Rate for NumOfProducts > 1.0: 12.77%
Churn Rate for NumOfProducts <= 1.0: 27.71%


Older customers were more likely to churn <br>
Customers with more than 1 product is less likely to churn

# production ready

In [None]:
import joblib
joblib.dump(best_rf, 'churn_prediction_model.pkl')


best_rf = joblib.load('churn_prediction_model.pkl')
