# Beta Bank Customer Retention

## Introduction



In [228]:
# Import libraries
import pandas as pd
import numpy as np
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.utils import resample

from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=UserWarning)

In [229]:
# Read the data
data = pd.read_csv('https://practicum-content.s3.us-west-1.amazonaws.com/datasets/Churn.csv')

# Show the data
data.info()
display(data.sample(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           9091 non-null   float64
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
1791,1792,15776467,De Salis,702,Spain,Female,35,8.0,14262.8,2,1,0,54689.16,0
1246,1247,15721189,Kung,666,France,Female,66,7.0,0.0,2,1,1,99792.82,0
5746,5747,15751131,Moss,836,Spain,Female,41,7.0,150302.84,1,1,1,156036.19,0
4499,4500,15773322,Obiajulu,536,Germany,Female,44,4.0,121898.82,1,0,0,131007.18,0
7697,7698,15637315,Melvin,601,Spain,Female,41,3.0,0.0,2,1,0,54342.83,0
5262,5263,15659194,Mishina,628,France,Male,30,8.0,89182.09,1,1,1,13126.9,0
8872,8873,15626475,Gamble,685,France,Male,30,2.0,0.0,2,1,1,140889.32,0
3471,3472,15743582,T'ang,632,France,Female,27,,107375.82,1,1,1,62703.38,0
4818,4819,15588587,Stetson,752,France,Female,36,1.0,86837.95,1,1,1,105280.55,0
4098,4099,15621267,Ejimofor,637,France,Male,32,,0.0,1,0,0,148769.08,0


In [230]:
# Check for duplicates
print(data.duplicated().sum())

0


In [231]:
# Check for missing values
print(data.isnull().sum())

RowNumber            0
CustomerId           0
Surname              0
CreditScore          0
Geography            0
Gender               0
Age                  0
Tenure             909
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
dtype: int64


In [232]:
# Fill missing values in 'Tenure' with the median value
data['Tenure'].fillna(data['Tenure'].median(), inplace=True)

# Check to see if it's save to convert 'Tenure' from float to int. If so, then convert it.
if np.array_equal(data['Tenure'], data['Tenure'].astype('int')):
    data['Tenure'] = data['Tenure'].astype('int')

print(data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int32  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int32(1), int64(8), object(3)
memory usage: 1.0+ MB
None


In [233]:
# Drop the columns that are not needed for the model
data = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

display(data.sample(10))

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
3318,629,France,Female,40,6,0.0,2,1,1,139356.3,0
9519,610,France,Female,27,2,0.0,2,1,0,14546.76,0
4885,562,Spain,Male,41,5,165445.04,2,1,0,85787.31,0
5482,603,Spain,Male,46,2,0.0,2,1,0,174478.54,0
8398,767,Germany,Female,45,5,132746.2,2,1,0,26628.88,1
4208,447,France,Female,44,5,89188.83,1,1,1,75408.24,0
9101,597,Spain,Male,38,6,115702.67,2,1,1,25059.05,0
6962,677,Germany,Female,26,3,102395.79,1,1,0,119368.99,0
4792,756,Spain,Male,19,5,130274.22,1,1,1,133535.29,0
224,671,Germany,Male,45,6,99564.22,1,1,1,108872.45,1


I dropped these columns since they do not contribute to the model's prediction of customer churn. For RowNumber is an index column that does not provide meaningful information for the model. CustomerId is a unique identifier for each customer. Including this in the model could associate specific outcomes to the individual customer IDs and may not work well with unseen data. Surname is the customer's last name, which will probably not have influence towards their likelihood to churn.

In [234]:
# Convert categorical data into numerical data
data = pd.get_dummies(data, drop_first=True)

display(data.sample(10))

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
9277,633,29,7,0.0,1,1,1,130224.73,0,False,False,True
7651,431,45,5,83624.55,2,0,0,36899.62,0,True,False,True
3695,571,40,10,112896.86,1,1,1,121402.53,0,False,False,True
7118,715,32,8,175307.32,1,1,0,187051.23,0,False,False,True
2079,562,45,6,136855.24,1,1,0,46864.0,0,False,False,True
5258,728,38,1,115934.74,1,1,1,139059.05,0,False,False,True
5773,523,36,8,113680.54,1,0,0,13197.44,0,False,True,False
8714,703,41,6,109941.51,1,1,0,116267.28,0,False,False,False
3211,686,27,1,115095.88,2,0,0,78622.46,0,True,False,False
1209,850,32,7,0.0,2,0,0,155227.0,0,False,False,False


In [235]:
# Split the data into features and target
# The 'Exited' column is the target, the rest are features
features = data.drop('Exited', axis=1)
target = data['Exited']

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.2, random_state=42)

In [236]:
# Examine the balance of classes
class_counts = target.value_counts()
print(class_counts)


Exited
0    7963
1    2037
Name: count, dtype: int64


In [237]:
# Calculate the imbalance ratio
imbalance_ratio = class_counts[0] / class_counts[1]
print(f'Imbalance Ratio: {imbalance_ratio}')

Imbalance Ratio: 3.9091801669121256


In [238]:
# Train a Logistic Regression model without considering the imbalance
model = LogisticRegression(random_state=42)
model.fit(features_train, target_train)

In [239]:
# Make predictions on the test set
predictions = model.predict(features_test)

In [240]:
# Evaluate the model
print(classification_report(target_test, predictions))

              precision    recall  f1-score   support

           0       0.81      0.98      0.89      1607
           1       0.45      0.07      0.12       393

    accuracy                           0.80      2000
   macro avg       0.63      0.53      0.51      2000
weighted avg       0.74      0.80      0.74      2000



In [241]:
# Upsampling
# Separate majority and minority classes
data_majority = data[data.Exited==0]
data_minority = data[data.Exited==1]

In [242]:
# Upsample minority class
data_minority_upsampled = resample(data_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(data_majority),    # to match majority class
                                 random_state=42) # reproducible results

In [243]:
# Combine majority class with upsampled minority class
data_upsampled = pd.concat([data_majority, data_minority_upsampled])

# Display new class counts
print(data_upsampled.Exited.value_counts())

Exited
0    7963
1    7963
Name: count, dtype: int64


In [244]:
# Train a model using the upsampled data
# Separate input features (features) and target variable (target)
target_upsampled = data_upsampled.Exited
features_upsampled = data_upsampled.drop('Exited', axis=1)

In [245]:
# Perform train-test split
features_train, features_test, target_train, target_test = train_test_split(
    features_upsampled, target_upsampled, test_size=0.2, random_state=42)

In [246]:
# Train model
clf_1 = LogisticRegression().fit(features_train, target_train)

# Predict on test set
pred_y_1 = clf_1.predict(features_test)

In [247]:
# Is our model still predicting just one class?
print(np.unique(pred_y_1))

# How's our accuracy?
print(accuracy_score(target_test, pred_y_1))

# What about AUROC?
prob_y_1 = clf_1.predict_proba(features_test)
prob_y_1 = [p[1] for p in prob_y_1]
print(roc_auc_score(target_test, prob_y_1))

[0 1]
0.6603892027620841
0.7108317027946743


In [248]:
# Downsample majority class
data_majority_downsampled = resample(data_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=len(data_minority),     # to match minority class
                                 random_state=42) # reproducible result

In [249]:
# Combine minority class with downsampled majority class
data_downsampled = pd.concat([data_majority_downsampled, data_minority])

# Display new class counts
print(data_downsampled.Exited.value_counts())

Exited
0    2037
1    2037
Name: count, dtype: int64


In [250]:
# Train model using the downsampled data
# Separate input features (features) and target variable (target)
target_downsampled = data_downsampled.Exited
features_downsampled = data_downsampled.drop('Exited', axis=1)

In [251]:
# Perform train-test split
features_train, features_test, target_train, target_test = train_test_split(
    features_downsampled, target_downsampled, test_size=0.2, random_state=42)

In [252]:
# Train model
clf_2 = LogisticRegression().fit(features_train, target_train)

# Predict on test set
pred_y_2 = clf_2.predict(features_test)

In [253]:
# Is our model still predicting just one class?
print(np.unique(pred_y_2))

# How's our accuracy?
print(accuracy_score(target_test, pred_y_2))

# What about AUROC?
prob_y_2 = clf_2.predict_proba(features_test)
prob_y_2 = [p[1] for p in prob_y_2]
print(roc_auc_score(target_test, prob_y_2))

[0 1]
0.6478527607361964
0.6913760042719485


## Testing

In [254]:
# Make predictions on the test set using both models
pred_y_1 = clf_1.predict(features_test)
pred_y_2 = clf_2.predict(features_test)

In [255]:
# Calculate F1 score for both models
f1_score_1 = f1_score(target_test, pred_y_1)
f1_score_2 = f1_score(target_test, pred_y_2)

In [256]:
# Calculate AUC-ROC for both models
roc_auc_score_1 = roc_auc_score(target_test, pred_y_1)
roc_auc_score_2 = roc_auc_score(target_test, pred_y_2)

In [257]:
# Print the scores
print(f'Upsampled Model: F1 Score = {f1_score_1}, AUC-ROC = {roc_auc_score_1}')
print(f'Downsampled Model: F1 Score = {f1_score_2}, AUC-ROC = {roc_auc_score_2}')

Upsampled Model: F1 Score = 0.6341463414634145, AUC-ROC = 0.6514023398626181
Downsampled Model: F1 Score = 0.6408010012515645, AUC-ROC = 0.6510837641690332
