In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
import random
import seaborn as sns

from sklearn import svm

from sklearn.linear_model import LogisticRegression

In [None]:
# Load the raw customer table once and keep it untouched as `origDf`;
# every later transformation is applied to the independent copy `bankDf`.
origDf = pd.read_csv('./Bank_Customer.csv')
bankDf = origDf.copy(deep=True)

# Visualization

In [None]:
# Preview the first five rows of the working frame.
bankDf.head(n=5)

In [None]:
# Encode the categorical columns numerically so they can be used in the
# correlation plots and models below.
#
# The frame is rebuilt from the untouched `origDf` so that this cell is
# idempotent: re-running it on an already-encoded `bankDf` would map every
# gender value to NaN and then fail on the already-dropped 'country' column.
bankDf = origDf.copy()

# Binary-encode gender (Female -> 0, Male -> 1).
bankDf['gender'] = bankDf['gender'].map({'Female': 0, 'Male': 1})

# One-hot encode country as integer 0/1 indicator columns (one per country).
bankDfOne = pd.get_dummies(bankDf['country'], dtype=int)

# Replace the text column with its indicator columns (appended on the right).
bankDf = pd.concat([bankDf.drop(columns=['country']), bankDfOne], axis=1)

# Show a preview rather than the entire table, then the resulting dtypes.
display(bankDf.head())
display(bankDf.dtypes)

In [None]:
# Correlation matrix of every column in `bankDf`, drawn as a heatmap with
# each coefficient annotated to two decimals on a diverging colour map.
sns.heatmap(
    data=bankDf.corr(),
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
)

In [None]:
# Same annotated correlation heatmap, with the three country indicator
# columns left out of the matrix.
sns.heatmap(
    data=bankDf.drop(columns=['France', 'Germany', 'Spain']).corr(),
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
)

In [None]:
# Age distribution with exactly one bar per year of age.
#
# Assumes ages are whole years (TODO confirm against the data). The bin edges
# sit on the half-integers (min-0.5, min+0.5, ..., max+0.5) so each age gets
# its own bin. Passing `bins=maxAge-minAge` instead yields one bin too few:
# the last bin is closed on both sides, so the two oldest ages would share a
# bar, and a column with a single distinct age would request 0 bins and fail.
minAge = int(bankDf.age.min())
maxAge = int(bankDf.age.max())

# Kept for any later cell that reads it: the span of observed ages in years.
difference = maxAge - minAge

ageBinEdges = np.arange(minAge, maxAge + 2) - 0.5

plt.hist(bankDf.age, bins=ageBinEdges)
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Number of Customers')
plt.show()

In [None]:
# Count customers for every (gender, churn) pair, then pivot the churn level
# into columns: one row per gender, one column per churn value.
churn_counts = (
    origDf
    .groupby(['gender', 'churn'])
    .size()
    .unstack(level='churn')
)

# Stacked bars per gender, with horizontal tick labels.
churn_counts.plot.bar(stacked=True, rot=0)
plt.xlabel('Gender')
plt.ylabel('Number of Customers')
plt.title('Churn Rate by Gender')
plt.legend(labels=['Not Churned', 'Churned'], title='Churn')
plt.show()

In [None]:
# Histogram of the churn column, to show how the two outcomes are balanced.
plt.hist(x=bankDf['churn'])
plt.xlabel(xlabel='Churn')

In [None]:
# Two-colour palette: first colour for non-churned, second for churned.
palette = ['#1f77b4', '#ff7f0e']

# Human-readable hue labels. Assumes churn is coded 0/1 (TODO confirm).
# Letting seaborn build the legend from these names keeps every legend entry
# tied to the colour it was actually drawn with. Overriding the entries with
# `plt.legend(labels=[...])` pairs labels with artists purely by position,
# which can mislabel or drop the hue entries.
churnLabels = bankDf['churn'].map({0: 'Not Churned', 1: 'Churned'})

plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=bankDf,
    x='age',
    y='gender',
    hue=churnLabels,
    hue_order=['Not Churned', 'Churned'],
    palette=palette,
)
plt.title('Churn Status by Age and Gender')
plt.xlabel('Age')
plt.ylabel('Gender')
# Only the legend title is changed; handles and labels come from seaborn.
ax.legend(title='Churn')
plt.show()

In [None]:
# Smallest and largest account balance, shown one after the other.
display(min(bankDf['balance']), max(bankDf['balance']))

# Distribution of balances using matplotlib's default binning.
plt.hist(x=bankDf['balance'])
plt.xlabel(xlabel='Balance')

# Models

In [None]:
# Z-score the three candidate features in place: subtract the column mean and
# divide by the column standard deviation (pandas' default `Series.std`).
for featureName in ('age', 'gender', 'balance'):
    featureValues = bankDf[featureName]
    bankDf[featureName] = (featureValues - featureValues.mean()) / featureValues.std()

Logistic Regression

In [None]:
# Feature matrix (standardised age and gender) and binary churn target used
# by this and the following model cells.
X = bankDf[['age', 'gender']]
y = bankDf.churn

# Logistic regression with class 1 weighted four times as heavily as class 0.
# (The original `lm = lm = ...` chained assignment was redundant.)
lm = LogisticRegression(class_weight={0:0.2, 1:0.8})
lm.fit(X, y)

# NOTE: predictions are made on the same rows the model was fitted on, so the
# scores below are training-set metrics, not an estimate of generalisation.
y_pred = lm.predict(X)

display(y_pred)

# Sanity check that both classes are actually being predicted.
display('predicted range: [{0:.2f},{1:.2f}]'.format(min(y_pred), max(y_pred)))

# Per-class scores, ordered as [class 0, class 1].
p,r,f,s = precision_recall_fscore_support(y, y_pred, labels=[0,1])
display('precision = {}'.format(p))
display('recall = {}'.format(r))
display('f-score = {}'.format(f))

SVM

In [None]:
# Degree-2 polynomial-kernel SVM on the original (imbalanced) data.
clf = svm.SVC(kernel='poly', degree = 2)

# Fit on the DataFrame itself so that fit and predict see the same input type.
# Fitting on `X.values` and then predicting on `X` makes scikit-learn warn
# that X has feature names but the estimator was fitted without them.
clf.fit(X, y)

# Training-set predictions and per-class F-scores.
y_pred = clf.predict(X)
p,r,f,s = precision_recall_fscore_support(y, y_pred)
display('f-score = {}'.format(f))

Oversampling the Data to Balance the Classes

Polynomial Kernel

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Balance the two classes by random oversampling with a fixed seed; the
# resampled arrays are reused by the model cells that follow.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Degree-2 polynomial-kernel SVM fitted on the balanced data
# (`fit` returns the estimator itself).
clf = svm.SVC(kernel='poly', degree=2).fit(X_resampled, y_resampled)

# Scores are computed on the same balanced rows the model was fitted on.
y_pred_resampled = clf.predict(X_resampled)
p, r, f, s = precision_recall_fscore_support(y_resampled, y_pred_resampled)
display('Precision: {}, Recall: {}, f-score = {}, Support: {}'.format(p, r, f, s))

In [None]:
# Logistic regression on the oversampled data, still weighting class 1 four
# times as heavily as class 0.
# (The original `lm = lm = ...` chained assignment was redundant.)
lm = LogisticRegression(class_weight={0:0.2, 1:0.8})
lm.fit(X_resampled, y_resampled)

# Predictions on the same balanced rows used for fitting (training metrics).
y_pred_resampled = lm.predict(X_resampled)

display(y_pred_resampled)

# Sanity check that both classes are actually being predicted.
display('predicted range: [{0:.2f},{1:.2f}]'.format(min(y_pred_resampled), max(y_pred_resampled)))

# Per-class scores, ordered as [class 0, class 1].
p,r,f,s = precision_recall_fscore_support(y_resampled, y_pred_resampled, labels=[0,1])
display('Precision: {}, Recall: {}, f-score = {}, Support: {}'.format(p,r,f,s))

RBF Kernel

In [None]:
# RBF-kernel SVM with both classes weighted equally, fitted on the
# oversampled data (`fit` returns the estimator itself).
clf = svm.SVC(kernel='rbf', class_weight={0: 1, 1: 1}).fit(X_resampled, y_resampled)

# Per-class scores on the same rows the model was fitted on.
y_pred_resampled = clf.predict(X_resampled)
p, r, f, s = precision_recall_fscore_support(y_resampled, y_pred_resampled)
display('Precision: {}, Recall: {}, f-score = {}, Support: {}'.format(p, r, f, s))

Different Class Weights

In [None]:
# Rebuild the oversampled data with the same fixed seed as before.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Degree-2 polynomial SVM, now weighting class 1 twice as heavily as class 0
# (`fit` returns the estimator itself).
clf = svm.SVC(kernel='poly', degree=2, class_weight={0: 1, 1: 2}).fit(X_resampled, y_resampled)

# Per-class scores on the same rows the model was fitted on.
y_pred_resampled = clf.predict(X_resampled)
p, r, f, s = precision_recall_fscore_support(y_resampled, y_pred_resampled)
display('Precision: {}, Recall: {}, f-score = {}, Support: {}'.format(p, r, f, s))

In [None]:
# RBF-kernel SVM weighting class 1 twice as heavily as class 0, fitted on the
# oversampled data (`fit` returns the estimator itself).
clf = svm.SVC(kernel='rbf', class_weight={0: 1, 1: 2}).fit(X_resampled, y_resampled)

# Per-class scores on the same rows the model was fitted on.
y_pred_resampled = clf.predict(X_resampled)
p, r, f, s = precision_recall_fscore_support(y_resampled, y_pred_resampled)
display('Precision: {}, Recall: {}, f-score = {}, Support: {}'.format(p, r, f, s))

# Best Model:

In [None]:
# Model selected as best: RBF-kernel SVM with equal class weights, refitted
# on the oversampled data (`fit` returns the estimator itself).
clf = svm.SVC(kernel='rbf', class_weight={0: 1, 1: 1}).fit(X_resampled, y_resampled)

# Per-class scores on the same rows the model was fitted on.
y_pred_resampled = clf.predict(X_resampled)
p, r, f, s = precision_recall_fscore_support(y_resampled, y_pred_resampled)
display('Precision: {}, Recall: {}, f-score = {}, Support: {}'.format(p, r, f, s))

Generalization: evaluating the best model on a held-out test split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# Hold-out evaluation of the best model (RBF SVM, equal class weights).
#
# The split is made on the ORIGINAL data, before any oversampling, and is
# stratified so both parts keep the original churn ratio. Splitting the
# already-oversampled data instead puts duplicated minority rows in both the
# training and the test part, so the model is scored on rows it has seen and
# the test metrics come out too optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance only the training part; the test part keeps its original
# class distribution.
X_train, y_train = RandomOverSampler(random_state=42).fit_resample(X_train, y_train)

# Fresh estimator so nothing fitted in an earlier cell carries over.
clf = svm.SVC(kernel='rbf', class_weight={0:1, 1:1})
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Per-class scores on the untouched test rows, ordered as [class 0, class 1].
p,r,f,s = precision_recall_fscore_support(y_test, y_pred)
display('Precision: {}, Recall: {}, f-score = {}, Support: {}'.format(p,r,f,s))
