# Bank customer churn forecast

**The goal of this project is to predict whether a customer will leave the bank, based on customer behavior data.**

# Results

- F1 score of the final model on the test set: 0.61
- ROC AUC on the test set: 0.83
- This means that the model predicts customer churn reasonably well and could benefit the business.

## Data understanding

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle

In [2]:
# Read the raw churn dataset and preview its first rows.
churn_csv_path = '/datasets/Churn.csv'
data = pd.read_csv(churn_csv_path)
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


## Preprocessing

In [3]:
# Rename columns to snake_case by name rather than by position, so a change
# in the CSV's column order cannot silently attach a label to the wrong column.
data = data.rename(columns={
    'RowNumber': 'row_number',
    'CustomerId': 'customer_id',
    'Surname': 'surname',
    'CreditScore': 'credit_score',
    'Geography': 'geography',
    'Gender': 'gender',
    'Age': 'age',
    'Tenure': 'tenure',
    'Balance': 'balance',
    'NumOfProducts': 'num_of_products',
    'HasCrCard': 'has_cr_card',
    'IsActiveMember': 'is_active_member',
    'EstimatedSalary': 'estimated_salary',
    'Exited': 'exited',
})

In [4]:
# Display the column labels to confirm the renaming.
data.columns

Index(['row_number', 'customer_id', 'surname', 'credit_score', 'geography',
       'gender', 'age', 'tenure', 'balance', 'num_of_products', 'has_cr_card',
       'is_active_member', 'estimated_salary', 'exited'],
      dtype='object')

In [5]:
# Remove the row index, customer id and surname columns before modelling.
id_columns = ['row_number', 'customer_id', 'surname']
data = data.drop(columns=id_columns)

In [7]:
# Number of missing values in each column.
data.isnull().sum()

credit_score          0
geography             0
gender                0
age                   0
tenure              909
balance               0
num_of_products       0
has_cr_card           0
is_active_member      0
estimated_salary      0
exited                0
dtype: int64

In [8]:
# Fill the missing tenure values with the column median.
# NOTE(review): the median is taken over the full dataset, before the
# train/validation/test split, so held-out rows influence the fill value.
tenure_median = data['tenure'].median()
data['tenure'] = data['tenure'].fillna(tenure_median)

In [9]:
# One-hot encode the categorical columns; drop_first=True drops the first
# level of each so the dummy columns are not perfectly collinear.
data_ohe = pd.get_dummies(
    data,
    drop_first=True,
)

In [10]:
# Preview the encoded frame.
data_ohe.head(5)

Unnamed: 0,credit_score,age,tenure,balance,num_of_products,has_cr_card,is_active_member,estimated_salary,exited,geography_Germany,geography_Spain,gender_Male
0,619,42,2.0,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1.0,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8.0,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1.0,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2.0,125510.82,1,1,1,79084.1,0,0,1,0


In [11]:
# Separate the label ('exited') from the predictor columns.
target = data_ohe['exited']
features = data_ohe.drop(columns=['exited'])

In [12]:
# First split: training set versus the pool that later becomes validation + test.
# NOTE(review): train_size=0.4 keeps only 40% of the rows for training and
# leaves 60% for validation + test — confirm this was intended rather than
# test_size=0.4.
(features_train, features_valid_test,
 target_train, target_valid_test) = train_test_split(
    features,
    target,
    train_size=0.4,
    random_state=1,
)

In [13]:
# Second split: divide the held-out pool evenly into validation and test sets.
(features_valid, features_test,
 target_valid, target_test) = train_test_split(
    features_valid_test,
    target_valid_test,
    test_size=0.5,
    random_state=1,
)

In [14]:
# Continuous / count columns that get standardized; the 0/1 columns are left as-is.
numeric = [
    'credit_score',
    'age',
    'tenure',
    'balance',
    'num_of_products',
    'estimated_salary',
]

In [15]:
# Standardize the numeric columns using statistics from the training set only.
#
# Each split is copied first so the column assignments below write to
# independent frames. This avoids pandas' SettingWithCopyWarning without
# switching the warning off globally via pd.options.mode.chained_assignment,
# which would also hide genuine chained-assignment mistakes elsewhere.
features_train = features_train.copy()
features_valid = features_valid.copy()
features_test = features_test.copy()

scaler = StandardScaler()
# Fit on the training split only, so validation/test statistics never leak in.
scaler.fit(features_train[numeric])

features_train[numeric] = scaler.transform(features_train[numeric])
features_valid[numeric] = scaler.transform(features_valid[numeric])
features_test[numeric] = scaler.transform(features_test[numeric])

## EDA

In [16]:
# Share of each class in the target: shows how imbalanced 'exited' is.
data['exited'].value_counts(normalize=True)

0    0.7963
1    0.2037
Name: exited, dtype: float64

### Training models on imbalanced data

In [17]:
# Logistic regression on the unbalanced training set, scored by validation accuracy.
model = LogisticRegression(random_state=1).fit(features_train, target_train)
predicted_valid = model.predict(features_valid)
print(
    'accuracy:',
    accuracy_score(target_valid, predicted_valid),
)

accuracy: 0.822


In [18]:
# Try tree depths 1..10 and record (validation accuracy, depth) for each.
accuracy = []

for depth in range(1, 11):
    model = DecisionTreeClassifier(max_depth=depth, random_state=1).fit(
        features_train, target_train
    )
    predicted_valid = model.predict(features_valid)
    result = accuracy_score(target_valid, predicted_valid)
    accuracy.append((result, depth))

# max() compares the tuples, so the highest accuracy wins
# (ties go to the larger depth).
print('accuracy:', max(accuracy))

accuracy: (0.8573333333333333, 6)


In [19]:
# F1 on the validation set for the depth-6 tree.
model = DecisionTreeClassifier(max_depth=6, random_state=1).fit(
    features_train, target_train
)
predicted_valid = model.predict(features_valid)
print(
    'f1:',
    f1_score(target_valid, predicted_valid),
)

f1: 0.559670781893004


In [20]:
# Sanity baseline: always predict class 0 and see what accuracy that gives.
target_pred_constant = pd.Series(
    0, index=target_valid.index, name=target_valid.name
)
print(accuracy_score(target_valid, target_pred_constant))

0.8013333333333333


### Handling imbalanced data

In [21]:
# Imbalance handling, option 1: class weights on the depth-6 tree.
model = DecisionTreeClassifier(
    max_depth=6,
    random_state=1,
    class_weight='balanced',
    criterion='entropy',
).fit(features_train, target_train)
predicted_valid = model.predict(features_valid)
f1_1 = f1_score(target_valid, predicted_valid)
print("f1:", f1_1)

f1: 0.5684210526315789


In [22]:
def upsample(features, target, repeat, random_state=1):
    """Oversample the positive class by repeating its rows, then shuffle.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature rows, aligned with ``target``.
    target : pandas.Series
        Binary labels; rows with value 1 are repeated, rows with value 0
        are kept once.
    repeat : int
        How many copies of each positive row to include. Must be >= 1;
        a smaller value would remove the positive class entirely.
    random_state : int, optional
        Seed passed to the shuffle. Defaults to 1, the value that was
        previously hard-coded.

    Returns
    -------
    tuple
        ``(features_upsampled, target_upsampled)``, shuffled together so
        rows and labels stay aligned.

    Raises
    ------
    ValueError
        If ``repeat`` is less than 1.
    """
    if repeat < 1:
        raise ValueError("repeat must be >= 1, got {!r}".format(repeat))

    # Build each class mask once and reuse it for features and labels.
    is_zero = target == 0
    is_one = target == 1

    features_upsampled = pd.concat([features[is_zero]] + [features[is_one]] * repeat)
    target_upsampled = pd.concat([target[is_zero]] + [target[is_one]] * repeat)

    # Shuffle so the repeated positive rows are not clustered at the end.
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=random_state)

    return features_upsampled, target_upsampled

In [23]:
# Repeat the positive class 4 times: the classes are split roughly 80/20,
# so this brings them close to parity.
features_upsampled, target_upsampled = upsample(
    features_train, target_train, repeat=4
)

In [24]:
# Imbalance handling, option 2: fit the depth-6 tree on the upsampled training set.
model = DecisionTreeClassifier(
    max_depth=6,
    random_state=1,
    criterion='entropy',
).fit(features_upsampled, target_upsampled)
predicted_valid = model.predict(features_valid)
f1_2 = f1_score(target_valid, predicted_valid)
print("f1:", f1_2)

f1: 0.5693069306930693


### Tuning the classification threshold

In [25]:
# Fit the depth-6 entropy tree on the original training set and sweep the
# decision threshold to find the one with the best validation F1.
model = DecisionTreeClassifier(
    max_depth=6,
    random_state=1,
    criterion='entropy',
).fit(features_train, target_train)
probabilities_valid = model.predict_proba(features_valid)
# Column 1 holds the predicted probability of the positive class.
probabilities_one_valid = probabilities_valid[:, 1]

f1 = []

for threshold in np.arange(0, 1, 0.02):
    predicted_valid = probabilities_one_valid > threshold
    result = f1_score(target_valid, predicted_valid)
    f1.append((result, threshold))

# max() on (f1, threshold) tuples picks the best F1.
print('f1, threshold:', max(f1))

f1, threshold: (0.5980015372790161, 0.36)


In [26]:
# Turn the positive-class probabilities into 0/1 labels at the 0.36 threshold
# found by the sweep (the result is still a plain list of ints).
probabilities_one_valid = (
    (np.asarray(probabilities_one_valid) > 0.36).astype(int).tolist()
)

In [27]:
# Validation F1 at the chosen threshold.
f1_score(y_true=target_valid, y_pred=probabilities_one_valid)

0.5980015372790161

In [28]:
# Validation ROC AUC, computed from the raw positive-class probabilities
# (not the thresholded labels).
roc_auc_score(y_true=target_valid, y_score=probabilities_valid[:, 1])

0.827255538866989

## Testing

In [29]:
# Final check on the held-out test set: same tree settings and the 0.36
# threshold selected on the validation set.
model = DecisionTreeClassifier(
    max_depth=6,
    random_state=1,
    criterion='entropy',
).fit(features_train, target_train)
probabilities_test = model.predict_proba(features_test)
# Threshold the positive-class probabilities into a plain list of 0/1 labels.
probabilities_one_test = (
    (probabilities_test[:, 1] > 0.36).astype(int).tolist()
)
f1_score(target_test, probabilities_one_test)

0.6108597285067875

In [30]:
# Test-set ROC AUC from the raw positive-class probabilities.
roc_auc_score(y_true=target_test, y_score=probabilities_test[:, 1])

0.8344688555802363