In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 10000 entries, 0 to 9999
# Data columns (total 12 columns):
#  #   Column         Non-Null Count  Dtype  
# ---  ------         --------------  -----  
#  0   USERID         10000 non-null  int64  
#  1   score          10000 non-null  float64
#  2   city           10000 non-null  object 
#  3   gender         10000 non-null  object 
#  4   age            10000 non-null  int64  
#  5   equity         10000 non-null  int64  
#  6   balance        7705 non-null   float64
#  7   products       10000 non-null  int64  
#  8   credit_card    10000 non-null  int64  
#  9   last_activity  10000 non-null  int64  
#  10  EST_SALARY     10000 non-null  float64
#  11  churn          10000 non-null  int64  
# dtypes: float64(3), int64(7), object(2)
# memory usage: 937.6+ KB

In [None]:
# df consists of 12 columns and 10000 rows
# df holds information about the bank's customers
# the main goal of the study is to find the features that affect customer churn (the outflow of customers from the bank)
# this part of the research uses machine learning algorithms

In [None]:
# Baseline for the machine-learning results below:
# plain correlation of every feature with the "churn" column of df

# one-hot encode the categorical columns so they can enter the correlation matrix
df_test = pd.get_dummies(df, columns=['gender', 'city', 'balance_category', 'est_salary_category', 'age_category'])

# user_id is an identifier, not a feature, so it is left out of the matrix
df_test_corr = df_test.drop(['user_id'], axis=1)

# correlation matrix
corr_matrix = df_test_corr.corr().round(2)

# keep only the "churn" column, strongest positive correlation first;
# the "churn" row is the target's correlation with itself, not a feature, so drop it
churn_corr = (
    corr_matrix[['churn']]
    .drop(index='churn')
    .sort_values(by=['churn'], ascending=False)
)

fig, ax = plt.subplots(figsize=(15, 15))
# explicit vmin/vmax pin the colour scale to the full [-1, 1] correlation range
# instead of letting the upper limit be inferred from the plotted values
sns.heatmap(churn_corr, vmin=-1, vmax=1, annot=True, cbar=True, square=True,
            cmap="coolwarm", ax=ax)

# trailing ';' keeps the Text repr out of the cell output
ax.set_title('feature correlation to churn rate');

In [None]:
# machine learning part

In [None]:
# first step is to choose a model:


# one-hot encode the categorical columns so that every feature is numeric
# (called without `columns`, pd.get_dummies converts the object/string/category columns);
# NOTE: this rebinds `df` - the code below reads the encoded frame
df = pd.get_dummies(df)

# helper that prints every classification metric for one model
def print_all_metrics(y_true, y_pred, y_proba, title='all_metrics'):
    """Print a header followed by five classification metrics, two decimals each.

    Parameters
    ----------
    y_true : true class labels.
    y_pred : predicted class labels; used for accuracy, precision, recall and F1.
    y_proba : predicted scores for the positive class; used for ROC AUC only.
    title : header line printed before the metrics.

    Returns
    -------
    None; the metrics are written to standard output.
    """
    print(title)

    # metrics computed from hard label predictions, in print order
    label_metrics = (
        ('\tAccuracy: {:.2f}', accuracy_score),
        ('\tPrecision: {:.2f}', precision_score),
        ('\tRecall: {:.2f}', recall_score),
        ('\tF1: {:.2f}', f1_score),
    )
    for template, scorer in label_metrics:
        print(template.format(scorer(y_true, y_pred)))

    # ROC AUC is the only metric that takes the scores instead of the labels
    print('\tROC_AUC: {:.2f}'.format(roc_auc_score(y_true, y_proba)))

print('dataset size:', df.shape)

# `user_id` is an arbitrary identifier, not a customer attribute, so it must not
# be offered to the models as a predictor (the correlation and feature-importance
# cells of this notebook exclude it as well).
# NOTE: metrics recorded from runs that still included `user_id` may differ slightly.
X = df.drop(columns=['churn', 'user_id'])
y = df['churn']

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# fit StandardScaler on the training set only, then apply it to both sets
scaler = StandardScaler()
scaler.fit(X_train)

X_train_st = scaler.transform(X_train)
X_test_st = scaler.transform(X_test)


def fit_and_report(model, title, X_fit, y_fit, X_eval, y_eval):
    """Fit a classifier, print its metrics on the evaluation set and return its forecasts.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    title : header line passed on to ``print_all_metrics``.
    X_fit, y_fit : features and target the model is fitted on.
    X_eval, y_eval : features and target the metrics are computed on.

    Returns
    -------
    (predictions, probabilities) : the output of ``predict`` and column 1 of
    ``predict_proba`` for ``X_eval``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    probabilities = model.predict_proba(X_eval)[:, 1]
    print_all_metrics(y_eval, predictions, probabilities, title=title)
    return predictions, probabilities


# every model is fitted and scored on exactly the same scaled split
split = (X_train_st, y_train, X_test_st, y_test)

# logistic regression
lr_model = LogisticRegression(random_state=0)
lr_predictions, lr_probabilities = fit_and_report(
    lr_model, 'metrics for LogisticRegression:', *split
)

# decision tree
tree_model = DecisionTreeClassifier(random_state=0)
tree_predictions, tree_probabilities = fit_and_report(
    tree_model, 'metrics for DecisionTreeClassifier:', *split
)

# random forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
rf_predictions, rf_probabilities = fit_and_report(
    rf_model, 'metrics for RandomForestClassifier:', *split
)

# gradient boosting
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=0)
gb_predictions, gb_probabilities = fit_and_report(
    gb_model, 'metrics for GradientBoostingClassifier:', *split
)

In [None]:
# metrics for LogisticRegression:
# 	Accuracy: 0.85
# 	Precision: 0.62
# 	Recall: 0.36
# 	F1: 0.45
# 	ROC_AUC: 0.84
# metrics for DecisionTreeClassifier:
# 	Accuracy: 0.82
# 	Precision: 0.49
# 	Recall: 0.54
# 	F1: 0.52
# 	ROC_AUC: 0.71
# metrics for RandomForestClassifier:
# 	Accuracy: 0.88
# 	Precision: 0.75
# 	Recall: 0.46
# 	F1: 0.57
# 	ROC_AUC: 0.87
# metrics for GradientBoostingClassifier:
# 	Accuracy: 0.87
# 	Precision: 0.73
# 	Recall: 0.41
# 	F1: 0.53
# 	ROC_AUC: 0.87

In [None]:
# RandomForestClassifier showed the best results in the comparison,
# so it is used here to find the most important features

df = pd.get_dummies(df)

# target and feature matrix (the identifier column is not a feature)
y = df['churn']
X = df.drop(columns=['churn', 'user_id'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

# scaler statistics come from the training set only
scaler = StandardScaler().fit(X_train)
X_train_st, X_test_st = scaler.transform(X_train), scaler.transform(X_test)

rf_model = RandomForestClassifier(random_state=0, n_estimators=100).fit(X_train_st, y_train)

rf_probabilities = rf_model.predict_proba(X_test_st)[:, 1]
rf_predictions = rf_model.predict(X_test_st)

# pair every column name with the forest's feature_importances_ value
# (stored under the column name 'coeff')
features = pd.DataFrame(
    {'feature': X_train.columns, 'coeff': rf_model.feature_importances_}
)

# five most important features
features.sort_values(by='coeff', ascending=False).head()

In [None]:

#   feature	        coeff
# 
# 3	balance	        0.149911
# 0	credit_score	0.143258
# 7	est_salary	    0.114340
# 1	age	            0.099659
# 4	products	    0.077722


So we see some differences between the feature importances obtained with
machine learning and the correlation matrix: the client's balance, credit score and salary are important indicators to pay attention to in order to reduce the outflow of clients from the bank.