# Отток клиентов

Из «Бета-Банка» стали уходить клиенты. Каждый месяц. Немного, но заметно. Банковские маркетологи посчитали: сохранять текущих клиентов дешевле, чем привлекать новых.

Нужно спрогнозировать, уйдёт клиент из банка в ближайшее время или нет. Вам предоставлены исторические данные о поведении клиентов и расторжении договоров с банком. 

Постройте модель с предельно большим значением *F1*-меры. Чтобы сдать проект успешно, нужно довести метрику до 0.59. Проверьте *F1*-меру на тестовой выборке самостоятельно.

Дополнительно измеряйте *AUC-ROC*, сравнивайте её значение с *F1*-мерой.

Источник данных: [https://www.kaggle.com/barelydedicated/bank-customer-churn-modeling](https://www.kaggle.com/barelydedicated/bank-customer-churn-modeling)

## Подготовка данных

In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_curve, precision_score, recall_score, roc_auc_score, roc_curve, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder


import numpy as np

In [2]:
df = pd.read_csv('/datasets/Churn.csv')

In [3]:
df.shape #изучаем данные

(10000, 14)

In [4]:
df.head(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8.0,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7.0,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4.0,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4.0,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2.0,134603.88,1,1,1,71725.73,0


In [5]:
df.isnull().sum() #проверяем на пропуски 

RowNumber            0
CustomerId           0
Surname              0
CreditScore          0
Geography            0
Gender               0
Age                  0
Tenure             909
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
dtype: int64

In [6]:
df.duplicated().sum() #проверяем на пропуски 

0

In [7]:
df.info() #для понимания типа данных 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           9091 non-null   float64
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


In [8]:
df['Tenure']=df['Tenure'].fillna(df['Tenure'].median()) #так как пропусков немного заполним сюда медианным значением 

In [9]:
df.isnull().sum() #проверяем удалились ли пропуски 

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [10]:
df.head(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8.0,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7.0,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4.0,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4.0,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2.0,134603.88,1,1,1,71725.73,0


In [11]:
df['Geography'].value_counts() #изучаем из каких стран пользователи 

France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64

In [12]:
df['Gender'].value_counts()  #исследуем пол пользователей банка 

Male      5457
Female    4543
Name: Gender, dtype: int64

In [13]:
df = df.drop(columns = ['Surname', 'CustomerId', 'RowNumber'], axis = 1)  #удаляю три столбца которые мне нужны для обучения модели 

## Исследование задачи

In [14]:
#перенесла разделение выборки после обработки методом OneHotEncoder

In [15]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [16]:
features = df.drop(["Exited"], axis = 1)
target = df["Exited"]

In [17]:
train_size=0.6 # определим размер тренировочной выборки 

In [18]:
features_train, features_rem, target_train, target_rem = train_test_split(features, target, train_size=0.6, random_state=12345)

In [19]:
features_train.shape

(6000, 10)

In [20]:
# Теперь нам из оставшейся надо выделить валидационную и тестовую. Мы хотим чтобы они были по 40% 
# или 50% от оставшейся  

In [21]:
features_test, features_valid, target_test, target_valid = train_test_split(features_rem, target_rem, train_size=0.5, random_state=12345)

In [22]:
features_train.shape

(6000, 10)

In [23]:
features_valid.shape

(2000, 10)

In [24]:
features_test.shape

(2000, 10)

In [25]:
#работаем с  OHE

In [26]:
encoder = OneHotEncoder(drop='first')

In [27]:
features_train_ohe = pd.DataFrame(encoder.fit_transform(features_train[['Geography','Gender']]).toarray(), columns=encoder.get_feature_names())

In [28]:
features_train_new = features_train.drop(columns=['Geography','Gender']).reset_index(drop=True).join(features_train_ohe.reset_index(drop=True))

In [29]:
features_train_new.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,x0_Germany,x0_Spain,x1_Male
0,567,35,8.0,153137.74,1,1,0,88659.07,0.0,1.0,1.0
1,711,37,8.0,113899.92,1,0,0,80215.2,0.0,0.0,0.0
2,850,44,3.0,140393.65,2,0,1,186285.52,1.0,0.0,1.0
3,512,24,6.0,0.0,2,1,0,37654.31,0.0,0.0,1.0
4,665,27,2.0,147435.96,1,0,0,187508.06,1.0,0.0,0.0


In [30]:
features_valid_ohe = pd.DataFrame(encoder.transform(features_valid[['Geography','Gender']]).toarray(), columns=encoder.get_feature_names())

In [31]:
features_valid_new = features_valid.drop(columns=['Geography','Gender']).reset_index(drop=True).join(features_valid_ohe.reset_index(drop=True))

In [32]:
features_valid_new.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,x0_Germany,x0_Spain,x1_Male
0,438,38,2.0,0.0,2,1,0,136859.55,0.0,0.0,1.0
1,644,39,9.0,0.0,1,1,0,3740.93,0.0,0.0,0.0
2,564,31,5.0,121461.87,1,1,1,20432.09,0.0,1.0,1.0
3,628,40,10.0,0.0,2,1,0,103832.58,0.0,1.0,1.0
4,728,30,10.0,114835.43,1,0,1,37662.49,0.0,0.0,1.0


In [33]:
features_test_ohe = pd.DataFrame(encoder.transform(features_test[['Geography','Gender']]).toarray(),columns=encoder.get_feature_names())

In [34]:
features_test_new = features_test.drop(columns=['Geography','Gender']).reset_index(drop=True).join(features_test_ohe.reset_index(drop=True))

In [35]:
features_test_new.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,x0_Germany,x0_Spain,x1_Male
0,585,35,2.0,0.0,2,1,0,98621.04,0.0,0.0,0.0
1,625,45,3.0,0.0,1,1,1,184474.15,0.0,0.0,0.0
2,667,32,0.0,103846.65,1,1,0,20560.69,1.0,0.0,1.0
3,568,36,10.0,153610.61,1,1,1,54083.8,0.0,1.0,0.0
4,606,42,10.0,0.0,2,1,0,177938.52,0.0,1.0,1.0


In [36]:
#масштабируем данные 

In [37]:
numeric = ['CreditScore','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary','x0_Germany','x0_Spain','x1_Male']

In [38]:
scaler = StandardScaler()
scaler.fit(features_train_new[numeric])
features_train_new[numeric] = scaler.transform(features_train_new[numeric])
features_valid_new[numeric] = scaler.transform(features_valid_new[numeric])
features_test_new[numeric] = scaler.transform(features_test_new[numeric])
pd.options.mode.chained_assignment = None
print(features_train_new.head(5))

   CreditScore       Age    Tenure   Balance  NumOfProducts  HasCrCard  \
0    -0.886751 -0.373192  1.082277  1.232271      -0.891560   0.642466   
1     0.608663 -0.183385  1.082277  0.600563      -0.891560  -1.556504   
2     2.052152  0.480939 -0.737696  1.027098       0.830152  -1.556504   
3    -1.457915 -1.417129  0.354288 -1.233163       0.830152   0.642466   
4     0.130961 -1.132419 -1.101690  1.140475      -0.891560  -1.556504   

   IsActiveMember  EstimatedSalary  x0_Germany  x0_Spain   x1_Male  
0       -1.055187        -0.187705   -0.572475  1.728977  0.907278  
1       -1.055187        -0.333945   -0.572475 -0.578377 -1.102198  
2        0.947699         1.503095    1.746802 -0.578377  0.907278  
3       -1.055187        -1.071061   -0.572475 -0.578377  0.907278  
4       -1.055187         1.524268    1.746802 -0.578377 -1.102198  


In [39]:
target_train.value_counts()     #Изучаем дисбаланс 

0    4804
1    1196
Name: Exited, dtype: int64

In [40]:
target_train.reset_index(drop= True , inplace= True ) #сбрасываем индексы ,чтобы код не упал

In [41]:
target_valid.reset_index(drop= True , inplace= True )

In [42]:
target_test.reset_index(drop= True , inplace= True )

In [43]:
#без учета дисбаланса обучим данные на разных моделях 
#начнем с логистической регрессии 

In [44]:
model = LogisticRegression(random_state=12345, solver='lbfgs', max_iter=1000)
model.fit(features_train_new, target_train)
model.score(features_train_new, target_train) 

0.8183333333333334

In [45]:
predicted_valid_new = model.predict(features_valid_new)

In [46]:
confusion_matrix(target_valid, predicted_valid_new)

array([[1504,   73],
       [ 344,   79]])

In [47]:
f1_score(target_valid, predicted_valid_new)

0.27478260869565213

In [48]:
probabilities_valid = model.predict_proba(features_valid_new)
probabilities_one_valid = probabilities_valid[:, 1]
auc_roc = roc_auc_score(target_valid, probabilities_one_valid, multi_class='ovr')
print(auc_roc)

0.7385990396824326


In [49]:
#теперь с помощью цикла определим лучшую модель для Случайного леса 

In [None]:
best_model = None
best_result = 0
for est in range(1, 27):
    for depth in range(1, 50):
        model = RandomForestClassifier(random_state=12345, n_estimators= est, max_depth = depth) 
        model.fit(features_train_new, target_train) 
        predicted_valid = model.predict(features_valid_new)
        result = f1_score(target_valid, predicted_valid) 
        if result > best_result:
            best_model = model
            best_result = result    
print("f1 наилучшей модели на валидационной выборке:", best_result)

In [51]:
confusion_matrix(target_valid, predicted_valid)

array([[1509,   68],
       [ 251,  172]])

In [52]:
probabilities_valid = best_model.predict_proba(features_valid_new)
probabilities_one_valid = probabilities_valid[:, 1]
auc_roc = roc_auc_score(target_valid , probabilities_one_valid, multi_class='ovr')
print(auc_roc)

0.8304213494515577


<div class="alert alert-info"> 
    
<h1> Вывод: </h1> 
Без учета дисбаланса данных, используя модель случайнй лес, мы получили более высокий результат, чем используя логистическую регрессию. Однако, из-за дисбаланса мы не можем пока получить необходимое значение f1
</div>

## Борьба с дисбалансом

In [53]:
#перенесла 

In [54]:
#видим сильный дисбаланс данных 

In [55]:
# пробуем первый метод upsampling для борьбы с дисбалансом 

In [56]:
def upsample(features, target, repeat):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)
    
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=12345)
    
    return features_upsampled, target_upsampled

features_upsampled, target_upsampled = upsample(features_train_new, target_train, 4)


In [57]:
target_upsampled.value_counts() #проверим дисбаланс после метода 

0    4804
1    4784
Name: Exited, dtype: int64

In [58]:
#подбираем  еще раз лучшую модель  "Случайный лес"  с новыми данными

In [59]:
best_model_up = None
best_result_up = 0
for est in range(1, 27):
    for depth in range(1, 50):
        model = RandomForestClassifier(random_state=12345, n_estimators= est, max_depth = depth) 
        model.fit(features_upsampled, target_upsampled) 
        predicted_valid = model.predict(features_valid_new)
        result = f1_score(target_valid, predicted_valid) 
        if result > best_result_up:
            best_model_up = model
            best_result_up = result    
print("f1 наилучшей модели на валидационной выборке:", best_result_up)

f1 наилучшей модели на валидационной выборке: 0.6192090395480226


In [60]:
confusion_matrix(target_valid, predicted_valid)

array([[1465,  112],
       [ 195,  228]])

In [61]:
probabilities_valid = best_model_up.predict_proba(features_valid_new)
probabilities_one_valid = probabilities_valid[:, 1]
auc_roc_up = roc_auc_score(target_valid , probabilities_one_valid, multi_class='ovr')
print(auc_roc_up)

0.8504686907390667


In [62]:
#  пробуем метод downsample

In [63]:
def downsample(features, target, fraction):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_downsampled = pd.concat(
        [features_zeros.sample(frac=fraction, random_state=12345)] + [features_ones])  
    target_downsampled = pd.concat(
        [target_zeros.sample(frac=fraction, random_state=12345)] + [target_ones])
    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=12345)
    
    return features_downsampled, target_downsampled

features_downsampled, target_downsampled = downsample(features_train_new, target_train, 0.25)

In [64]:
target_downsampled.value_counts() #проверим дисбаланс после метода 

0    1201
1    1196
Name: Exited, dtype: int64

In [65]:
best_model_down = None
best_result_down = 0
for est in range(1, 30):
    for depth in range(1, 50):
        model = RandomForestClassifier(random_state=12345, n_estimators= est, max_depth = depth) 
        model.fit(features_downsampled, target_downsampled)
        predicted_valid = model.predict(features_valid_new)
        result = f1_score(target_valid, predicted_valid) 
        if result > best_result_down:
            best_model_down = model
            best_result_down = result    
print("Accuracy наилучшей модели на валидационной выборке:", best_result_down)

Accuracy наилучшей модели на валидационной выборке: 0.5928085519922255


In [66]:
confusion_matrix(target_valid, predicted_valid) 

array([[1211,  366],
       [ 116,  307]])

In [67]:
probabilities_valid = best_model_down.predict_proba(features_valid_new)
probabilities_one_valid = probabilities_valid[:, 1]
auc_roc = roc_auc_score(target_valid , probabilities_one_valid, multi_class='ovr')
print(auc_roc)

0.8468371432726051


<div class="alert alert-info"> 
    
<h1> Вывод: </h1> 
После применения масштабирования улучшилось качество модели  "Случайный лес" . Мы использовали метод upsampled так как она дал лучший показатель f1 чем downsample. Для проверки на тест берем модель , где дисбаланс мы решили методом   upsampled

</div>

In [68]:
predicted_valid_test = best_model_up.predict(features_test_new)

In [69]:
confusion_matrix(target_test, predicted_valid_test)

array([[1384,  198],
       [ 146,  272]])

In [70]:
f1_score(target_test, predicted_valid_test)

0.6126126126126127

In [71]:
probabilities_valid_test = best_model_up.predict_proba(features_test_new)
probabilities_one_valid_test = probabilities_valid_test[:, 1]
auc_roc_test = roc_auc_score(target_test ,probabilities_one_valid_test, multi_class='ovr')
print(auc_roc_test)

0.8465965799454388
