In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, make_scorer
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report

In [None]:
# Install CatBoost into the running kernel's environment (%pip instead of !pip) and pin
# the version that this notebook's recorded output shows being installed.
%pip install -q catboost==1.0.3

Collecting catboost
  Downloading catboost-1.0.3-cp37-none-manylinux1_x86_64.whl (76.3 MB)
[K     |████████████████████████████████| 76.3 MB 1.1 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.3


## Загрузка данных

In [None]:
# Location of the Telco customer-churn CSV, relative to the working directory.
DATA_PATH = "./sample_data/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(DATA_PATH)

## EDA

In [None]:
# Summary statistics for the numeric columns of the raw frame.
# NOTE(review): TotalCharges does not show up in the output, presumably because it is
# loaded as strings (it is cast to float in the preprocessing section) — confirm.
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [None]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [None]:
# Print the distinct values found in each column.
for column_name, column_values in df.items():
  print("{}: {}".format(column_name, column_values.unique()))

customerID: ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
gender: ['Female' 'Male']
SeniorCitizen: [0 1]
Partner: ['Yes' 'No']
Dependents: ['No' 'Yes']
tenure: [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
PhoneService: ['No' 'Yes']
MultipleLines: ['No phone service' 'No' 'Yes']
InternetService: ['DSL' 'Fiber optic' 'No']
OnlineSecurity: ['No' 'Yes' 'No internet service']
OnlineBackup: ['Yes' 'No' 'No internet service']
DeviceProtection: ['No' 'Yes' 'No internet service']
TechSupport: ['No' 'Yes' 'No internet service']
StreamingTV: ['No' 'Yes' 'No internet service']
StreamingMovies: ['No' 'Yes' 'No internet service']
Contract: ['Month-to-month' 'One year' 'Two year']
PaperlessBilling: ['Yes' 'No']
PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)

In [None]:
# Number of distinct customer IDs (dropna=False mirrors counting the entries of .unique()).
df['customerID'].nunique(dropna=False)

7043

In [None]:
# For every column except the first one (customerID), print how many customers fall
# into each distinct value of that column.
for column_name in df.columns[1:]:
  customers_per_value = df.groupby(column_name)[['customerID']].count()
  print(customers_per_value, '\n')

        customerID
gender            
Female        3488
Male          3555 

               customerID
SeniorCitizen            
0                    5901
1                    1142 

         customerID
Partner            
No             3641
Yes            3402 

            customerID
Dependents            
No                4933
Yes               2110 

        customerID
tenure            
0               11
1              613
2              238
3              200
4              176
...            ...
68             100
69              95
70             119
71             170
72             362

[73 rows x 1 columns] 

              customerID
PhoneService            
No                   682
Yes                 6361 

                  customerID
MultipleLines               
No                      3390
No phone service         682
Yes                     2971 

                 customerID
InternetService            
DSL                    2421
Fiber optic            3096
No     

## Preprocessing & Feature Engineering

In [None]:
# Split the categorical columns into binary ones and ones with more than two values.
bin_columns_cat = [
    'gender',
    'PhoneService',
    'Partner',
    'Dependents',
    'PaperlessBilling',
    'Churn',
]

other_columns_cat = [
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]

In [None]:
# Cast TotalCharges to float.
# The column holds text because some rows contain a blank string instead of a number.
# pd.to_numeric(errors='coerce') turns every non-numeric entry into NaN (not only the
# exact single-space string the previous check handled), and those rows are then set
# to 0.0, the same fill value as before.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0.0)

In [None]:
# Encode the categorical columns as integer codes.
# Each column gets its own fitted encoder, stored in `label_encoders`, so the original
# labels can be recovered later with label_encoders[column].inverse_transform(...).
# Reusing one encoder for all columns would keep only the mapping of the last column.
# `label_encoder` is still bound afterwards (to the encoder of the last column).
label_encoders = {}
for column in bin_columns_cat + other_columns_cat:
  label_encoder = LabelEncoder()
  df[column] = label_encoder.fit_transform(df[column])
  label_encoders[column] = label_encoder

In [None]:
# Drop the customer identifier: it is unique for every row, so it is not a useful feature.
# errors='ignore' keeps the cell re-runnable once the column has already been removed.
df = df.drop(columns='customerID', errors='ignore')

In [None]:
# Separate the target (Churn) from the features, then hold out 30% of the rows for testing.
y = df['Churn']
X = df.drop(columns='Churn')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

## Train & Test

In [None]:
# Names of the columns passed to CatBoost as categorical features: everything except
# the three numeric columns and the target.
numeric_and_target = ['MonthlyCharges', 'TotalCharges', 'tenure', 'Churn']
cat_features = df.drop(columns=numeric_and_target).columns.tolist()

In [None]:
# Train the candidate models.
# A fixed seed makes the fitted models reproducible across kernel restarts; it also
# carries over to the grid search, which tunes these same estimators with subsample < 1.
RANDOM_STATE = 1

cat = CatBoostClassifier(random_seed=RANDOM_STATE)
gbc = GradientBoostingClassifier(random_state=RANDOM_STATE)
xgbc = XGBClassifier(random_state=RANDOM_STATE)
lgbm = LGBMClassifier(random_state=RANDOM_STATE)

# CatBoost is told which columns are categorical; silent=True suppresses its training log.
cat.fit(X_train, y_train, cat_features=cat_features, silent=True)
gbc.fit(X_train, y_train)
xgbc.fit(X_train, y_train)
lgbm.fit(X_train, y_train)

LGBMClassifier()

In [None]:
# Evaluate every fitted model on the held-out test set.
fitted_models = [cat, gbc, xgbc, lgbm]
for model in fitted_models:
  y_pred = model.predict(X_test)
  report = classification_report(y_test, y_pred)
  print(model, "\n", report)

<catboost.core.CatBoostClassifier object at 0x7fd8d74e7c50> 
               precision    recall  f1-score   support

           0       0.86      0.89      0.87      1585
           1       0.63      0.56      0.59       528

    accuracy                           0.81      2113
   macro avg       0.74      0.73      0.73      2113
weighted avg       0.80      0.81      0.80      2113

GradientBoostingClassifier() 
               precision    recall  f1-score   support

           0       0.86      0.90      0.88      1585
           1       0.64      0.56      0.60       528

    accuracy                           0.81      2113
   macro avg       0.75      0.73      0.74      2113
weighted avg       0.80      0.81      0.81      2113

XGBClassifier() 
               precision    recall  f1-score   support

           0       0.86      0.90      0.88      1585
           1       0.64      0.56      0.60       528

    accuracy                           0.81      2113
   macro avg     

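Из отчётов классификации видно, что у XGBClassifier и GradientBoostingClassifier показатели precision, recall и f1-score немного выше, чем у остальных моделей. (support — это число объектов каждого класса в тестовой выборке, а не метрика качества, поэтому он одинаков для всех моделей.)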

## Tuning

In [None]:
# Candidate values for the grid search.
n_estimators = range(100, 1000, 250)  # 100, 350, 600, 850
# 'auto' is left out: for GradientBoostingClassifier it was an alias of 'sqrt', so it only
# duplicated a grid point, and scikit-learn 1.3 removed the option.
# NOTE(review): max_features is a GradientBoostingClassifier parameter; XGBClassifier and
# LGBMClassifier appear to accept it only as an extra keyword — verify it has any effect there.
max_features = ['sqrt', 'log2', None]
# NOTE(review): max_depth is defined but has never been part of the grid — confirm whether
# it was left out on purpose (adding it multiplies the number of fits by five).
max_depth = [2, 3, 5, 10, 15]
subsample = [0.6, 0.7, 0.8]

# Grid shared by all tuned models; it reuses the lists above instead of repeating literals.
hyperparameter_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'subsample': subsample,
}

In [None]:
# Tune each model with an exhaustive grid search (4-fold cross-validation on the training
# split, scored by accuracy) and report the best configuration on the held-out test set.
for model in [gbc, xgbc, lgbm]:
  grid_cv = GridSearchCV(
      estimator=model,
      param_grid=hyperparameter_grid,
      cv=4,
      scoring='accuracy',
      n_jobs=-1,
      return_train_score=True)

  grid_cv.fit(X_train, y_train)

  # With the default refit=True, GridSearchCV has already refitted the best configuration
  # on the whole training split, so best_estimator_ is used as is (no second fit needed).
  best_model = grid_cv.best_estimator_
  y_pred = best_model.predict(X_test)
  print(best_model, "\n", classification_report(y_test, y_pred))

GradientBoostingClassifier(max_features='log2', subsample=0.8) 
               precision    recall  f1-score   support

           0       0.86      0.89      0.88      1585
           1       0.64      0.58      0.61       528

    accuracy                           0.81      2113
   macro avg       0.75      0.73      0.74      2113
weighted avg       0.81      0.81      0.81      2113

XGBClassifier(max_features='auto', subsample=0.8) 
               precision    recall  f1-score   support

           0       0.86      0.90      0.88      1585
           1       0.65      0.58      0.61       528

    accuracy                           0.82      2113
   macro avg       0.76      0.74      0.75      2113
weighted avg       0.81      0.82      0.81      2113

LGBMClassifier(max_features='auto', subsample=0.6) 
               precision    recall  f1-score   support

           0       0.86      0.88      0.87      1585
           1       0.62      0.57      0.59       528

    accuracy

После подбора гиперпараметров на кросс-валидации XGBClassifier показал на тестовой выборке значения метрик лучше, чем у других моделей.