# Практическое задание к уроку "Задача оттока: варианты постановки, возможные способы решения"

Для нашего пайплайна (Case1) поэкспериментировать с разными моделями:  

1 - бустинг,  
2 - логистическая регрессия (не забудьте здесь добавить в cont_transformer стандартизацию - нормирование вещественных признаков)

Отобрать лучшую модель по метрикам (кстати, какая, по вашему мнению, здесь наиболее подходящая DS-метрика?)



In [177]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.feature_extraction.text import TfidfVectorizer
import itertools

import matplotlib.pyplot as plt

%matplotlib inline

In [178]:
# Read the churn dataset from the course materials folder and preview the
# first five rows (five is the default for ``head``).
churn_csv_path = "../materials/churn_data.csv"
df = pd.read_csv(churn_csv_path)
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [181]:
# Remove the CustomerId column by rebinding ``df`` to the reduced frame.
df = df.drop(columns=['CustomerId'])

In [182]:
# Class balance of the target: number of rows per value of ``Exited``.
exited_counts = df['Exited'].value_counts()
exited_counts

0    7963
1    2037
Name: Exited, dtype: int64

In [183]:
# Split into train and test parts. The target column is removed from the
# feature frame so that it can never be picked up as a model input; this also
# matches how the hold-out split is built later in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Exited']),
    df['Exited'],
    random_state=0,
)

- Категориальные признаки закодируем с помощью OneHotEncoding
- Вещественные стандартизируем с помощью StandardScaler (это нужно для логистической регрессии)

Соберем наш простой pipeline; для этого нам понадобится написать классы для выбора нужного поля

In [184]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks one column out of its input.

    Parameters
    ----------
    column
        Label used to index the input in ``transform`` (``X[column]``).
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured column label."""
        selected = X[self.column]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps a single column of its input.

    The column is selected with a one-element list (``X[[key]]``), so the
    result keeps a two-dimensional, frame-like shape. Intended for numeric
    columns that are processed further downstream.

    Parameters
    ----------
    key
        Label of the column to keep.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the single configured column."""
        wanted = [self.key]
        return X[wanted]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """One-hot encoder built on ``pd.get_dummies`` with a fixed output layout.

    ``fit`` records the dummy columns produced for the fitting data.
    ``transform`` returns exactly those columns, in the same order: dummy
    columns that do not appear for the transformed data are added and filled
    with 0, and dummy columns that were not recorded by ``fit`` are left out.

    Parameters
    ----------
    key
        Prefix passed to ``pd.get_dummies`` for the generated column names.
    """

    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        """Record the dummy column names produced for ``X``."""
        fitted_dummies = pd.get_dummies(X, prefix=self.key)
        self.columns = list(fitted_dummies.columns)
        return self

    def transform(self, X):
        """Encode ``X`` and align the result to the columns seen in ``fit``."""
        encoded = pd.get_dummies(X, prefix=self.key)
        # Snapshot the produced columns before any are added below.
        produced = list(encoded.columns)
        absent = [name for name in self.columns if name not in produced]
        for name in absent:
            encoded[name] = 0
        return encoded[self.columns]

In [185]:
df.head(3)

Unnamed: 0,RowNumber,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [186]:
# Columns that are one-hot encoded.
categorical_columns = [
    'Geography',
    'Gender',
    'Tenure',
    'HasCrCard',
    'IsActiveMember',
]
# Columns that are treated as numeric and standardized.
continuous_columns = [
    'CreditScore',
    'Age',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
]

In [187]:
# Collect one (name, pipeline) branch per input column, categorical first and
# then continuous.
final_transformers = []

# Categorical branch: pick the column, then one-hot encode it.
for cat_col in categorical_columns:
    cat_steps = [
        ('selector', FeatureSelector(column=cat_col)),
        ('ohe', OHEEncoder(key=cat_col)),
    ]
    cat_transformer = Pipeline(steps=cat_steps)
    final_transformers.append((cat_col, cat_transformer))

# Continuous branch: pick the column, then standardize it.
for cont_col in continuous_columns:
    cont_steps = [
        ('selector', NumberSelector(key=cont_col)),
        ('standardizer', StandardScaler()),
    ]
    cont_transformer = Pipeline(steps=cont_steps)
    final_transformers.append((cont_col, cont_transformer))

In [188]:
from sklearn.pipeline import FeatureUnion

# Combine all per-column branches into one feature matrix, and wrap the union
# in a single-step pipeline of its own.
feats = FeatureUnion(transformer_list=final_transformers)
feature_processing = Pipeline(steps=[('feats', feats)])

In [189]:
from sklearn.ensemble import RandomForestClassifier

# Feature processing followed by a random forest with a fixed seed.
random_forest_classifier = RandomForestClassifier(random_state=42)
random_forest_pipeline = Pipeline(steps=[
    ('features', feats),
    ('classifier', random_forest_classifier),
])

In [190]:
from xgboost import XGBClassifier

# Feature processing followed by gradient boosting with a fixed seed.
xgboost_classifier = XGBClassifier(random_state=42)
xgboost_pipeline = Pipeline(steps=[
    ('features', feats),
    ('classifier', xgboost_classifier),
])


In [191]:
from sklearn.linear_model import LogisticRegression

# Feature processing followed by logistic regression (C=0.1, 'sag' solver,
# fixed seed).
log_reg_classifier = LogisticRegression(C=0.1, solver='sag', random_state=42)
log_reg_pipeline = Pipeline(steps=[
    ('features', feats),
    ('classifier', log_reg_classifier),
])

In [192]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Five random 70/30 train/test splits; the fixed seed makes them repeatable.
cv = ShuffleSplit(
    n_splits=5,
    test_size=0.3,
    random_state=0,
)

In [193]:
# Separate the target column from the feature frame for cross-validation.
y = df['Exited']
X = df.drop(columns=['Exited'])

In [194]:
from sklearn.model_selection import cross_validate

# Evaluate all four metrics in a single cross-validation pass: the pipeline is
# fitted once per split instead of once per split and per metric. The per-fold
# score arrays are exposed under the same names as before.
log_reg_cv_results = cross_validate(
    log_reg_pipeline,
    X,
    y,
    cv=cv,
    scoring=["precision_macro", "recall_macro", "roc_auc", "f1_macro"],
)
log_reg_precision_scores = log_reg_cv_results["test_precision_macro"]
log_reg_recall_scores = log_reg_cv_results["test_recall_macro"]
log_reg_roc_auc_scores = log_reg_cv_results["test_roc_auc"]
log_reg_f1_scores = log_reg_cv_results["test_f1_macro"]

In [195]:
from sklearn.model_selection import cross_validate

# Evaluate all four metrics in a single cross-validation pass: the pipeline is
# fitted once per split instead of once per split and per metric. The per-fold
# score arrays are exposed under the same names as before.
xgboost_cv_results = cross_validate(
    xgboost_pipeline,
    X,
    y,
    cv=cv,
    scoring=["precision_macro", "recall_macro", "roc_auc", "f1_macro"],
)
xgboost_precision_scores = xgboost_cv_results["test_precision_macro"]
xgboost_recall_scores = xgboost_cv_results["test_recall_macro"]
xgboost_roc_auc_scores = xgboost_cv_results["test_roc_auc"]
xgboost_f1_score_scores = xgboost_cv_results["test_f1_macro"]



2. Отобрать лучшую модель по метрикам (кстати, какая, по вашему мнению, здесь наиболее подходящая DS-метрика?)

In [196]:
# Summary table: one row per model, one column per metric, each cell being the
# mean of that metric over the cross-validation folds.
metric_names = ['precision', 'recall', 'roc_auc', 'f1_score']
fold_scores_by_model = {
    'logistic_regression': (
        log_reg_precision_scores,
        log_reg_recall_scores,
        log_reg_roc_auc_scores,
        log_reg_f1_scores,
    ),
    'xgboost': (
        xgboost_precision_scores,
        xgboost_recall_scores,
        xgboost_roc_auc_scores,
        xgboost_f1_score_scores,
    ),
}
data = {
    model_name: [np.mean(fold_scores) for fold_scores in per_metric_scores]
    for model_name, per_metric_scores in fold_scores_by_model.items()
}
pd.DataFrame.from_dict(data, orient='index', columns=metric_names)

Unnamed: 0,precision,recall,roc_auc,f1_score
logistic_regression,0.708998,0.588895,0.767919,0.602818
xgboost,0.792635,0.727527,0.847278,0.751838


Явно выигрывает XGB

Для отобранной модели (на отложенной выборке) сделать оценку экономической эффективности при тех же вводных, что и в вопросе 2 (1 доллар на привлечение, 2 доллара — с каждого правильно классифицированного (True Positive) удержанного клиента). Подсказка: нужно посчитать FP/TP/FN/TN для выбранного оптимального порога вероятности, а затем — выручку и траты.

In [200]:
# Hold-out split used for the economic evaluation: features without the
# target column, target taken from ``Exited``.
all_features = df.drop(columns=['Exited'])
target = df['Exited']
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    target,
    random_state=42,
)

In [201]:
# Fit the XGBoost pipeline on the training part of the hold-out split.
# ``model`` is whatever ``fit`` returns - presumably the fitted pipeline
# object itself; confirm against the scikit-learn Pipeline documentation.
model = xgboost_pipeline.fit(X_train, y_train)



In [206]:
# Class predictions for the hold-out test part.
# NOTE(review): the task text asks for metrics at an optimal probability
# threshold, while ``predict`` presumably applies the classifier's default
# cut-off - confirm, and consider ``predict_proba`` with a tuned threshold.
y_pred = model.predict(X_test)

In [224]:
# Confusion-matrix counts on the hold-out set, computed with boolean masks
# instead of a per-row Python loop.
predicted_labels = np.asarray(y_pred)
actual_labels = np.asarray(y_test)

predicted_positive = predicted_labels == 1
predicted_negative = predicted_labels == 0
prediction_correct = predicted_labels == actual_labels

# Converted to plain ints so the tuple displays the same way as before.
tp = int(np.count_nonzero(predicted_positive & prediction_correct))
fp = int(np.count_nonzero(predicted_positive & ~prediction_correct))
tn = int(np.count_nonzero(predicted_negative & prediction_correct))
fn = int(np.count_nonzero(predicted_negative & ~prediction_correct))

(tp, fp, tn, fn)

(241, 96, 1907, 256)

Посчитаем экономическую эффективность

In [230]:
# Unit economics from the task statement, in USD.
REVENUE_PER_RETAINED_CLIENT = 2  # earned for every true positive
COST_PER_CONTACT = 1             # spent on every client predicted to churn

income = tp * REVENUE_PER_RETAINED_CLIENT
cost = (tp + fp) * COST_PER_CONTACT

# With no true positives there is no income and the ratio is undefined;
# report NaN instead of raising ZeroDivisionError.
cost_to_income = cost / income if income else float('nan')
clean_income = income - cost

cost_to_income, clean_income

(0.6991701244813278, 145)

Посчитаем недополученные деньги (фронт работ для улучшения модели)

In [231]:
# Each false negative is a churner who was never contacted: 2 USD of revenue
# forgone minus the 1 USD contact cost that was not spent.
lost_money = fn * (2 - 1)
lost_money

256