In [1]:
import pymssql
import pandas as pd
import numpy as np
import pickle
from matplotlib import pyplot as plt
import datetime as dt
import warnings
import dateutil.relativedelta
from sklearn.model_selection import train_test_split as train
from mlxtend.evaluate import lift_score
from sklearn.metrics import make_scorer, roc_curve, auc, roc_auc_score
from sklearn.model_selection import KFold, GridSearchCV
import lightgbm as lgbm 
from sklearn import ensemble
import uncertainties
import random
from random import randint
import seaborn as sns
from uncertainties import ufloat
from sklearn.metrics import precision_recall_curve, classification_report
from dateutil.relativedelta import relativedelta
warnings.filterwarnings('ignore')

pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

## Загрузка данных

In [None]:
from dataclasses import dataclass
from sqlalchemy import create_engine, event, DateTime, Column, String, MetaData, Integer, \
    Binary, PrimaryKeyConstraint, Date
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, Session
from contextlib import contextmanager
from urllib.parse import quote_plus

@dataclass(frozen=True)
class DbConfig:
    db_server = "10.252.4.116"
    pwd = "MxGX91Zy"
    uid = "ext-E.Stepanova"
    db_name = "DWH_Globus"
    driver = r"{ODBC Driver 17 for SQL Server}"
    params = quote_plus(
        'DRIVER={DRIVER};SERVER={DB_SERVER};DATABASE={DB_NAME};UID={UID};PWD={PWD}'.format(
            DB_SERVER=db_server, DB_NAME=db_name, UID=uid, PWD=pwd, DRIVER=driver
        ))
    print(params)
    conn_str = 'mssql+pyodbc:///?odbc_connect={}'.format(params)


class Db:
    def __init__(self):
        self._db_conf = DbConfig()
        self.engine = None

    def create_engine(self):
        if self.engine is None:
            self.engine = create_engine(self._db_conf.conn_str)

        @event.listens_for(self.engine, 'before_cursor_execute')
        def receive_before_cursor_execute(conn, cursor, statement, params, context, executemany):
            if executemany:
                cursor.fast_executemany = True
                cursor.commit()

    @contextmanager
    def open_session(self):
        """Provide a transactional scope around a series of operations."""
        session: Session = sessionmaker(bind=self.engine)()
        try:
            yield session
            session.commit()
        except:
            session.rollback()
            raise
        finally:
            session.close()

In [None]:
str_sql_query_churn="""select * from DWH_Globus.[cda_prom].[ForKorusTempChurnWeeklyKAT_2020] with (nolock) """

In [None]:
db = Db()
db.create_engine()
chunks = []
for chunk in pd.read_sql(str_sql_query_churn, con=db.engine, chunksize = 10**5):
    chunks.append(chunk)
df= pd.concat(chunks)

## Предобработка данных, смена типов, 

In [None]:
#df = df.drop('CNT_VISITS_DAY', axis=1)#сережа должен убрать из витрины
df['Datum'] = pd.to_datetime(df['Datum'])
df['CNT_VISITS_WEEK'] = df['CNT_VISITS_WEEK'].astype(np.float16)
df['CNT_VISITS_MONTH'] = df['CNT_VISITS_MONTH'].astype(np.float16)
df['CNT_VISITS_THREE_MONTH'] = df['CNT_VISITS_THREE_MONTH'].astype(np.float16)
df['CNT_VISITS_DAY_TO_LAST_WEEK'] = df['CNT_VISITS_DAY_TO_LAST_WEEK'].astype(np.float16)
df['DIV_VISITS_week_TO_AVG_VISITS_SIX_WEEK'] = df['DIV_VISITS_week_TO_AVG_VISITS_SIX_WEEK'].astype(np.float64)
df['DIV_VISITS_month_TO_LAST_MONTH'] = df['DIV_VISITS_month_TO_LAST_MONTH'].astype(np.float64)
df['DIV_VISITS_month_TO_AVG_VISITS_THREE_MONTH'] = df['DIV_VISITS_month_TO_AVG_VISITS_THREE_MONTH'].astype(np.float64)
df['DIV_UNITS_IN_CHECK_SIX_WEEK'] = df['DIV_UNITS_IN_CHECK_SIX_WEEK'].astype(np.float64)
df['DIV_UNITS_IN_CHECK_THREE_week'] = df['DIV_UNITS_IN_CHECK_THREE_week'].astype(np.float64)
df['DIV_UNITS_IN_CHECK_TO_LAST_MONTH'] = df['DIV_UNITS_IN_CHECK_TO_LAST_MONTH'].astype(np.float64)
df['DIV_UNITS_IN_CHECK_WEEK'] = df['DIV_UNITS_IN_CHECK_WEEK'].astype(np.float64)
df['AVG_COST_PRODUCT_IN_CHECK'] = df['AVG_COST_PRODUCT_IN_CHECK'].astype(np.float64)
df['AVG_UNITS_IN_CHECK'] = df['AVG_UNITS_IN_CHECK'].astype(np.float64)
df['AVG_SUM_CHECK'] = df['AVG_SUM_CHECK'].astype(np.float64)
df['DIV_VISITS_week_TO_AVG_VISITS_THREE_WEEK'] = df['DIV_VISITS_week_TO_AVG_VISITS_THREE_WEEK'].astype(np.float64)
df['DIV_VISITS_week_TO_AVG_VISITS_EIGHT_WEEK'] = df['DIV_VISITS_week_TO_AVG_VISITS_EIGHT_WEEK'].astype(np.float64)
df['DIV_VISITS_week_TO_AVG_VISITS_FOUR_WEEK'] = df['DIV_VISITS_week_TO_AVG_VISITS_FOUR_WEEK'].astype(np.float64)
df['COMM_AGREEMENT'] = df['COMM_AGREEMENT'].astype(np.float16)
df['EMPLOYEE'] = df['EMPLOYEE'].astype(np.float16)
df['FACT_FAVE_STORE'] = df['FACT_FAVE_STORE'].astype(np.float16)
df['CNT_VISITS_WEEK_TO_LAST_WEEK'] = df['CNT_VISITS_WEEK_TO_LAST_WEEK'].astype(np.float16)
# преобразование в количественный признак, если в витрине не в числовом формате
#df.at[df['MEMS_TYPE'] == 'Индивидуальное', 'MEMS_TYPE'] = 0
#df.at[df['MEMS_TYPE'] == 'Анонимное', 'MEMS_TYPE'] = 1
# не определен тип клиента 
df.at[df['MEMS_TYPE'].isnull(), 'MEMS_TYPE'] = 2 
df['div_unique_category_lvl_3_week_last_week'] = df['div_unique_category_lvl_3_week_last_week'].astype(np.float64)
df['activity_key_kat_week'] = df['activity_key_kat_week'].astype(np.float64)
df['activity_key_kat_1month'] = df['activity_key_kat_1month'].astype(np.float64)
df['activity_key_kat_3month'] = df['activity_key_kat_3month'].astype(np.float64)
df['div_unique_category_lvl_3_month_three_last_month'] = df['div_unique_category_lvl_3_month_three_last_month'].astype(np.float64)
df['div_unique_category_lvl_3_month_six_last_month'] = df['div_unique_category_lvl_3_month_six_last_month'].astype(np.float64)
df['activity_key_kat_6month'] = df['activity_key_kat_6month'].astype(np.float64)

## Создание целевой переменной

In [None]:
df=df.sort_values(by='Datum', ascending=True)

In [None]:
df['Churn'] = df['cnt_days_last_visit'].apply(lambda x: 1 if x >=7 else 0)
df['Churn'] = df.groupby('mems_id')['Churn'].shift(-1)
df=df.dropna(subset=['Churn'])
df['Churn']=df['Churn'].astype('int16')

In [None]:
df = df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(0)

## Разделение на папки для обучения и тестирования 

In [None]:
df['Fold_1'] = 0 
df['Fold_2'] = 0 
df['Fold_3'] = 0 
df['Validation'] = 0

df.loc[(df['Datum'] >= '2018-06-01') & (df['Datum'] <= '2018-09-30'), 'Fold_1'] = 'train'
df.loc[(df['Datum'] >= '2018-10-15') & (df['Datum'] <= '2018-11-30'), 'Fold_1'] = 'test'
df.loc[(df['Datum'] >= '2018-08-01') & (df['Datum'] <= '2018-12-31'), 'Fold_2'] = 'train'
df.loc[(df['Datum'] >= '2019-01-15') & (df['Datum'] <= '2019-02-28'), 'Fold_2'] = 'test'
df.loc[(df['Datum'] >= '2018-10-01') & (df['Datum'] <= '2019-02-28'), 'Fold_3'] = 'train'
df.loc[(df['Datum'] >= '2019-03-15') & (df['Datum'] <= '2019-04-30'), 'Fold_3'] = 'test'
df.loc[(df['Datum'] >= '2018-07-01') & (df['Datum'] <= '2019-03-31'), 'Validation'] = 'train'
df.loc[(df['Datum'] >= '2019-06-01') & (df['Datum'] <= '2019-08-31'), 'Validation'] = 'test'

In [None]:
df.to_hdf('/mnt/cda/week_churn_kat_1802.hdf', "df")

## Считываем собранный файл

In [2]:
df = pd.read_hdf('/mnt/cda/week_churn_kat_1802.hdf', "df")

In [3]:
print ("Кол-во уникальных клиентов", df['mems_id'].nunique())
print ()
print ("Кол-во дат за весь период", df['Datum'].nunique())
print ()
print ("Первая дата", df['Datum'].min())
print ()
print ("Последняя дата", df['Datum'].max())

Кол-во уникальных клиентов 2027802

Кол-во дат за весь период 61

Первая дата 2018-06-03 00:00:00

Последняя дата 2019-07-28 00:00:00


## Создаем набор данных для обучения

In [11]:
train_cv = df[(df['Fold_1'] == 'train') | (df['Fold_2'] == 'train') |\
              (df['Fold_1'] == 'test') | (df['Fold_2'] == 'test')|\
             (df['Fold_3'] == 'train') | (df['Fold_3'] == 'test')].sample(500000).reset_index()

### Индексы данных для обучения

In [13]:
train_ind_1 = train_cv[train_cv['Fold_1'] == 'train'].index.values.astype(int)
test_ind_1 = train_cv[train_cv['Fold_1'] == 'test'].index.values.astype(int)
train_ind_2 = train_cv[train_cv['Fold_2'] == 'train'].index.values.astype(int)
test_ind_2 = train_cv[train_cv['Fold_2'] == 'test'].index.values.astype(int)
train_ind_3 = train_cv[train_cv['Fold_3'] == 'train'].index.values.astype(int)
test_ind_3 = train_cv[train_cv['Fold_3'] == 'test'].index.values.astype(int)


### Целевая переменная и признаки для обучения модели

In [15]:
y_train_cv = train_cv['Churn']
X_train_cv = train_cv.drop(['index', 'Churn', 'mems_id', 'Datum', 
                              'Fold_1', 'Fold_2', 'Fold_3', 'Validation'],axis=1)


Размер тестовой и убочающей выборки (500000, 42) (500000,)


## Бизнес метрика для обучения модели

In [16]:
def custom_asymmetric_train(  y_true, y_pred ):# для грида такой порядок переменных
    function=np.where((y_pred>0.5)&(y_true==0), -1000, 0)
    function2=np.where((y_pred>0.5)&(y_true==1), 1, 0)
    res=np.mean(function+function2).astype(float)
    return  res

In [18]:
from sklearn.metrics import make_scorer
my_func = make_scorer(custom_asymmetric_train, greater_is_better=True)

### Генератор для разделения на обучающую и тестовую выборку

In [19]:
def generator(train_cv):
    i = 1
    while i <= 3:
        train_ind_1 = train_cv[train_cv['Fold_'+str(i)] == 'train'].index.values.astype(int)
        test_ind_1 = train_cv[train_cv['Fold_'+str(i)] == 'test'].index.values.astype(int)
        yield train_ind_1, test_ind_1
        i += 1

## Модель 

In [20]:
mdl = lgbm.LGBMClassifier(silent=True)

In [21]:
gridParams = {
    'learning_rate': [0.5, 0.2, 0.7],
    'n_estimators':  [450, 500, 550],
    'num_leaves': [6, 5, 4, 3],
    'boosting_type' : ['rf'],
    'objective' : ['binary'],
    'random_state' : [501],
    "max_depth":[2, 3, 4, 5],
    'bagging_fraction':[0.1 ,0.3, 0.5,  0.6],
    'bagging_freq': [1],
    'colsample_bytree' : [0.5, 0.7, 1],
    'subsample' : [0.1, 0.5],
    'reg_alpha' : [5, 3, 1],
    'reg_lambda' : [0.4, 0.6, 0.8, 1]
    }

### Подбор параметров

In [22]:
grid_lgbm = GridSearchCV(mdl, gridParams,
                        verbose=3,
                        cv = generator(train_cv),
                         n_jobs=11, scoring=my_func)

grid_3 = grid_lgbm.fit(X_train_cv, y_train_cv)

Fitting 3 folds for each of 15552 candidates, totalling 46656 fits


[Parallel(n_jobs=11)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=11)]: Done  10 tasks      | elapsed:   14.5s
[Parallel(n_jobs=11)]: Done 106 tasks      | elapsed:  1.7min
[Parallel(n_jobs=11)]: Done 266 tasks      | elapsed:  4.4min
[Parallel(n_jobs=11)]: Done 490 tasks      | elapsed:  8.4min
[Parallel(n_jobs=11)]: Done 778 tasks      | elapsed: 73.3min
[Parallel(n_jobs=11)]: Done 1130 tasks      | elapsed: 79.4min
[Parallel(n_jobs=11)]: Done 1546 tasks      | elapsed: 86.6min
[Parallel(n_jobs=11)]: Done 2026 tasks      | elapsed: 94.8min
[Parallel(n_jobs=11)]: Done 2570 tasks      | elapsed: 104.6min
[Parallel(n_jobs=11)]: Done 3178 tasks      | elapsed: 115.3min
[Parallel(n_jobs=11)]: Done 3850 tasks      | elapsed: 127.0min
[Parallel(n_jobs=11)]: Done 4586 tasks      | elapsed: 139.6min
[Parallel(n_jobs=11)]: Done 5386 tasks      | elapsed: 153.7min
[Parallel(n_jobs=11)]: Done 6250 tasks      | elapsed: 170.9min
[Parallel(n_jobs=11)]: Done 7178 tasks  

In [None]:
print("Ошибка на обучении:", grid_3.best_score_)
print("Модель с лучшими параметрами:", grid_3.best_params_)

### Обучающий и тестовый набор данных для модели с лучшими параметрами

In [24]:
df_test = df[df['Validation'] == 'test']


y_test = df_test['Churn']
X_test = df_test.drop(['Churn', 'mems_id', 'Datum', 
                              'Fold_1', 'Fold_2', 'Fold_3', 'Validation'], axis=1)

df_train = df[df['Validation'] == 'train']

y_train = df_train['Churn']
X_train = df_train.drop(['Churn', 'mems_id', 'Datum', 
                              'Fold_1', 'Fold_2', 'Fold_3', 'Validation'], axis=1)

### Обучение модели с ранним завершением, чтобы выбрать лучшую модель по бизнес метрике 

In [25]:
mdl_metric = lgbm.LGBMClassifier(**grid_3.best_params_, metric='custom')

mdl_metric.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
   eval_metric=custom_asymmetric_train,
    early_stopping_rounds=1000,
    verbose=5
)

Training until validation scores don't improve for 1000 rounds.
[5]	training's busines_metric: -170.635
[10]	training's busines_metric: -147.465
[15]	training's busines_metric: -140.648
[20]	training's busines_metric: -138.333
[25]	training's busines_metric: -138.203
[30]	training's busines_metric: -136.527
[35]	training's busines_metric: -135.522
[40]	training's busines_metric: -133.938
[45]	training's busines_metric: -133.693
[50]	training's busines_metric: -132.897
[55]	training's busines_metric: -132.474
[60]	training's busines_metric: -132.771
[65]	training's busines_metric: -133.429
[70]	training's busines_metric: -133.756
[75]	training's busines_metric: -133.76
[80]	training's busines_metric: -133.829
[85]	training's busines_metric: -133.841
[90]	training's busines_metric: -134.036
[95]	training's busines_metric: -134.252
[100]	training's busines_metric: -134.501
[105]	training's busines_metric: -133.898
[110]	training's busines_metric: -134.12
[115]	training's busines_metric: -

LGBMClassifier(bagging_fraction=0.3, boosting_type='gbdt', class_weight=None,
               colsample_bytree=0.5, importance_type='split', learning_rate=0.2,
               max_depth=2, metric='custom', min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=500,
               n_jobs=-1, num_leaves=3, objective='binary', random_state=501,
               reg_alpha=0.1, reg_lambda=1, silent=True, subsample=0.1,
               subsample_for_bin=200000, subsample_freq=0)

In [26]:
mdl_metric.best_score_

defaultdict(dict, {'training': {'busines_metric': -132.1383352914261}})

### Проверка процентного соотношения ошибок

In [28]:
def negative_percent(y_pred, y_true):
    residual = (2*y_true - y_pred)
    true=np.sum(np.where(residual==1, 1, 0))/len(y_true)
    false=np.sum(np.where(residual==-1, 1, 0))/len(y_true)
    falsep=np.sum(np.where(residual==2, 1, 0))/len(y_true)
    return 'true_neg:', true, 'false_neg:', false, 'false_p:', falsep, 'true_p:', 1-false-true-falsep

In [29]:
negative_percent(mdl_metric.predict(X_test), y_test)

('true_neg:',
 0.6550504197068816,
 'false_neg:',
 0.13279338571113297,
 'false_p:',
 0.061432373069966385,
 'true_p:',
 0.15072382151201905)

In [30]:
print("Accuracy score (training): {0:.3f}".format(mdl_metric.score(X_train, y_train)))
print("Accuracy score (validation): {0:.3f}".format(mdl_metric.score(X_test, y_test)))

Accuracy score (training): 0.769
Accuracy score (validation): 0.806


In [44]:
with open("Churn_forest2802.pkl", 'wb') as fid:
    pickle.dump(mdl_metric, fid)