"классическое" обучение - без распределенных вычислений на сжатых фичах

In [1]:
import sys
import pathlib
# Working directory of the notebook and its parent directory.
cwd = pathlib.Path.cwd()
project_root = cwd.parent
# Expose the parent directory to the import system
# (presumably what makes the later `dags.jobs.transformers` import resolvable — confirm).
sys.path.append(project_root.as_posix())
# The CSV inputs used below are read from <parent directory>/data.
data_folder = project_root / 'data'

import datetime as dt
import json
import logging
from functools import partial

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
# from catboost import CatBoostClassifier

from auxiliary import cv_fit, whole_fit, cv_compare
from dags.jobs.transformers import ColumnsCorrector, TimeDifference, Clusterer, Merger, AsDummies, PurchaseRatio, BasicFiller

# Record layout: timestamp, a '>' separator, then the message text.
FORMAT = '%(asctime)s > %(message)s'
# Route INFO-and-above records of the root logger to fit.log
# (the "parse logs" section reads this file back).
logging.basicConfig(
    format=FORMAT,
    level=logging.INFO,
    filename='fit.log',
)

In [2]:
# basic parameters
n_folds = 5                 # number of KFold splits used for cross-validation
bound_date = '2018-11-19'   # ISO date of the train-data cut-off (rows with an earlier buy_time are dropped)

## load data

In [3]:
# read train data; 'Unnamed: 0' is dropped right away
# (presumably an index column saved together with the CSV — confirm)
train_data = pd.read_csv(data_folder.joinpath('data_train.csv')).drop('Unnamed: 0', axis=1)

# extract required train data: keep rows whose buy_time is on/after `bound_date`.
# The cut-off comes from the `bound_date` parameter instead of a second hard-coded
# copy of the same date, so the two cannot drift apart.
# NOTE(review): fromisoformat() yields a naive datetime, so .timestamp() is evaluated in the
# machine's local timezone; assumes buy_time holds Unix timestamps in seconds — confirm.
bound_timestamp = dt.datetime.fromisoformat(bound_date).timestamp()
train_data = train_data[train_data['buy_time'] >= bound_timestamp]

# data/target split
data = train_data.drop('target', axis=1)
target = train_data['target']

# read compressed features
features = pd.read_csv(data_folder.joinpath('compressed_features.csv'))

# calc class weights ('balanced' mode), stored as {class label: weight}.
# `classes` is passed as an ndarray: that is the type documented by scikit-learn,
# and recent releases reject a plain list during parameter validation.
class_labels = np.array([0, 1])
balanced_weights = compute_class_weight('balanced', classes=class_labels, y=target)
# .tolist() keeps the keys plain Python ints (the dict is later serialised with json),
# and zipping with the labels ties every weight to its label rather than to its position
class_weights = dict(zip(class_labels.tolist(), balanced_weights))
# prepare folds for CV fit (shuffled, fixed seed so every estimator sees the same splits)
folds = KFold(n_splits=n_folds, shuffle=True, random_state=29)
# prepare scoring function: macro-averaged F1
f1_macro = partial(f1_score, average='macro')

# prepare estimators list: every model runs with its default hyper-parameters and seed 17.
# Only SGD, RandomForest and LGBM receive `class_weights`; GradientBoosting and XGB
# are constructed without any class weighting here.
estimators = [
    SGDClassifier(random_state=17, n_jobs=-1, class_weight=class_weights),
    # `loss` is left at its default, which is the same deviance / log-loss objective:
    # the explicit 'deviance' alias is deprecated since scikit-learn 1.1 and removed in 1.3
    GradientBoostingClassifier(random_state=17),
    RandomForestClassifier(random_state=17, class_weight=class_weights, n_jobs=-1),
    LGBMClassifier(random_state=17, class_weight=class_weights, n_jobs=-1),
    XGBClassifier(random_state=17),
    # CatBoostClassifier(random_state=17, logging_level='Silent', allow_writing_files=False)
]

## basic pipeline

In [6]:
# Ways of attaching the compressed features to the train rows that are compared in this notebook.
# The key is also the label written to the log, so the label can no longer drift
# from the Merger arguments that were actually used.
MERGE_VARIANTS = {
    'NEAREST': {'method': 'nearest', 'fillna': 'default'},
    'BACKWARD+MEAN': {'method': 'backward', 'fillna': 'mean'},
    'BACKWARD+NEAREST': {'method': 'backward', 'fillna': 'nearest'},
}

# variant evaluated in this run (switch by changing the active line)
# METHOD_PREFIX = 'NEAREST'
METHOD_PREFIX = 'BACKWARD+MEAN'
# METHOD_PREFIX = 'BACKWARD+NEAREST'

preparer = make_pipeline(
    Merger(features, **MERGE_VARIANTS[METHOD_PREFIX]),
    TimeDifference('feats_time', 'train_time'),

    Clusterer(['0', '1', '2'], n_clusters=8, random_state=13),
    # AsDummies(['cluster']),
    PurchaseRatio(by=['cluster']),

    # helper columns are dropped before the data reaches the estimator
    ColumnsCorrector('drop', ['id', 'train_time', 'feats_time']),
    BasicFiller(strategy='mean', apply_on_fly=True),
)

## CV fit

In [5]:
# # cross-validation fit, apply pipeline on whole data before splitting: the results in general are the same as for CVFIT
# prepared = preparer.fit_transform(data, target)
# for est in estimators:
#     cv_fit(est, prepared, target, cv=folds, scorer=f1_macro, logger=logging, prefix=f'[CVPRE] [{METHOD_PREFIX}]')

In [5]:
# cross-validation fit: the pipeline is passed to cv_fit together with the raw data
# (applied to train/valid separately) - it is logically the most correct way
cvfit_prefix = f'[CVFIT] [{METHOD_PREFIX}]'
for estimator in estimators:
    cv_fit(
        estimator, data, target,
        cv=folds,
        pipe=preparer,
        scorer=f1_macro,
        logger=logging,
        prefix=cvfit_prefix,
    )

[CVFIT] [BACKWARD+MEAN] SGDClassifier: 0.4250944480257971
[CVFIT] [BACKWARD+MEAN] GradientBoostingClassifier: 0.6058501875814677
[CVFIT] [BACKWARD+MEAN] RandomForestClassifier: 0.7395984747209049
[CVFIT] [BACKWARD+MEAN] LGBMClassifier: 0.7534257205470898
[CVFIT] [BACKWARD+MEAN] XGBClassifier: 0.6400530861279424


## fit on whole

In [65]:
# # fit and validate on whole data
# for est in estimators:
#     whole_fit(est, data, target, pipe=preparer, scorer=f1_macro, logger=logging, prefix=f'[WHOLE] [{METHOD_PREFIX}]')

[WHOLE] [BACKWARD+NEAREST] SGDClassifier: 0.46482219297679356
[WHOLE] [BACKWARD+NEAREST] GradientBoostingClassifier: 0.5579754575825518
[WHOLE] [BACKWARD+NEAREST] RandomForestClassifier: 0.8261811848099537
[WHOLE] [BACKWARD+NEAREST] LGBMClassifier: 0.7387400692796617
[WHOLE] [BACKWARD+NEAREST] XGBClassifier: 0.6481633960305903


## parse logs

In [13]:
# How many runs per estimator are expected at the tail of the log
# (presumably one per merge variant: NEAREST, BACKWARD+MEAN, BACKWARD+NEAREST — confirm).
N_LOGGED_RUNS = 3
LAST_LINES = len(estimators) * N_LOGGED_RUNS
LOG_COLUMNS = ['ftype', 'mtype', 'name', 'score']

with open('fit.log', 'r') as logs:
    # guard the zero case: a plain [-0:] slice would return the whole log
    tail = logs.readlines()[-LAST_LINES:] if LAST_LINES else []

# keep the last four whitespace-separated tokens of every record;
# colons are stripped first so that "<name>:" becomes "<name>"
lines = [line.strip().replace(':', '').split()[-4:] for line in tail]
# ignore blank or truncated records, which would otherwise misalign the columns
lines = [tokens for tokens in lines if len(tokens) == len(LOG_COLUMNS)]
result = pd.DataFrame(lines, columns=LOG_COLUMNS)
# the parsed scores are text; convert them so that sorting is numeric and
# pivot_table can average them (unparseable values become NaN)
result['score'] = pd.to_numeric(result['score'], errors='coerce')
# result.to_csv(data_folder.joinpath('cvfit_compare.csv'), index=False)

In [4]:
# result = pd.read_csv(data_folder.joinpath('cvfit_compare.csv'))

In [21]:
# overview: the five rows with the highest score
ranked = result.sort_values(by='score', ascending=False)
ranked.head(5)

Unnamed: 0,ftype,mtype,name,score
8,[CVFIT],[BACKWARD+MEAN],LGBMClassifier,0.7534617117595721
7,[CVFIT],[BACKWARD+MEAN],RandomForestClassifier,0.7393420823022738
13,[CVFIT],[NEAREST],LGBMClassifier,0.7376994846703424
3,[CVFIT],[BACKWARD+NEAREST],LGBMClassifier,0.7376726747992317
2,[CVFIT],[BACKWARD+NEAREST],RandomForestClassifier,0.7204945029996006


In [8]:
# merge variant (rows) x model (columns) table for the [CVFIT] runs
cvfit_rows = result[result['ftype'] == '[CVFIT]']
cvfit_rows.pivot_table(values='score', index='mtype', columns='name')
# result[result['ftype'] == '[WHOLE]'].pivot_table(index='mtype', columns='name', values='score')

name,GradientBoostingClassifier,LGBMClassifier,RandomForestClassifier,SGDClassifier,XGBClassifier
mtype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
[BACKWARD+MEAN],0.606686,0.753462,0.739342,0.425094,0.638578
[BACKWARD+NEAREST],0.580423,0.737673,0.720495,0.437933,0.62254
[NEAREST],0.578498,0.737699,0.720219,0.44958,0.622437


На параметрах по умолчанию наилучший результат показали LGBMClassifier и RandomForestClassifier.

При этом метрика варьируется в зависимости от способа сопоставления фичей: вариант `backward + mean` эмпирически дает результат выше, а `backward + nearest` практически не отличается от `nearest`.<br>
Предположение о сопоставлении фичей остается неизменным - профили пользователей, сформированные после даты коммерческого предложения, вряд ли могли влиять на решение о покупке.

## GridSearch

In [26]:
# Preparation pipeline for the grid search: the `backward` + `mean` merge variant.
gs_steps = [
    Merger(features, method='backward', fillna='mean'),
    TimeDifference('feats_time', 'train_time'),
    Clusterer(['0', '1', '2'], n_clusters=8, random_state=13),
    PurchaseRatio(by=['cluster']),
    # helper columns are dropped before the data reaches the estimator
    ColumnsCorrector('drop', ['id', 'train_time', 'feats_time']),
    BasicFiller(strategy='mean', apply_on_fly=True),
]
preparer = make_pipeline(*gs_steps)

# Constructor arguments shared by every estimator that takes part in the search.
defaults = dict(
    random_state=17,
    n_jobs=-1,
    class_weight=class_weights,
)

# Models to tune; only LGBM is active in this run.
estimators = [
    # RandomForestClassifier(**defaults),
    LGBMClassifier(**defaults),
]

# =============================================================
# grids = [
#     {   # RandomForest grid
#         'n_estimators': [50, 100, 150],
#         'max_depth': [None, 3, 4, 6],
#         'min_samples_leaf': [20, 31, 45],
#     },
#     # {   # LGBM grid
#     #     'n_estimators': [50, 100, 150],
#     #     'learning_rate': [0.05, 0.1, 0.2, 0.3],
#     #     'max_depth': [-1, 4, 6],
#     #     'num_leaves': [20, 31, 45, 60],
#     # }
# ]
# =============================================================
# Search space for the active estimator; "def" marks the value noted as the library default.
lgbm_grid = dict(
    n_estimators=[100, 125, 150, 200],         # def=100
    learning_rate=[0.05, 0.1, 0.15, 0.2],      # def=0.1
    # max_depth=[*range(0, 6)],                # def=-1
    # max_depth=[0, *range(3, 8)],             # def=-1
    num_leaves=[17, 23, 31, 37, 43],           # def=31
)
# one grid per entry of `estimators`, in the same order
grids = [lgbm_grid]

# Run the grid search / comparison helper for every estimator with its grid.
# NOTE(review): this rebinds `result`, which the "parse logs" section also uses for the parsed log table.
result, gscv = cv_compare(
    estimators, data, target,
    grids=grids,
    cv=folds,
    pipe=preparer,
    scorer=f1_macro,
    logger=logging,
)
# show the returned scores as a table
pd.DataFrame(result)

[GridSearchCV] [CVFIT] LGBMClassifier: 0.7534041122934074
[GridSearchCV] [WHOLE] LGBMClassifier: 0.7556387802624731


Unnamed: 0,model,GS score,CV score,WH score
0,LGBMClassifier,0.779123,0.753404,0.755639


In [9]:
# position (within `estimators`) of the model whose parameters are exported
USE_ESTIMATOR = 0
# NOTE(review): this exports get_params() of the estimator object itself; whether that includes
# the values chosen by the grid search depends on cv_compare updating the estimator in place — confirm.
fit_params = estimators[USE_ESTIMATOR].get_params()
# JSON object keys are always strings, so integer keys of a class_weight dict
# come back as '0' / '1' when the file is loaded.
# The context manager closes (and flushes) the file, which the bare open() call never did.
with data_folder.joinpath('fit_params.json').open('w') as params_file:
    json.dump(fit_params, params_file)

In [43]:
#