«Классическое» обучение — без распределённых вычислений, на сжатых фичах

In [6]:
import sys
import pathlib
# Resolve locations relative to the notebook's working directory.
# The parent directory is put on sys.path so the project-local imports below
# (`dags...`) can resolve - presumably it is the project root; the data files
# are read from its `data` subfolder.
cwd = pathlib.Path.cwd()
project_root = cwd.parent.as_posix()
# guard keeps the cell idempotent: re-running it must not append duplicates
if project_root not in sys.path:
    sys.path.append(project_root)
data_folder = cwd.parent.joinpath('data')

import datetime as dt
import json
import logging

import numpy as np
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight

from dags.jobs.transformers import ColumnsCorrector, TimeDifference, Clusterer, Merger, AsDummies, PurchaseRatio, BasicFiller
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, GridSearchCV

from auxiliary import cv_fit, whole_fit, cv_compare
from functools import partial
from sklearn.metrics import f1_score

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
# from catboost import CatBoostClassifier

# log record layout: "<timestamp> > <message>"
FORMAT = '%(asctime)s > %(message)s'
# send INFO-and-above records to fit.log; the "parse logs" section reads this file back
logging.basicConfig(
    format=FORMAT,
    level=logging.INFO,
    filename='fit.log',
)

In [7]:
# basic parameters
# lower bound of the training window (ISO date); the load cell filters `buy_time` from this same date
bound_date = '2018-11-19'
# number of KFold splits used for cross-validation
n_folds = 5

## load data

In [8]:
# read train data; 'Unnamed: 0' is dropped (presumably an index column saved with the CSV)
train_data = pd.read_csv(data_folder.joinpath('data_train.csv')).drop('Unnamed: 0', axis=1)
# keep only the rows bought on/after `bound_date` (single source of truth for the date).
# NOTE(review): the naive datetime is converted using the machine's local timezone -
# confirm this matches how `buy_time` was produced.
bound_timestamp = dt.datetime.fromisoformat(bound_date).timestamp()
train_data = train_data[train_data['buy_time'] >= bound_timestamp]
# data/target split
data = train_data.drop('target', axis=1)
target = train_data['target']

# read compressed features
features = pd.read_csv(data_folder.joinpath('compressed_features.csv'))

# calc class weights; `classes` is passed as an ndarray because recent scikit-learn
# versions validate the argument type and reject a plain list
class_labels = np.array([0, 1])
balanced_weights = compute_class_weight('balanced', classes=class_labels, y=target)
# plain int keys / float values keep the mapping JSON-serialisable (estimator params are dumped later)
class_weights = {int(label): float(weight) for label, weight in zip(class_labels, balanced_weights)}
# prepare folds for CV fit
folds = KFold(n_splits=n_folds, shuffle=True, random_state=29)
# prepare scoring function
f1_macro = partial(f1_score, average='macro')

# prepare estimators list
estimators = [
    SGDClassifier(random_state=17, n_jobs=-1, class_weight=class_weights),
    # `loss='deviance'` was the default and is removed in recent scikit-learn;
    # omitting it selects the same loss on every version
    GradientBoostingClassifier(random_state=17),
    RandomForestClassifier(random_state=17, class_weight=class_weights, n_jobs=-1),
    LGBMClassifier(random_state=17, class_weight=class_weights, n_jobs=-1),
    XGBClassifier(random_state=17),
    # CatBoostClassifier(random_state=17, logging_level='Silent', allow_writing_files=False)
]

## basic pipeline

In [9]:
# Label written into the log prefix; it must describe the Merger configuration used below.
# Other configurations that were tried (switch the Merger step and this label together):
#   'NEAREST'        -> Merger(features, method='nearest', fillna='default')
#   'BACKWARD+MEAN'  -> Merger(features, method='backward', fillna='mean')
METHOD_PREFIX = 'BACKWARD+NEAREST'

# preparation steps, listed in the order they are applied
preparation_steps = [
    Merger(features, method='backward', fillna='nearest'),
    TimeDifference('feats_time', 'train_time'),
    Clusterer(['0', '1', '2'], n_clusters=8, random_state=13),
    # disabled step: AsDummies(['cluster'])
    PurchaseRatio(by=['cluster']),
    ColumnsCorrector('drop', ['id', 'train_time', 'feats_time']),
    BasicFiller(strategy='mean', apply_on_fly=True),
]
preparer = make_pipeline(*preparation_steps)

## CV fit

In [5]:
# Disabled experiment ([CVPRE]): cross-validation with the pipeline applied to the whole data
# before splitting. Author's note: the results are in general the same as for [CVFIT].
# prepared = preparer.fit_transform(data, target)
# for est in estimators:
#     cv_fit(est, prepared, target, cv=folds, scorer=f1_macro, logger=logging, prefix=f'[CVPRE] [{METHOD_PREFIX}]')

In [10]:
# Cross-validated fit. The pipeline is handed to cv_fit (pipe=preparer) instead of being
# applied up front, so train/valid parts are prepared separately - per the author,
# logically the most correct way.
cv_prefix = f'[CVFIT] [{METHOD_PREFIX}]'
for estimator in estimators:
    cv_fit(
        estimator, data, target,
        cv=folds, pipe=preparer, scorer=f1_macro,
        logger=logging, prefix=cv_prefix,
    )

[CVFIT] [BACKWARD+NEAREST] SGDClassifier: 0.4379331979235939
[CVFIT] [BACKWARD+NEAREST] GradientBoostingClassifier: 0.5804225783314
[CVFIT] [BACKWARD+NEAREST] RandomForestClassifier: 0.7204945029996006
[CVFIT] [BACKWARD+NEAREST] LGBMClassifier: 0.7376726747992317
[CVFIT] [BACKWARD+NEAREST] XGBClassifier: 0.6225401324826635


## fit on whole

In [65]:
# Fit each estimator on the whole data set and score it on that same data
# (no hold-out, so these scores are expected to be optimistic).
whole_prefix = f'[WHOLE] [{METHOD_PREFIX}]'
for estimator in estimators:
    whole_fit(
        estimator, data, target,
        pipe=preparer, scorer=f1_macro,
        logger=logging, prefix=whole_prefix,
    )

[WHOLE] [BACKWARD+NEAREST] SGDClassifier: 0.46482219297679356
[WHOLE] [BACKWARD+NEAREST] GradientBoostingClassifier: 0.5579754575825518
[WHOLE] [BACKWARD+NEAREST] RandomForestClassifier: 0.8261811848099537
[WHOLE] [BACKWARD+NEAREST] LGBMClassifier: 0.7387400692796617
[WHOLE] [BACKWARD+NEAREST] XGBClassifier: 0.6481633960305903


## parse logs

In [66]:
# Number of trailing log records to parse: one per estimator for each fit mode
# ([CVFIT], [WHOLE]) and each merge method that was run
# (NEAREST, BACKWARD+MEAN, BACKWARD+NEAREST).
N_FIT_MODES = 2
N_MERGE_METHODS = 3
LAST_LINES = len(estimators) * N_FIT_MODES * N_MERGE_METHODS

with open('fit.log', 'r') as logs:
    tail = logs.readlines()[-LAST_LINES:]

# A record ends with "<ftype> <mtype> <name>: <score>"; colons are stripped and the
# last four whitespace-separated tokens are kept (this discards the timestamp part).
lines = [line.strip().replace(':', '').split()[-4:] for line in tail]
result = pd.DataFrame(lines, columns=['ftype', 'mtype', 'name', 'score'])
# Scores are parsed as text; convert them so that sorting is numeric and
# pivot_table has numeric values to aggregate.
result['score'] = result['score'].astype(float)

In [71]:
# five highest-scoring log records
result.sort_values(by='score', ascending=False).head(5)

Unnamed: 0,ftype,mtype,name,score
7,[WHOLE],[NEAREST],RandomForestClassifier,0.8262131255209326
27,[WHOLE],[BACKWARD+NEAREST],RandomForestClassifier,0.8261811848099537
17,[WHOLE],[BACKWARD+MEAN],RandomForestClassifier,0.8213173279013498
18,[WHOLE],[BACKWARD+MEAN],LGBMClassifier,0.7545058962682158
13,[CVFIT],[BACKWARD+MEAN],LGBMClassifier,0.753410157714281


In [76]:
# cross-validation scores: merge method (rows) x estimator (columns)
result.loc[result['ftype'] == '[CVFIT]'].pivot_table(values='score', index='mtype', columns='name')

name,GradientBoostingClassifier,LGBMClassifier,RandomForestClassifier,SGDClassifier,XGBClassifier
mtype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
[BACKWARD+MEAN],0.586954,0.75341,0.739461,0.425094,0.644542
[BACKWARD+NEAREST],0.550974,0.73749,0.720716,0.437933,0.619077
[NEAREST],0.551522,0.737395,0.720384,0.44958,0.619565


In [77]:
# whole-data scores: merge method (rows) x estimator (columns)
result.loc[result['ftype'] == '[WHOLE]'].pivot_table(values='score', index='mtype', columns='name')

name,GradientBoostingClassifier,LGBMClassifier,RandomForestClassifier,SGDClassifier,XGBClassifier
mtype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
[BACKWARD+MEAN],0.584157,0.754506,0.821317,0.328661,0.665028
[BACKWARD+NEAREST],0.557975,0.73874,0.826181,0.464822,0.648163
[NEAREST],0.556358,0.738444,0.826213,0.43894,0.650315


На параметрах по умолчанию наилучший результат показали RandomForestClassifier и LGBMClassifier.

При этом метрика незначительно варьируется в зависимости от способа сопоставления фичей: варианты `backward` эмпирически дают результат выше.<br>
Предположение о сопоставлении фичей остаётся неизменным: профили пользователей, сформированные после даты коммерческого предложения, вряд ли могли влиять на решение о покупке.
Чтобы модель могла отследить эту зависимость, был введён признак `time_diff`.

## GridSearch

In [4]:
# Preparation pipeline for the grid search (backward merge, 'nearest' fill).
grid_search_steps = (
    Merger(features, method='backward', fillna='nearest'),
    TimeDifference('feats_time', 'train_time'),
    Clusterer(['0', '1', '2'], n_clusters=8, random_state=13),
    PurchaseRatio(by=['cluster']),
    ColumnsCorrector('drop', ['id', 'train_time', 'feats_time']),
    BasicFiller(strategy='mean', apply_on_fly=True),
)
preparer = make_pipeline(*grid_search_steps)

# constructor arguments shared by every estimator being tuned
defaults = dict(random_state=17, n_jobs=-1, class_weight=class_weights)

# estimators to tune; LGBMClassifier(**defaults) is switched off for this run
estimators = [RandomForestClassifier(**defaults)]

# Parameter grids - presumably matched to `estimators` by position (verify in cv_compare).
# Previously used grids, kept for reference (disabled):
#   RandomForest: n_estimators [50, 100, 150], max_depth [None, 3, 4, 6],
#                 min_samples_leaf [20, 31, 45]
#   LGBM:         n_estimators [50, 100, 150], learning_rate [0.05, 0.1, 0.2, 0.3],
#                 max_depth [-1, 4, 6], num_leaves [20, 31, 45, 60]
random_forest_grid = dict(
    n_estimators=[50, 65, 75, 100],
    # max_depth=[None, *range(3, 8)] is excluded from this search
    min_samples_leaf=[3, 4, 5, 7, 11, 13],
)
grids = [random_forest_grid]

result, gscv = cv_compare(
    estimators, data, target,
    grids=grids, cv=folds, pipe=preparer,
    scorer=f1_macro, logger=logging,
)
pd.DataFrame(result)

[GridSearchCV] [CVFIT] RandomForestClassifier: 0.7378030158943013
[GridSearchCV] [WHOLE] RandomForestClassifier: 0.7445321437620445


Unnamed: 0,model,GS score,CV score,WH score
0,RandomForestClassifier,0.78079,0.737803,0.744532


In [9]:
# index (within `estimators`) of the model whose parameters are exported
USE_ESTIMATOR = 0
# NOTE(review): get_params() returns whatever the estimator object currently holds;
# confirm that cv_compare leaves the tuned parameters set on it before relying on this file.
params_path = data_folder.joinpath('fit_params.json')
# context manager closes (and flushes) the file even if serialisation fails
with open(params_path.as_posix(), 'w') as params_file:
    json.dump(estimators[USE_ESTIMATOR].get_params(), params_file)

In [43]:
#