«Классическое» обучение — без распределённых вычислений, на сжатых фичах

In [2]:
import sys
import pathlib
# Put the parent of the current working directory on sys.path; this has to
# happen before the `dags.jobs.transformers` import below.
# NOTE(review): presumably the notebook is started from a sub-folder of the
# project root — confirm.
cwd = pathlib.Path().cwd()
sys.path.append(cwd.parent.as_posix())
# Input CSV files are read from the `data` directory next to the working directory.
data_folder = cwd.parent.joinpath('data')

import logging
import json
import datetime as dt
import pandas as pd

from dags.jobs.transformers import ColumnsCorrector, TimeDifference, Clusterer
from sklearn.pipeline import make_pipeline
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import KFold, GridSearchCV

from sklearn.metrics import f1_score
from functools import partial
from common import cv_fit, cv_compare
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Route INFO-and-above records of the root logger to the file `fit.logs`;
# the cells below use it to record merge methods, parameters and scores.
logging.basicConfig(filename='fit.logs', level=logging.INFO)

In [2]:
# Experiment-wide settings.
# ISO date of the cut-off: rows with an earlier `buy_time` are dropped from the train sample.
bound_date: str = '2018-11-19'
# Number of splits used by the KFold cross-validation.
n_folds: int = 5

## load data

In [3]:
# Load the labelled offers and discard the `Unnamed: 0` column
# (presumably an index saved together with the CSV).
train_csv = data_folder / 'data_train.csv'
train_data = pd.read_csv(train_csv).drop(columns='Unnamed: 0')

# Keep only rows bought on or after `bound_date`, ordered by purchase time.
# NOTE(review): `bound_date` is parsed as a naive datetime, so `.timestamp()`
# interprets it in the local time zone of the machine running the notebook —
# confirm that this matches the time zone of `buy_time`.
bound_timestamp = dt.datetime.fromisoformat(bound_date).timestamp()
used_mask = train_data['buy_time'] >= bound_timestamp
train_data = train_data.loc[used_mask].sort_values(by='buy_time')

train_data.head()

Unnamed: 0,id,vas_id,buy_time,target
220950,2442789,4.0,1542574800,1.0
112530,2461729,1.0,1542574800,0.0
762642,1954991,2.0,1542574800,1.0
530675,2189588,2.0,1542574800,1.0
762686,200964,1.0,1542574800,0.0


In [4]:
# Load the compressed feature table and discard the `Unnamed: 0` column
# (presumably an index saved together with the CSV).
features_csv = data_folder / 'compressed_features.csv'
features = pd.read_csv(features_csv).drop(columns='Unnamed: 0')

# Keep only feature rows whose `id` occurs in the train sample and order them
# by time: the as-of merge further down needs both frames sorted on the time key.
train_ids = train_data['id'].unique()
used_mask = features['id'].isin(train_ids)
features = features.loc[used_mask].sort_values(by='buy_time')

features.head()

Unnamed: 0,id,buy_time,0,1,2
4491432,382085,1531083600,-246214700.0,1198001000.0,-153166800.0
1161905,3968934,1531083600,789889900.0,-698845400.0,-125198100.0
2176793,165505,1531083600,-677941300.0,-263496300.0,-86690990.0
3154089,2038465,1531083600,-677941300.0,-263496300.0,-86690990.0
2659959,2810048,1531083600,-677941300.0,-263496300.0,-86690990.0


## merging

In [5]:
# # nearest merge
# merged = pd.merge_asof(train_data.rename(columns={'buy_time': 'train_time'}), 
#                        features.rename(columns={'buy_time': 'feats_time'}),
#                        by='id', left_on='train_time', right_on='feats_time', direction='nearest')
# logging.info(f'Merging method: NEAREST')

# merged.head()

In [6]:
# Backward as-of merge: every offer gets the most recent feature row of the
# same `id` whose `feats_time` is not later than the offer's `train_time`.
offers = train_data.rename(columns={'buy_time': 'train_time'})
profiles = features.rename(columns={'buy_time': 'feats_time'})
merged = pd.merge_asof(
    offers,
    profiles,
    left_on='train_time',
    right_on='feats_time',
    by='id',
    direction='backward',
)

# Offers without any earlier feature row come back as NaN; fill every such gap
# with the mean of its column (this also applies to `feats_time`).
merged = merged.fillna(merged.mean())
logging.info(f'Merging method: BACKWARD. NaNs are filled with mean')

merged.head()

Unnamed: 0,id,vas_id,train_time,target,feats_time,0,1,2
0,2442789,4.0,1542574800,1.0,1537917000.0,-252063.6,24265080.0,3804274.0
1,2461729,1.0,1542574800,0.0,1532293000.0,-677941300.0,-263496300.0,-86690990.0
2,1954991,2.0,1542574800,1.0,1532293000.0,-622639700.0,-210964100.0,1427409000.0
3,2189588,2.0,1542574800,1.0,1537917000.0,-252063.6,24265080.0,3804274.0
4,200964,1.0,1542574800,0.0,1532898000.0,-248129600.0,1191519000.0,-152871900.0


In [7]:
# # backward merge + fillna with nearest merge
# merged = pd.merge_asof(train_data.rename(columns={'buy_time': 'train_time'}), 
#                        features.rename(columns={'buy_time': 'feats_time'}),
#                        by='id', left_on='train_time', right_on='feats_time', direction='backward')

# nan_rows = merged.isna().any(axis=1)
# nan_columns = merged.columns[merged.isna().any()].to_list()

# merged[nan_rows] = pd.merge_asof(merged[nan_rows].drop(nan_columns, axis=1), 
#                                  features.rename(columns={'buy_time': 'feats_time'}), 
#                                  by='id', left_on='train_time', right_on='feats_time', direction='nearest').values
# logging.info(f'Merging method: BACKWARD. NaNs are filled with NEAREST merging')

# merged.head()

In [8]:
# DRAFT
# # backward merge + fillna features with mean, date with ..... ?
# merged = pd.merge_asof(train_data.rename(columns={'buy_time': 'train_time'}), 
#                        features.rename(columns={'buy_time': 'feats_time'}),
#                        by='id', left_on='train_time', right_on='feats_time', direction='backward')
# merged['feats_time'].fillna( ... )
# merged.fillna(merged.mean(), inplace=True)
# logging.info(f'Merging method: BACKWARD. NaNs are filled with mean')

# merged.head()

## featuring

In [9]:
# Feature-engineering steps wrapped into an sklearn pipeline, so the same
# transformation can be fitted and applied separately to train and validation data.
time_diff_step = TimeDifference('feats_time', 'train_time')
# Disabled experiment:
# cluster_step = Clusterer(['0', '1', '2'], n_clusters=8, random_state=13)
drop_step = ColumnsCorrector('drop', ['id', 'train_time', 'feats_time'])
pipeline = make_pipeline(time_diff_step, drop_step)

# Split the label off and run the remaining columns through the pipeline.
target = merged['target']
raw_features = merged.drop(columns='target')
data = pipeline.fit_transform(raw_features, target)
data.head()

Unnamed: 0,vas_id,0,1,2,time_diff
0,4.0,-252063.6,24265080.0,3804274.0,54
1,1.0,-677941300.0,-263496300.0,-86690990.0,119
2,2.0,-622639700.0,-210964100.0,1427409000.0,119
3,2.0,-252063.6,24265080.0,3804274.0,54
4,1.0,-248129600.0,1191519000.0,-152871900.0,112


In [10]:
# Class weights: 'balanced' weights for labels 0 and 1, stored as {label: weight}.
balanced_weights = compute_class_weight('balanced', classes=[0, 1], y=target)
class_weights = {label: weight for label, weight in enumerate(balanced_weights)}

# Cross-validation splitter: shuffled K-fold with a fixed seed for reproducibility.
folds = KFold(n_splits=n_folds, shuffle=True, random_state=29)

# Scoring function used everywhere below: macro-averaged F1.
f1_macro = partial(f1_score, average='macro')

## manual fit

In [138]:
# Linear baseline trained with SGD; the commented entries are options that were
# tried and are currently left at their library defaults.
sgd_params = {
    'random_state': 17,
    'n_jobs': -1,
    'class_weight': class_weights,
    # 'max_iter': 500,
    # 'loss': 'log',
    # 'learning_rate': 'adaptive',
    # 'eta0': 0.1,
}
estimator = SGDClassifier(**sgd_params)

# Cross-validate, record the run in the log file and show the mean fold score.
# NOTE(review): `cv_fit` comes from `common`; judging by the names it returns the
# average score, the per-fold scores and the fitted models — confirm.
avg, metrics, models = cv_fit(estimator, data, target, cv=folds, scorer=f1_macro)
logging.info(f'Manual: model={estimator.__class__.__name__}, parameters={estimator.get_params()}, score={avg}')
mean_f_score = sum(metrics) / n_folds
print(f'Avg. f-score: {mean_f_score}')

Avg. f-score: 0.43494967128540213


In [106]:
# Gradient boosting baseline.
# `loss` is intentionally not passed: the library default is the binomial
# log-loss, spelled 'deviance' in older scikit-learn releases and 'log_loss'
# in newer ones. Passing loss='deviance' explicitly is deprecated since
# scikit-learn 1.1 and rejected from 1.3 on, while the default works everywhere
# and trains with the same objective.
estimator = GradientBoostingClassifier(random_state=17,
                                    #    n_estimators=100,
                                    #    learning_rate=0.15,
                                    #    max_depth=4,
                                    #    min_samples_leaf=1
                                       )

# Cross-validate, record the run in the log file and show the mean fold score.
# NOTE(review): `cv_fit` comes from `common`; judging by the names it returns the
# average score, the per-fold scores and the fitted models — confirm.
avg, metrics, models = cv_fit(estimator, data, target, cv=folds, scorer=f1_macro)
logging.info(f'Manual: model={estimator.__class__.__name__}, parameters={estimator.get_params()}, score={avg}')
print(f'Avg. f-score: {sum(metrics) / n_folds}')

Avg. f-score: 0.5875697192226178


In [11]:
# Random forest with class weighting; the commented entries are options that
# were tried and are currently left at their library defaults.
forest_params = {
    'random_state': 17,
    'class_weight': class_weights,
    'n_jobs': -1,
    # 'n_estimators': 100,
    # 'max_depth': None,
    # 'min_samples_leaf': 1,
}
estimator = RandomForestClassifier(**forest_params)

# Cross-validate, record the run in the log file and show the mean fold score.
avg, metrics, models = cv_fit(estimator, data, target, cv=folds, scorer=f1_macro)
logging.info(f'Manual: model={estimator.__class__.__name__}, parameters={estimator.get_params()}, score={avg}')
mean_f_score = sum(metrics) / n_folds
print(f'Avg. f-score: {mean_f_score}')

Avg. f-score: 0.7390303944066823


In [12]:
# LightGBM with class weighting; the commented entries are options that were
# tried and are currently left at their library defaults.
lgbm_params = {
    'random_state': 17,
    'class_weight': class_weights,
    'n_jobs': -1,
    # 'n_estimators': 100,
    # 'learning_rate': 0.1,
    # 'max_depth': -1,
    # 'num_leaves': 31,
}
estimator = LGBMClassifier(**lgbm_params)

# Cross-validate, record the run in the log file and show the mean fold score.
avg, metrics, models = cv_fit(estimator, data, target, cv=folds, scorer=f1_macro)
logging.info(f'Manual: model={estimator.__class__.__name__}, parameters={estimator.get_params()}, score={avg}')
mean_f_score = sum(metrics) / n_folds
print(f'Avg. f-score: {mean_f_score}')

Avg. f-score: 0.7535735497832563


In [109]:
# XGBoost baseline; the commented entries are options that were tried and are
# currently left at their library defaults.
# NOTE(review): unlike the SGD / RandomForest / LGBM runs, no class weighting is
# passed to this model — confirm that this is intended.
xgb_params = {
    'random_state': 17,
    # 'max_depth': 6,
    # 'learning_rate': 0.3,
}
estimator = XGBClassifier(**xgb_params)

# Cross-validate, record the run in the log file and show the mean fold score.
avg, metrics, models = cv_fit(estimator, data, target, cv=folds, scorer=f1_macro)
logging.info(f'Manual: model={estimator.__class__.__name__}, parameters={estimator.get_params()}, score={avg}')
mean_f_score = sum(metrics) / n_folds
print(f'Avg. f-score: {mean_f_score}')

Avg. f-score: 0.641247711694636


In [110]:
# CatBoost baseline with console output silenced and on-disk artefacts disabled;
# the commented entries are options that were tried and are currently unset.
# NOTE(review): unlike the SGD / RandomForest / LGBM runs, no class weighting is
# passed to this model — confirm that this is intended.
catboost_params = {
    'random_state': 17,
    'logging_level': 'Silent',
    'allow_writing_files': False,
    # 'n_estimators': 500,
    # 'learning_rate': 0.2,
    # 'max_depth': None,
    # 'min_child_samples': 20,
}
estimator = CatBoostClassifier(**catboost_params)

# Cross-validate, record the run in the log file and show the mean fold score.
avg, metrics, models = cv_fit(estimator, data, target, cv=folds, scorer=f1_macro)
logging.info(f'Manual: model={estimator.__class__.__name__}, parameters={estimator.get_params()}, score={avg}')
mean_f_score = sum(metrics) / n_folds
print(f'Avg. f-score: {mean_f_score}')

Avg. f-score: 0.6444532391062732


На параметрах по умолчанию наилучший результат показывают LGBMClassifier и RandomForestClassifier.

При этом метрика зависит от способа сопоставления фичей: в основном способ `backward` с заполнением пропусков средним значением показывает лучший результат. Предположение о причине остается неизменным: профили пользователей, сформированные после даты коммерческого предложения, вряд ли могли влиять на решение пользователя.
Для обеспечения возможности модели отследить эту зависимость был введен признак `time_diff`.

In [111]:
# Compare the scores obtained with the two merge strategies.
# The file is opened in a `with` block so the handle is closed deterministically
# (the previous `json.load(open(...))` left it to the garbage collector), and the
# encoding is pinned because JSON text is UTF-8 regardless of the OS locale.
# NOTE(review): the JSON is expected to map each model name to a pair of scores
# ordered as [backward, nearest] — confirm against the code that writes it.
with open('merge_method_metrics.json', 'r', encoding='utf-8') as metrics_file:
    merge_metrics = json.load(metrics_file)

# After the transpose: one row per model, one column per merge method.
df = pd.DataFrame(merge_metrics, index=['backward', 'nearest']).T
# Ratio below 1 means the backward merge scored higher for that model.
df['N/B ratio'] = df['nearest'] / df['backward']
df.T

Unnamed: 0,SGDClassifier,GradientBoostingClassifier,RandomForestClassifier,LGBMClassifier,XGBClassifier,CatBoostClassifier
backward,0.43495,0.58757,0.73903,0.753574,0.641248,0.644453
nearest,0.439437,0.552324,0.719862,0.737224,0.618495,0.619937
N/B ratio,1.010316,0.940014,0.974063,0.978304,0.964518,0.961957


## GridSearch

In [22]:
# Constructor arguments shared by both tuned models.
defaults = {'random_state': 17, 'n_jobs': -1, 'class_weight': class_weights}

# The two best models on default parameters, in the same order as `grids` below.
estimators = [RandomForestClassifier(**defaults), LGBMClassifier(**defaults)]

# Earlier search space, kept for reference:
#   RandomForest: n_estimators     [50, 100, 150]
#                 max_depth        [None, 4, 6]
#                 min_samples_leaf [20, 31, 45, 60]
#   LGBM:         n_estimators     [50, 100, 150]
#                 learning_rate    [0.05, 0.1, 0.2, 0.3]
#                 max_depth        [-1, 4, 6]
#                 num_leaves       [20, 31, 45, 60]

# Current search space.
random_forest_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 3, 4, 8],
    'min_samples_leaf': [19, 31, 37],
}
lgbm_grid = {
    'n_estimators': [50, 100, 125],
    'learning_rate': [0.05, 0.15, 0.3],
    'max_depth': [-1, 4, 6],
    'num_leaves': [25, 31, 37],
}
grids = [random_forest_grid, lgbm_grid]

# Run the comparison and show the result as a table; the second returned value
# is not used here.
# NOTE(review): `cv_compare` comes from `common`; its exact behaviour is not
# visible in this notebook.
comp, _ = cv_compare(estimators, data, target, grids=grids, cv=folds, scorer=f1_macro, logger=logging)
pd.DataFrame(comp)

Unnamed: 0,model,GS score,CV score
0,RandomForestClassifier,0.752802,0.73903
1,LGBMClassifier,0.753896,0.753574


In [43]:
#