"классическое" обучение - без распределенных вычислений на сжатых фичах

In [2]:
import datetime as dt
import json
import logging
import pathlib
from functools import partial

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier

from common import cv_fit, cv_compare
from transformers import ColumnsCorrector

# Directory with the input CSV files, given relative to the working directory.
data_folder = pathlib.Path('data')

# Send log records of level INFO and above to the 'fit.logs' file.
logging.basicConfig(level=logging.INFO, filename='fit.logs')

In [3]:
# Experiment-wide settings.
n_folds = 5                  # number of cross-validation splits
bound_date = '2018-11-19'    # ISO date; training rows with an earlier buy_time are filtered out

## load data

In [4]:
# Load the labelled training sample and discard the index column stored in the CSV.
train_csv = data_folder / 'data_train.csv'
train_data = pd.read_csv(train_csv).drop(columns='Unnamed: 0')

# Keep only the rows whose buy_time is on or after bound_date, ordered by buy_time.
# NOTE(review): fromisoformat() yields a naive datetime, and timestamp() on a
# naive datetime is interpreted in the machine's local time zone, so the cutoff
# moves with the environment - confirm the intended zone.
bound_timestamp = dt.datetime.fromisoformat(bound_date).timestamp()
used_mask = train_data['buy_time'] >= bound_timestamp
train_data = train_data.loc[used_mask].sort_values(by='buy_time')

train_data.head()

Unnamed: 0,id,vas_id,buy_time,target
220950,2442789,4.0,1542574800,1.0
112530,2461729,1.0,1542574800,0.0
762642,1954991,2.0,1542574800,1.0
530675,2189588,2.0,1542574800,1.0
762686,200964,1.0,1542574800,0.0


In [5]:
# Load the compressed feature table and discard the index column stored in the CSV.
features_csv = data_folder / 'compressed_features.csv'
features = pd.read_csv(features_csv).drop(columns='Unnamed: 0')

# Keep feature rows only for ids that occur in the filtered training sample,
# ordered by buy_time.
train_ids = train_data['id'].unique()
used_mask = features['id'].isin(train_ids)
features = features.loc[used_mask].sort_values(by='buy_time')

features.head()

Unnamed: 0,id,buy_time,0,1,2
4491432,382085,1531083600,-246214700.0,1198001000.0,-153166800.0
1161905,3968934,1531083600,789889900.0,-698845400.0,-125198100.0
2176793,165505,1531083600,-677941300.0,-263496300.0,-86690990.0
3154089,2038465,1531083600,-677941300.0,-263496300.0,-86690990.0
2659959,2810048,1531083600,-677941300.0,-263496300.0,-86690990.0


In [15]:
# Attach to every training row the feature row of the same id whose buy_time
# is closest (earlier or later). merge_asof needs both frames ordered by the
# `on` key, which is why each frame is sorted by buy_time when it is loaded.
merged = pd.merge_asof(
    left=train_data,
    right=features,
    on='buy_time',
    by='id',
    direction='nearest',
)

# Separate the model inputs from the label; the id column is not used as an input.
data = merged.drop(columns=['id', 'target'])
target = merged['target']
logging.info('Merging method: NEAREST')

merged.head()

Unnamed: 0,id,vas_id,buy_time,target,0,1,2
0,2442789,4.0,1542574800,1.0,-251704500.0,1179417000.0,-152321500.0
1,2461729,1.0,1542574800,0.0,-677941300.0,-263496300.0,-86690990.0
2,1954991,2.0,1542574800,1.0,-622639700.0,-210964100.0,1427409000.0
3,2189588,2.0,1542574800,1.0,802311000.0,-702529400.0,-125524000.0
4,200964,1.0,1542574800,0.0,-248129600.0,1191519000.0,-152871900.0


In [7]:
# # backward merge + fillna(mean)
# merged = pd.merge_asof(train_data, features, by='id', on='buy_time', direction='backward')
# merged.fillna(merged.mean(), inplace=True)

# # split into data/target & send to cluster
# data = merged.drop(['id', 'target'], axis=1)
# target = merged['target']
# logging.info(f'Merging method: BACKWARD. NaNs are filled with mean')

# merged.head()

In [8]:
# build featuring pipeline - if it is required to apply separately to train and valid data
# pipeline = make_pipeline(ColumnsCorrector('drop', ['id', ]), )

In [9]:
# prepare folds, class weights and scoring function

# Balanced weights for the two target classes, keyed 0 and 1.
# `classes` is passed as an ndarray: that is the type scikit-learn documents
# for this argument, and recent releases validate parameter types, so a plain
# list may be rejected.
balanced_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=target)
class_weights = dict(enumerate(balanced_weights))

# Shuffled K-fold splitter; the fixed seed keeps the splits reproducible.
folds = KFold(n_splits=n_folds, shuffle=True, random_state=29)

# Scoring function shared by all experiments: macro-averaged F1.
f1_macro = partial(f1_score, average='macro')

## manual fit

In [21]:
# Linear model trained with SGD, using the class weights prepared earlier.
# The commented-out keyword arguments are alternative settings left disabled.
estimator = SGDClassifier(
    random_state=17,
    n_jobs=-1,
    class_weight=class_weights,
    # max_iter=500,
    # loss='log',
    # learning_rate='adaptive',
    # eta0=0.1,
)

# Cross-validate on the prepared folds, record the run in the log file and
# print the fold scores averaged over n_folds.
avg, metrics, models = cv_fit(estimator, data, target, cv=folds, scorer=f1_macro)
logging.info(
    f'Manual: model={type(estimator).__name__}, '
    f'parameters={estimator.get_params()}, score={avg}'
)
print(f'Avg. f-score: {sum(metrics) / n_folds}')

Avg. f-score: 0.3590468367137458


In [22]:
# Gradient boosting with scikit-learn defaults.
# `loss` is deliberately not passed: the default is the same binomial log-loss
# under either spelling ('deviance' in older scikit-learn releases, 'log_loss'
# in newer ones), so omitting it keeps the model unchanged while avoiding the
# deprecated and later removed 'deviance' value.
# The commented-out keyword arguments are alternative settings left disabled.
estimator = GradientBoostingClassifier(
    random_state=17,
    # n_estimators=100,
    # learning_rate=0.15,
    # max_depth=4,
    # min_samples_leaf=1
)

# Cross-validate on the prepared folds, record the run in the log file and
# print the fold scores averaged over n_folds.
avg, metrics, models = cv_fit(estimator, data, target, cv=folds, scorer=f1_macro)
logging.info(f'Manual: model={estimator.__class__.__name__}, parameters={estimator.get_params()}, score={avg}')
print(f'Avg. f-score: {sum(metrics) / n_folds}')

Avg. f-score: 0.7122954789931877


In [23]:
# Random forest using the class weights prepared earlier and all available cores.
# The commented-out keyword arguments are alternative settings left disabled.
estimator = RandomForestClassifier(
    random_state=17,
    class_weight=class_weights,
    n_jobs=-1,
    # n_estimators=100,
    # max_depth=None,
    # min_samples_leaf=1,
)

# Cross-validate on the prepared folds, record the run in the log file and
# print the fold scores averaged over n_folds.
avg, metrics, models = cv_fit(estimator, data, target, cv=folds, scorer=f1_macro)
logging.info(
    f'Manual: model={type(estimator).__name__}, '
    f'parameters={estimator.get_params()}, score={avg}'
)
print(f'Avg. f-score: {sum(metrics) / n_folds}')

Avg. f-score: 0.7621558069832386


In [24]:
# LightGBM classifier using the class weights prepared earlier.
# The commented-out keyword arguments are alternative settings left disabled.
estimator = LGBMClassifier(
    random_state=17,
    class_weight=class_weights,
    n_jobs=-1,
    # n_estimators=100,
    # learning_rate=0.1,
    # max_depth=-1,
    # num_leaves=31
)

# Cross-validate on the prepared folds, record the run in the log file and
# print the fold scores averaged over n_folds.
avg, metrics, models = cv_fit(estimator, data, target, cv=folds, scorer=f1_macro)
logging.info(
    f'Manual: model={type(estimator).__name__}, '
    f'parameters={estimator.get_params()}, score={avg}'
)
print(f'Avg. f-score: {sum(metrics) / n_folds}')

Avg. f-score: 0.7777611819511739


In [25]:
# XGBoost classifier; only the seed is set, no class weights are passed here.
# The commented-out keyword arguments are alternative settings left disabled.
estimator = XGBClassifier(
    random_state=17,
    # max_depth=6,
    # learning_rate=0.3,
)

# Cross-validate on the prepared folds, record the run in the log file and
# print the fold scores averaged over n_folds.
avg, metrics, models = cv_fit(estimator, data, target, cv=folds, scorer=f1_macro)
logging.info(
    f'Manual: model={type(estimator).__name__}, '
    f'parameters={estimator.get_params()}, score={avg}'
)
print(f'Avg. f-score: {sum(metrics) / n_folds}')

Avg. f-score: 0.7175973181182661


In [26]:
# CatBoost classifier with its own console output silenced and file writing
# turned off; no class weights are passed here.
# The commented-out keyword arguments are alternative settings left disabled.
estimator = CatBoostClassifier(
    random_state=17,
    logging_level='Silent',
    allow_writing_files=False,
    # n_estimators=500,
    # learning_rate=0.2,
    # max_depth=None,
    # min_child_samples=20
)

# Cross-validate on the prepared folds, record the run in the log file and
# print the fold scores averaged over n_folds.
avg, metrics, models = cv_fit(estimator, data, target, cv=folds, scorer=f1_macro)
logging.info(
    f'Manual: model={type(estimator).__name__}, '
    f'parameters={estimator.get_params()}, score={avg}'
)
print(f'Avg. f-score: {sum(metrics) / n_folds}')

Avg. f-score: 0.7213276301212798


На параметрах по умолчанию наилучший результат показывают LGBMClassifier и RandomForestClassifier.

При этом метрика слабо зависит от способа сопоставления фич; в большинстве случаев способ `nearest` показывает лучший результат. Вероятно, это связано с тем, что при использовании этого метода в данных не остаётся пропусков.

In [11]:
# Load the scores saved for the two merge strategies. The file is opened in a
# `with` block so the handle is closed as soon as the JSON has been parsed.
# NOTE(review): presumably the JSON maps each model name to a pair of scores
# in the order (backward, nearest) - confirm against the code that writes it.
with open('merge_method_metrics.json', 'r', encoding='utf-8') as metrics_file:
    merge_metrics = json.load(metrics_file)

# One row per model, with the nearest/backward score ratio added as a column;
# the final transpose displays models as columns.
df = pd.DataFrame(merge_metrics, index=['backward', 'nearest']).T
df['N/B ratio'] = df['nearest'] / df['backward']
df.T

Unnamed: 0,SGDClassifier,GradientBoostingClassifier,RandomForestClassifier,LGBMClassifier,XGBClassifier,CatBoostClassifier
backward,0.332481,0.710183,0.766309,0.777602,0.714569,0.716206
nearest,0.359047,0.712295,0.762156,0.777761,0.717597,0.721328
N/B ratio,1.079902,1.002974,0.994581,1.000205,1.004238,1.00715


## GridSearch

In [13]:
# Constructor arguments shared by both estimators being tuned.
defaults = dict(random_state=17, n_jobs=-1, class_weight=class_weights)

# The two models that scored best with default parameters.
estimators = [RandomForestClassifier(**defaults), LGBMClassifier(**defaults)]

# =============================================================
# Disabled grid, kept for reference (presumably an earlier search round):
# grids = [
#     {       # RandomForest grid
#         'n_estimators': [50, 100, 150],
#         'max_depth': [None, 4, 6],
#         'min_samples_leaf': [20, 31, 45, 60],

#     }, {    # LGBM grid
#         'n_estimators': [50, 100, 150],
#         'learning_rate': [0.05, 0.1, 0.2, 0.3],
#         'max_depth': [-1, 4, 6],
#         'num_leaves': [20, 31, 45, 60],
#     }
# ]

# =============================================================
# Active grids; their order must match the order of `estimators`.
random_forest_grid = {
    'n_estimators': [175, 200],
    'max_depth': [None, 3, 4, 8],
    'min_samples_leaf': [19, 31, 37],
}
lgbm_grid = {
    'n_estimators': [50, 100, 125],
    'learning_rate': [0.05, 0.15, 0.3],
    'max_depth': [-1, 4, 6],
    'num_leaves': [25, 31, 37],
}
grids = [random_forest_grid, lgbm_grid]

# =============================================================

# cv_compare is a project helper; judging by the recorded output it reports a
# grid-search score and a cross-validation score for each model.
comp, _ = cv_compare(
    estimators,
    data,
    target,
    grids=grids,
    cv=folds,
    scorer=f1_macro,
    logger=logging,
)
pd.DataFrame(comp)

Unnamed: 0,model,GS score,CV score
0,RandomForestClassifier,0.784422,0.762156
1,LGBMClassifier,0.777837,0.777761


In [43]:
#