Обучение на части данных после 19 ноября (включительно)

In [2]:
import pathlib
import datetime as dt
import numpy as np
import pandas as pd
import dask.dataframe as dd

from dask.distributed import Client, wait
from common import trim_memory, select_and_sort

from transformers import ColumnsCorrector
from sklearn.pipeline import make_pipeline
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

# directory with the input CSV files; the path is relative to the notebook's working directory
data_folder = pathlib.Path('data')

In [3]:
# dask.config.set({'MALLOC_TRIM_THRESHOLD_': '65536'})    # (disabled) allocator trim-threshold setting
# start a local dask cluster: a single worker running eight threads
client = Client(
    n_workers=1,
    threads_per_worker=8,
)
client      # trailing expression: shows the client summary as the cell output

2022-05-23 00:36:35,615 - distributed.diskutils - INFO - Found stale lock file and directory '/home/avagadro/projects/gb_megafon/dask-worker-space/worker-rs9u602k', purging


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 1
Total threads: 8,Total memory: 15.59 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:34257,Workers: 1
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 15.59 GiB

0,1
Comm: tcp://127.0.0.1:39975,Total threads: 8
Dashboard: http://127.0.0.1:37751/status,Memory: 15.59 GiB
Nanny: tcp://127.0.0.1:42761,
Local directory: /home/avagadro/projects/gb_megafon/dask-worker-space/worker-6ywnr9yv,Local directory: /home/avagadro/projects/gb_megafon/dask-worker-space/worker-6ywnr9yv


## load & prepare

In [4]:
# --- parameters ---
blocksize = '200MB'                                 # chunk size dask uses when reading the CSV files
drop_feats = ['75', '81', '85', '139', '203']       # feature columns discarded right after loading
bound_date = '2018-11-19'                           # train rows with buy_time earlier than this date are discarded


def _buy_time_not_before(df, bound):
    """Return a boolean mask marking rows whose `buy_time` is >= `bound`."""
    return df['buy_time'] >= bound


def _distinct_ids(df):
    """Return the materialised distinct values of the `id` column."""
    return df['id'].unique().compute()


# --- lazy loading (dask dataframes; the unnamed index column is dropped) ---
train_data = dd.read_csv(
    data_folder / 'data_train.csv',
    blocksize=blocksize,
).drop('Unnamed: 0', axis=1)
user_feats = dd.read_csv(
    data_folder / 'features.csv',
    blocksize=blocksize,
    sep='\t',
).drop(['Unnamed: 0', *drop_feats], axis=1)

# --- keep only the train rows on/after the bound date ---
# NOTE(review): fromisoformat() returns a naive datetime, so .timestamp() interprets it in the
# machine's local timezone; the numeric bound therefore depends on where the notebook runs.
bound_timestamp = dt.datetime.fromisoformat(bound_date).timestamp()
used_train_mask = client.submit(_buy_time_not_before, train_data, bound_timestamp, key='get_train_data_mask')
# `train_data` is rebound from the dask dataframe to a future of the selected rows, sorted by buy_time
train_data = client.submit(select_and_sort, train_data, mask=used_train_mask, by='buy_time', key='train_data_sort')

# --- keep only the feature rows of users that occur in the selected train part ---
unique_ids = client.submit(_distinct_ids, train_data, key='unique_ids_compute')
used_feats_mask = client.submit(user_feats['id'].isin, unique_ids, key='get_used_feats_mask')
# `user_feats` is likewise rebound to a future of the selected rows, sorted by buy_time
user_feats = client.submit(select_and_sort, user_feats, mask=used_feats_mask, by='buy_time', key='user_feats_sort')

# --- materialise both parts as pandas DataFrames held on the cluster ---
train_data_df = client.submit(train_data.result().compute, key='recast_train_data')
user_feats_df = client.submit(user_feats.result().compute, key='recast_user_feats')
wait([train_data_df, user_feats_df])

# --- release the intermediate futures; only the two recast frames are needed from here on ---
client.cancel([used_train_mask, unique_ids, used_feats_mask, train_data, user_feats])

In [5]:
# merge: for every train row attach the feature row of the same `id` whose `buy_time` is nearest
merged = client.submit(pd.merge_asof, train_data_df, user_feats_df, by='id', on='buy_time', direction='nearest', key='data_merge')

# pull the merged frame to the client once: every .result() call transfers the whole object again
merged_df = merged.result()

# split into data/target & send to cluster
data = client.scatter(merged_df.drop('target', axis=1))
target = client.scatter(merged_df['target'])
wait([data, target])

# remove no longer needed tasks from cluster
client.cancel([merged, train_data_df, user_feats_df])
# drop the client-side copy as well; the split parts now live on the cluster
del merged_df

In [6]:
# trim cluster memory: run the `trim_memory` helper (imported from `common`) on every worker
# after the futures above were cancelled — presumably it hands freed memory back to the OS; TODO confirm
client.run(trim_memory)

{'tcp://127.0.0.1:39975': 1}

## featuring

In [7]:
# featuring pipeline with a single step: a ColumnsCorrector configured with 'drop' and the 'id' column
# (presumably it removes that column from the frame — confirm against the local `transformers` module)
pipeline = make_pipeline(
    ColumnsCorrector('drop', ['id']),
)

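В принципе, поскольку имеющиеся преобразования признаков никак не зависят от распределения данных, их можно применить ко всему набору ещё до разделения на train/valid. Но в любой момент могут добавиться признаки, зависящие от распределения, поэтому сразу реализуем и второй подход: пайплайн обучается на train-части внутри каждого фолда и затем применяется к train и valid.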

In [8]:
# DISABLED alternative: fit/apply the featuring pipeline once on the whole dataset
# (before the train/valid split) instead of inside every CV fold.
# NOTE(review): if this is re-enabled, the last line binds `data` to `transformed`, which is
# cancelled on the line just before it — presumably `prepared` was intended; confirm.
# # apply pre-featuring pipeline
# featuring = client.submit(pipeline.fit, data, target, key='pre_featuring_fit')
# transformed = client.submit(featuring.result().transform, data, key='pre_featuring_transform')

# # push transformed data to the cluster
# prepared = client.scatter(transformed.result())
# wait(prepared)

# # remove no longer needed tasks from cluster
# client.cancel([data, featuring, transformed])
# data = transformed        # this for compatibility

## CV fit

In [9]:
n_folds = 5

metrics = []    # macro F1 score of every fold
models = []     # fitted LGBM model of every fold

# Bring the full feature frame and target back from the cluster once: every .result() call
# on a scattered future transfers the whole object to the client again.
full_data = data.result()
full_target = target.result()

# Balanced class weights computed over the whole target, mapped as {class label: weight}.
# `classes` is passed as an ndarray because recent scikit-learn releases validate the
# parameter type and reject a plain list.
class_weights = dict(enumerate(compute_class_weight('balanced', classes=np.array([0, 1]), y=full_target)))
folds = KFold(n_splits=n_folds, shuffle=True, random_state=29)

for fold, (train_index, valid_index) in enumerate(folds.split(full_target)):
    # push train/valid dataframes to the cluster
    train_df = client.scatter(full_data.iloc[train_index])
    valid_df = client.scatter(full_data.iloc[valid_index])
    # extract targets and push them to the cluster
    y_train = client.scatter(full_target.iloc[train_index])
    y_valid = client.scatter(full_target.iloc[valid_index])

    # Fit the featuring pipeline on the train part only, with the matching train labels
    # (the full target has a different row count and would leak validation labels into any
    # y-aware step). Task keys carry the fold number so the scheduler can never hand back
    # a result computed for a previous fold under the same key.
    featuring = client.submit(pipeline.fit, train_df, y_train, key=f'featuring_fit_{fold}')
    fitted_pipeline = featuring.result()
    X_train = client.submit(fitted_pipeline.transform, train_df, key=f'train_featuring_transform_{fold}')
    X_valid = client.submit(fitted_pipeline.transform, valid_df, key=f'valid_featuring_transform_{fold}')

    # LGBM
    estimator = LGBMClassifier(random_state=17,
                               class_weight=class_weights,
                               n_estimators=100,
                               learning_rate=0.15,
                               max_depth=-1,
                               num_leaves=31,
                               )
    model = client.submit(estimator.fit, X_train, y_train)     # TODO parallel_backend?

    # predicts & metrics
    prediction = client.submit(lambda mdl, df: mdl.predict(df), model, X_valid, key=f'compute_predictions_{fold}')
    score = client.submit(f1_score, y_valid, prediction, average='macro', key=f'scoring_{fold}')
    wait(score)
    # append step result
    models.append(model.result())
    metrics.append(score.result())
    # remove no longer needed tasks from cluster
    client.cancel([model, featuring, train_df, valid_df, X_train, y_train, X_valid, y_valid, prediction, score])
    # trim cluster memory
    client.run(trim_memory)

# release the client-side copies of the dataset
del full_data, full_target

print(f'Avg. f-score: {sum(metrics) / n_folds}')

Avg. f-score: 0.7776767294528835


In [10]:
# 0.7776767294528835