In [1]:
import sys
import pathlib
# Resolve the notebook's working directory, make its parent (the project root)
# importable, and point `data_folder` at the sibling `data` directory.
cwd = pathlib.Path.cwd()
project_root = cwd.parent
sys.path.append(project_root.as_posix())
data_folder = project_root / 'data'

import pandas as pd
import dask.dataframe as dd

from dask.distributed import Client, wait
from auxiliary import trim_memory

from telecom.transformers import ColumnsCorrector
from sklearn.pipeline import make_pipeline
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

In [2]:
# Optional allocator tuning, currently disabled. NOTE(review): the bare name
# `dask` is not imported in this notebook (only `dask.dataframe as dd`), so this
# line would need `import dask` before it can be re-enabled.
# dask.config.set({'MALLOC_TRIM_THRESHOLD_': '65536'})
# Connect to a dask.distributed cluster with a single worker; every
# `client.submit` / `client.scatter` call further down goes through this client.
client = Client(n_workers=1)
# client  # uncomment to display the client/cluster summary in the cell output

2022-06-08 20:04:18,649 - distributed.diskutils - INFO - Found stale lock file and directory '/home/avagadro/projects/mega_telecom/research/dask-worker-space/worker-cggprzyb', purging


## load & prepare

In [3]:
# parameters
blocksize = '200MB'         # size of each CSV chunk handed to dask
drop_feats = ['75', '81', '85', '139', '203']   # feature columns excluded up front

# lazy CSV readers; the unnamed index column is dropped from both files
train_data = (
    dd.read_csv(data_folder.joinpath('data_train.csv'), blocksize=blocksize)
    .drop('Unnamed: 0', axis=1)
)
user_feats = (
    dd.read_csv(data_folder.joinpath('features.csv'), blocksize=blocksize, sep='\t')
    .drop(['Unnamed: 0', *drop_feats], axis=1)
)

# keep only the feature rows whose id occurs in the training data
train_ids = client.submit(train_data['id'].unique().compute, key='unique_ids_compute')
feats_mask = client.submit(user_feats['id'].isin, train_ids, key='get_used_feats_mask')

# order both frames by buy_time (the later pd.merge_asof joins on that column)
train_data = client.submit(train_data.sort_values, by='buy_time', key='train_data_sort')
user_feats = client.submit(
    user_feats[feats_mask.result()].sort_values,
    by='buy_time',
    key='user_feats_sort',
)
wait([train_data, user_feats])

# drop the helper futures so the cluster can release their results
del train_ids, feats_mask

In [4]:
# Materialise the sorted collections: `.result()` fetches what each sort task
# returned (presumably a lazy dask DataFrame - TODO confirm), and its `.compute`
# is then submitted as a task so the resulting frame stays on the cluster.
sorted_train = train_data.result()
train_data_df = client.submit(sorted_train.compute, key='recast_train_data')
sorted_feats = user_feats.result()
user_feats_df = client.submit(sorted_feats.compute, key='recast_user_feats')
wait([train_data_df, user_feats_df])

# the sort futures and the local handles are no longer needed
del train_data, user_feats, sorted_train, sorted_feats

In [5]:
# merge: for every training row attach the user-feature row with the same `id`
# whose `buy_time` is nearest (both inputs were sorted by `buy_time` beforehand)
merged = client.submit(pd.merge_asof, train_data_df, user_feats_df, by='id', on='buy_time', direction='nearest', key='data_merge')

# Gather the merged frame from the cluster exactly once; every `.result()` call
# transfers the whole frame again, so both pieces are sliced from one local copy.
merged_df = merged.result()

# split into data/target & send to cluster
data = client.scatter(merged_df.drop('target', axis=1))
target = client.scatter(merged_df['target'])
wait([data, target])

# remove no longer needed tasks from cluster (and the local copy of the merge)
del merged, merged_df, train_data_df, user_feats_df

In [6]:
# Run the project helper `trim_memory` (imported from `auxiliary`) on every
# worker of the cluster; `client.run` returns a dict mapping each worker address
# to the helper's return value.
# NOTE(review): presumably the helper asks the allocator to hand freed memory
# back to the OS - confirm against `auxiliary.trim_memory`.
client.run(trim_memory)

{'tcp://127.0.0.1:32883': 1}

## featuring

In [7]:
# Pre-featuring pipeline meant to be applicable to the entire dataset. It has a
# single step for now: ColumnsCorrector configured with 'drop' and ['id']
# (presumably removes the `id` column - see telecom.transformers).
pipeline = make_pipeline(ColumnsCorrector('drop', ['id']))

В принципе, поскольку имеющиеся фичи никак не зависят от распределения данных, то можно применить их до train/valid разделения. Но в любой момент могут добавиться фичи, зависящие от распределения, поэтому сразу реализуем и другой подход.

In [8]:
# # apply pre-featuring pipeline
# featuring = client.submit(pipeline.fit, data, target, key='pre_featuring_fit')
# transformed = client.submit(featuring.result().transform, data, key='pre_featuring_transform')

# # push transformed data to the cluster
# prepared = client.scatter(transformed.result())
# wait(prepared)

# # remove no longer needed tasks from cluster
# del data, featuring, transformed
# data = prepared           # rebind for compatibility with the CV cell (was `transformed`, which is deleted on the line above)

## CV fit

In [9]:
n_folds = 5

metrics = []
models = []

# Gather the full feature frame and the target from the cluster once. The fold
# loop only slices these local copies instead of re-transferring the whole
# dataset several times per fold.
data_df = data.result()
target_sr = target.result()

# class weights are derived from the complete target
class_weights = dict(enumerate(compute_class_weight('balanced', classes=[0, 1], y=target_sr)))
folds = KFold(n_splits=n_folds, shuffle=True, random_state=29)

for fold, (train_index, valid_index) in enumerate(folds.split(target_sr)):
    # push train/valid dataframes to the cluster
    train_df = client.scatter(data_df.iloc[train_index])
    valid_df = client.scatter(data_df.iloc[valid_index])
    # extract targets and push them to the cluster
    y_train = client.scatter(target_sr.iloc[train_index])
    y_valid = client.scatter(target_sr.iloc[valid_index])

    # Fit the featuring pipeline on the training fold only: X and y must have
    # the same rows, so the fold target is passed rather than the full target.
    # Task keys carry the fold number because dask identifies a task result by
    # its key; reusing one key across folds relies on the previous fold's
    # futures having been released before the next submit.
    featuring = client.submit(pipeline.fit, train_df, y_train, key=f'featuring_fit-{fold}')
    fitted_pipeline = featuring.result()    # fetch the fitted pipeline once per fold
    X_train = client.submit(fitted_pipeline.transform, train_df, key=f'train_featuring_transform-{fold}')
    X_valid = client.submit(fitted_pipeline.transform, valid_df, key=f'valid_featuring_transform-{fold}')

    # LGBM
    estimator = LGBMClassifier(random_state=17,
                               class_weight=class_weights,
                               n_estimators=100,
                               learning_rate=0.15,
                               max_depth=-1,
                               num_leaves=31,
                               )
    model = client.submit(estimator.fit, X_train, y_train)

    # predicts & metrics
    prediction = client.submit(lambda mdl, df: mdl.predict(df), model, X_valid, key=f'compute_predictions-{fold}')
    score = client.submit(f1_score, y_valid, prediction, average='macro', key=f'scoring-{fold}')
    # append step result
    models.append(model.result())
    metrics.append(score.result())
    # remove no longer needed tasks from cluster
    del model, featuring, fitted_pipeline, train_df, valid_df, X_train, y_train, X_valid, y_valid, prediction, score
    # trim cluster memory
    client.run(trim_memory)

# the local copies were only needed for slicing the folds
del data_df, target_sr

print(f'Avg. f-score: {sum(metrics) / n_folds}')

Avg. f-score: 0.7136670695029068


In [10]:
# 0.7136670695029068