In [1]:
import pathlib
import pandas as pd
import dask.dataframe as dd

from dask.distributed import Client, wait
from sklearn.utils.class_weight import compute_class_weight

from dask_ml.wrappers import Incremental        # applicable only on estimators with partial_fit() support
from sklearn.linear_model import SGDClassifier
from lightgbm import LGBMClassifier

# from dask_ml.model_selection import KFold
from sklearn.model_selection import KFold

from sklearn.metrics import f1_score, classification_report, ConfusionMatrixDisplay

data_folder = pathlib.Path('data')

In [2]:
# dask.config.set({'MALLOC_TRIM_THRESHOLD_': '65536'})
client = Client(n_workers=1, threads_per_worker=8)
client

2022-05-21 20:10:55,261 - distributed.diskutils - INFO - Found stale lock file and directory '/home/avagadro/projects/gb_megafon/dask-worker-space/worker-mvu3_4kx', purging


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 1
Total threads: 8,Total memory: 15.59 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:35643,Workers: 1
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 15.59 GiB

0,1
Comm: tcp://127.0.0.1:40253,Total threads: 8
Dashboard: http://127.0.0.1:46639/status,Memory: 15.59 GiB
Nanny: tcp://127.0.0.1:39369,
Local directory: /home/avagadro/projects/gb_megafon/dask-worker-space/worker-5jig2v8k,Local directory: /home/avagadro/projects/gb_megafon/dask-worker-space/worker-5jig2v8k


In [3]:
import ctypes
def trim_memory() -> int:
     libc = ctypes.CDLL("libc.so.6")
     return libc.malloc_trim(0)
# client.run(trim_memory)

## load & prepare

In [4]:
# load data
drop_feats = ['75', '81', '85', '139', '203']
data = pd.read_csv(data_folder.joinpath('data_train.csv'), index_col='Unnamed: 0')
feats = dd.read_csv(data_folder.joinpath('features.csv'), blocksize='200MB', sep='\t').drop(['Unnamed: 0', *drop_feats], axis=1)     # 200MB per chunk

# select and sort required user features
used_feats_mask = feats['id'].isin(data['id'].unique())

feats = client.submit(feats[used_feats_mask].sort_values, 'buy_time')
data = data.sort_values(by='buy_time')

wait(feats)
del used_feats_mask

In [5]:
# merge
features = client.submit(feats.result().compute)
client.cancel(feats)
merged = client.submit(pd.merge_asof, data, features.result(), by='id', on='buy_time', direction='nearest')     # вот тут из-за features.result() видимо и есть UserWarning
wait(merged)
client.cancel(features)
client.run(trim_memory)

  [             id  vas_id    buy_time  target
57881 ... x 250 columns]]
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good


{'tcp://127.0.0.1:40253': 1}

In [11]:
# extract target & send to cluster
train = client.scatter(merged.result().drop(['target', 'id'], axis=1))
# train = client.scatter(merged.result().drop('target', axis=1))
target = client.scatter(merged.result()['target'])
# client.cancel(merged)
client.run(trim_memory)

{'tcp://127.0.0.1:40253': 1}

In [10]:
# client.cancel(train)
# client.cancel(target)

## fit

In [12]:
n_folds = 3

metrics = []
models = []
class_weights = dict(enumerate(compute_class_weight('balanced', classes=[0, 1], y=target.result())))
folds = KFold(n_splits=n_folds, shuffle=True, random_state=29)

for train_index, valid_index in folds.split(merged.result()):
    # X_train = client.scatter(train.result().iloc[train_index])        # this doesn't work
    # y_train = client.scatter(target.result().iloc[train_index])
    # X_valid = client.scatter(train.result().iloc[valid_index])
    # y_valid = client.scatter(target.result().iloc[valid_index])

    X_train = train.result().iloc[train_index]
    y_train = target.result().iloc[train_index]
    X_valid = train.result().iloc[valid_index]
    y_valid = target.result().iloc[valid_index]

    # LGBM
    estimator = LGBMClassifier(random_state=17,
                            class_weight=class_weights,
                            n_estimators=100,
                            learning_rate=0.15,
                            max_depth=-1,
                            num_leaves=31,
                            )
    model = client.submit(estimator.fit, X_train, y_train)     # TODO parallel_backend?
    wait(model)
    models.append(model.result())

    # predicts & metrics
    predict = model.result().predict(X_valid)
    metrics.append(f1_score(y_valid, predict, average='macro'))

print(f'Avg. f-score: {sum(metrics) / n_folds}')

Avg. f-score: 0.7137957827510393


In [17]:
# client.run(trim_memory)

{'tcp://127.0.0.1:42689': 1}

In [None]:
# 0.7137957827510393