In [1]:
import sys
import pathlib
# Directory the notebook kernel is running from.
cwd = pathlib.Path.cwd()
# CSV inputs are read from the `data` directory next to the notebook's folder.
data_folder = cwd.parent / 'data'
# Make modules located in the parent directory importable (appended, so existing
# sys.path entries keep priority).
sys.path.append(cwd.parent.as_posix())
import datetime as dt
import pickle

import cloudpickle
import dill
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.utils.class_weight import compute_class_weight

# Keep this import last: presumably the project-local `transformers` module that is
# only importable through the sys.path entry appended above — confirm.
from transformers import Merger, TimeDifference, ColumnsCorrector

## airflow check

In [3]:
# check metric: score the model stored in model.pkl on the selected part of train
# NOTE: unpickling executes code embedded in the file — only load pickles from a trusted source.
# The file is opened in a `with` block so the handle is closed once the model is loaded.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
# Alternative loaders (disabled):
# model = cloudpickle.load(open('model.pkl', 'rb'))
# model = dill.load(open('model.pkl.dill', 'rb'))

# select part of train: rows whose buy_time is at or after 2018-11-19
# NOTE(review): .timestamp() on a naive datetime is interpreted in the machine's local
# timezone, so the cutoff shifts with the kernel's timezone — confirm this matches how
# buy_time is encoded.
train = pd.read_csv(data_folder.joinpath('data_train.csv')).drop('Unnamed: 0', axis=1)
train = train[train['buy_time'] >= dt.datetime.fromisoformat('2018-11-19').timestamp()]

# split into model inputs and labels
data = train.drop('target', axis=1)
target = train['target']

# macro-averaged F1 of the loaded model on these rows (displayed as the cell output)
predicts = model.predict(data)
f1_score(target, predicts, average='macro')

0.48433428366337

## manual fit

In [11]:
# read train data
train_data = pd.read_csv(data_folder.joinpath('data_train.csv')).drop('Unnamed: 0', axis=1)
# extract required train data: rows whose buy_time is at or after 2018-11-19
# NOTE(review): .timestamp() on a naive datetime is interpreted in the machine's local
# timezone, so the cutoff shifts with the kernel's timezone — confirm this matches how
# buy_time is encoded.
used_mask = train_data['buy_time'] >= dt.datetime.fromisoformat('2018-11-19').timestamp()
train_data = train_data[used_mask]
target = train_data['target']

# read compressed features
features = pd.read_csv(data_folder.joinpath('compressed_features.csv'))

# calc class weights
# `classes` is passed as an ndarray: recent scikit-learn releases validate this
# parameter's type and reject a plain Python list.
class_labels = np.array([0, 1])
balanced_weights = compute_class_weight('balanced', classes=class_labels, y=target)
# map each label to its weight explicitly instead of relying on positional order
class_weights = dict(zip(class_labels.tolist(), balanced_weights))

# build featuring pipeline: feature steps followed by the classifier
pipeline = make_pipeline(
    Merger(features, method='backward', fillna='nearest'),
    TimeDifference('feats_time', 'train_time'),
    # Clusterer(['0', '1', '2'], n_clusters=8, random_state=13),
    ColumnsCorrector('drop', ['id', 'train_time', 'feats_time']),
    # LGBMClassifier(random_state=17, class_weight='balanced', n_jobs=-1, **fit_params)
    RandomForestClassifier(random_state=17, class_weight=class_weights, n_jobs=-1)
)
# fit model
data = train_data.drop('target', axis=1)
pipeline.fit(data, target)
# Uncomment to persist the fitted pipeline as manual.pkl, the file that the
# "check manual model" cell loads:
# with open('manual.pkl', 'wb') as model_file:
#     pickle.dump(pipeline, model_file)

# macro-averaged F1 on the same rows the pipeline was fitted on (in-sample score)
f1_score(target, pipeline.predict(data), average='macro')

0.6369033424389914

In [6]:
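# Disabled experiment: refit the pipeline several times to compare scores across
# different estimator random states.
# NOTE(review): the loop itself never changes random_state, and the pipeline defined
# above fixes random_state=17, so the differing scores recorded below were presumably
# produced with a different estimator configuration — confirm before relying on them.
# for n in range(5):
#     pipeline.fit(data, target)
#     score = f1_score(target, pipeline.predict(data), average='macro')
#     print(score)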

0.6394154839821656
0.6348730250602365
0.6372832291689188
0.6349499672909246
0.6320313884530818


## check manual model

In [7]:
# Load the pipeline stored in manual.pkl and score it on the selected part of train.
# NOTE: unpickling executes code embedded in the file — only load pickles from a trusted source.
# The file is opened in a `with` block so the handle is closed once the model is loaded.
with open('manual.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
# select part of train: rows whose buy_time is at or after 2018-11-19
# NOTE(review): .timestamp() on a naive datetime is interpreted in the machine's local
# timezone, so the cutoff shifts with the kernel's timezone — confirm this matches how
# buy_time is encoded.
train = pd.read_csv(data_folder.joinpath('data_train.csv')).drop('Unnamed: 0', axis=1)
train = train[train['buy_time'] >= dt.datetime.fromisoformat('2018-11-19').timestamp()]

# split into model inputs and labels
data = train.drop('target', axis=1)
target = train['target']

# macro-averaged F1 of the loaded model on these rows (displayed as the cell output)
predicts = model.predict(data)
f1_score(target, predicts, average='macro')

0.6369033424389914

In [None]:
#