In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to all subsequent matplotlib plots.
sns.set()

# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Reload imported modules automatically before each cell executes, so edits
# to local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from pipeline import merge_data

# Training targets and the submission template.
target, sample = (
    pd.read_csv("../data/target_train.csv"),
    pd.read_csv("../data/sample_submission.csv"),
)

# Every raw source comes as a (train, test) pair of CSV files.
# NOTE(review): source names look like transliterated Russian (plavki, chugun,
# lom, produv, chronom) — confirm their meaning against the data description.
plavki_train, plavki_test = (
    pd.read_csv("../data/plavki_train.csv"),
    pd.read_csv("../data/plavki_test.csv"),
)
gas_train, gas_test = (
    pd.read_csv("../data/gas_train.csv"),
    pd.read_csv("../data/gas_test.csv"),
)
chugun_train, chugun_test = (
    pd.read_csv("../data/chugun_train.csv"),
    pd.read_csv("../data/chugun_test.csv"),
)
lom_train, lom_test = (
    pd.read_csv("../data/lom_train.csv"),
    pd.read_csv("../data/lom_test.csv"),
)
produv_train, produv_test = (
    pd.read_csv("../data/produv_train.csv"),
    pd.read_csv("../data/produv_test.csv"),
)
chronom_train, chronom_test = (
    pd.read_csv("../data/chronom_train.csv"),
    pd.read_csv("../data/chronom_test.csv"),
)

# Per-source options forwarded to merge_data as its last positional argument.
params = {"chugun": {}, "plavki": {"bow_count": 10}}

# merge_data returns the assembled train/test matrices, the targets and the
# lists of numeric / categorical feature names (per the unpacked names).
train, test, y, num_features, cat_features = merge_data(
    sample,
    target,
    plavki_train,
    plavki_test,
    gas_train,
    gas_test,
    chugun_train,
    chugun_test,
    lom_train,
    lom_test,
    produv_train,
    produv_test,
    chronom_train,
    chronom_test,
    params,
)

  lom_train_transformed['ves_loma/ves_chuguna'] = lom_train_transformed['ves_loma'].values/chugun_train['VES'].values
  lom_test_transformed['ves_loma/ves_chuguna'] = lom_test_transformed['ves_loma'].values/chugun_test['VES'].values


In [21]:
from sklearn.model_selection import KFold
from tqdm.auto import tqdm, trange

def metric(c_true, tst_true, c_pred, tst_pred, c_tol=0.02, tst_tol=20):
    """Compute hit rates for the two targets and their average.

    A prediction is a "hit" when its absolute error is strictly below the
    tolerance of the corresponding target.

    Parameters
    ----------
    c_true, c_pred : array-like
        True and predicted values of the first target ("C").
    tst_true, tst_pred : array-like
        True and predicted values of the second target ("TST").
    c_tol : float, default 0.02
        Absolute-error tolerance for "C".
    tst_tol : float, default 20
        Absolute-error tolerance for "TST".

    Returns
    -------
    tuple of float
        ``(hit_rate_c, hit_rate_tst, overall)`` where ``overall`` is the mean
        of the two hit rates.
    """
    # Coerce to plain arrays so the comparison is positional. Subtracting two
    # pandas Series with different indexes would align on labels and produce
    # NaNs, which are then silently counted as misses.
    c_true, c_pred = np.asarray(c_true), np.asarray(c_pred)
    tst_true, tst_pred = np.asarray(tst_true), np.asarray(tst_pred)

    hits_c = (np.abs(c_true - c_pred) < c_tol).astype(np.int64)
    hits_t = (np.abs(tst_true - tst_pred) < tst_tol).astype(np.int64)

    n = c_pred.shape[0]
    return hits_c.sum() / n, hits_t.sum() / n, (hits_c + hits_t).sum() / 2 / n

def _take_rows(data, idx):
    """Select rows of ``data`` by integer position.

    pandas objects are indexed through ``.iloc`` (plain ``data[idx]`` on a
    DataFrame is a column lookup); anything else, e.g. a NumPy array, is
    indexed directly.
    """
    return data.iloc[idx] if hasattr(data, "iloc") else data[idx]


def pipeline(model_c, model_tst, train, test, y, sample, n_splits=10, random_state=None):
    """Cross-validate two models and build fold-averaged test predictions.

    For each of ``n_splits`` shuffled K-fold splits of ``train`` both models
    are refit on the training part, scored with ``metric`` on the held-out
    part, and used to predict ``test``. The test predictions are averaged
    over the folds and written to ``sample``.

    Parameters
    ----------
    model_c, model_tst : estimators
        Objects exposing ``fit(X, y)`` and ``predict(X)``; used for the "C"
        and "TST" targets respectively. They are refit on every fold.
    train : array-like or DataFrame
        Training features, one row per sample.
    test : array-like or DataFrame
        Features to predict for the submission.
        # assumes len(test) == len(sample) — TODO confirm against merge_data
    y : DataFrame or structured array
        Targets; must support row selection followed by ``["C"]`` / ``["TST"]``.
    sample : DataFrame
        Submission template. Its "C" and "TST" columns are overwritten
        in place and the same object is returned.
    n_splits : int, default 10
        Number of cross-validation folds.
    random_state : int or None, default None
        Seed forwarded to ``KFold`` for a reproducible shuffle.

    Returns
    -------
    DataFrame
        ``sample`` with fold-averaged predictions in "C" and "TST".
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Float zeros so the accumulation below does not change the column dtype.
    sample["C"] = 0.0
    sample["TST"] = 0.0

    res_c, res_t, res = [], [], []
    for train_idx, eval_idx in tqdm(kf.split(train), total=n_splits):
        cur_train = _take_rows(train, train_idx)
        cur_eval = _take_rows(train, eval_idx)

        cur_train_y = _take_rows(y, train_idx)
        cur_eval_y = _take_rows(y, eval_idx)

        model_c.fit(cur_train, cur_train_y["C"])
        model_tst.fit(cur_train, cur_train_y["TST"])

        eval_pred_c = model_c.predict(cur_eval)
        eval_pred_tst = model_tst.predict(cur_eval)

        # Submission predictions come from the test set (not the validation
        # fold), averaged over all folds.
        sample["C"] += model_c.predict(test) / n_splits
        sample["TST"] += model_tst.predict(test) / n_splits

        hit_rate_c, hit_rate_t, hit_rate = metric(cur_eval_y["C"], cur_eval_y["TST"], eval_pred_c, eval_pred_tst)
        res_c.append(hit_rate_c)
        res_t.append(hit_rate_t)
        res.append(hit_rate)

    res_c = np.array(res_c)
    res_t = np.array(res_t)
    res = np.array(res)
    print(f"Carbon score: {res_c.mean()} ± {res_c.std()}")
    print(f"Temperature score: {res_t.mean()} ± {res_t.std()}")
    print(f"Overall score: {res.mean()} ± {res.std()}")

    return sample