In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to every matplotlib figure drawn below.
sns.set()

# Never truncate wide DataFrames when they are displayed: show all columns.
pd.set_option('display.max_columns', None)

# Reload edited modules automatically before each cell runs, so changes to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pipeline import merge_data

# Training targets and the submission template.
target = pd.read_csv("../data/target_train.csv")
sample = pd.read_csv("../data/sample_submission.csv")

# Raw per-source tables. Every source ships as a train/test pair, so each
# pair is loaded together (train first, then test).
plavki_train, plavki_test = (
    pd.read_csv("../data/plavki_train.csv"),
    pd.read_csv("../data/plavki_test.csv"),
)
gas_train, gas_test = (
    pd.read_csv("../data/gas_train.csv"),
    pd.read_csv("../data/gas_test.csv"),
)
chugun_train, chugun_test = (
    pd.read_csv("../data/chugun_train.csv"),
    pd.read_csv("../data/chugun_test.csv"),
)
lom_train, lom_test = (
    pd.read_csv("../data/lom_train.csv"),
    pd.read_csv("../data/lom_test.csv"),
)
produv_train, produv_test = (
    pd.read_csv("../data/produv_train.csv"),
    pd.read_csv("../data/produv_test.csv"),
)
chronom_train, chronom_test = (
    pd.read_csv("../data/chronom_train.csv"),
    pd.read_csv("../data/chronom_test.csv"),
)
sip_train, sip_test = (
    pd.read_csv("../data/sip_train.csv"),
    pd.read_csv("../data/sip_test.csv"),
)

# Options forwarded to merge_data: per-source settings plus a global
# "vector_size". Their exact meaning is defined in pipeline.merge_data.
chugun_options = {"drop_outliers": True}
plavki_options = {"bow_count": 10, "clip_duration": True}
params = {
    "chugun": chugun_options,
    "plavki": plavki_options,
    "vector_size": 10,
}

# The source frames are passed positionally, in the order merge_data expects.
source_frames = [
    plavki_train, plavki_test,
    gas_train, gas_test,
    chugun_train, chugun_test,
    lom_train, lom_test,
    produv_train, produv_test,
    chronom_train, chronom_test,
    sip_train, sip_test,
]

# NOTE(review): judging by the unpacked names, this returns the merged
# train/test frames, the target frame and the numeric/categorical feature
# lists -- confirm against pipeline.merge_data.
train, test, y, num_features, cat_features = merge_data(
    sample, target, *source_frames, params
)

In [None]:
from sklearn.model_selection import KFold, TimeSeriesSplit
from tqdm.auto import tqdm, trange

def metric(c_true, tst_true, c_pred, tst_pred, pwc=None, pwt=None):
    """Compute hit rates for the carbon and temperature predictions.

    A carbon prediction is a hit when it is within 0.02 (exclusive) of the
    true value; a temperature prediction is a hit when it is within 20
    (exclusive) of the true value.

    Parameters
    ----------
    c_true, tst_true : array-like
        True carbon / temperature values.
    c_pred, tst_pred : numpy.ndarray
        Predicted carbon / temperature values (must support ``reshape``).
    pwc, pwt : object with ``inverse_transform``, optional
        When given, the corresponding predictions are mapped back to the
        original scale before being compared with the true values.

    Returns
    -------
    tuple
        ``(carbon_hit_rate, temperature_hit_rate, mean_of_both)``.
    """

    def _to_original_scale(values, transformer):
        # Predictions made in a transformed target space are mapped back;
        # the transformer works on column vectors, hence the reshapes.
        if transformer is None:
            return values
        return transformer.inverse_transform(values.reshape(-1, 1)).reshape(-1)

    c_pred = _to_original_scale(c_pred, pwc)
    tst_pred = _to_original_scale(tst_pred, pwt)

    # 1 where the absolute error is inside the tolerance, 0 otherwise.
    carbon_hits = np.asarray(np.abs(c_true - c_pred) < 0.02).astype(np.int64)
    temperature_hits = np.asarray(np.abs(tst_true - tst_pred) < 20).astype(np.int64)

    n_samples = c_pred.shape[0]
    carbon_rate = carbon_hits.sum() / n_samples
    temperature_rate = temperature_hits.sum() / n_samples
    overall_rate = (carbon_hits + temperature_hits).sum() / 2 / n_samples
    return carbon_rate, temperature_rate, overall_rate

def _fit_fold_model(model, x_fit, y_fit, x_eval, y_eval, transformer, early_stopping_rounds=35):
    """Fit ``model`` on one CV fold, optionally in a transformed target space.

    Without a transformer the model is fitted on the raw targets. With a
    transformer both the fit targets and (for CatBoost) the early-stopping
    targets are passed through ``transformer.transform`` -- always the SAME
    transformer, so the eval set lives in the space the model is trained in.
    """
    if transformer is None:
        model.fit(x_fit, y_fit)
        return

    def _transform(series):
        # The transformer expects a column vector; the model expects a 1-D target.
        return transformer.transform(series.values.reshape(-1, 1)).reshape(-1)

    fit_target = _transform(y_fit)

    # Only CatBoost gets an eval set with early stopping. The class is
    # recognised by name so this function does not need ``CatBoostRegressor``
    # to be imported in the notebook namespace.
    if type(model).__name__ == "CatBoostRegressor":
        model.fit(
            x_fit,
            fit_target,
            eval_set=(x_eval, _transform(y_eval)),
            early_stopping_rounds=early_stopping_rounds,
        )
    else:
        model.fit(x_fit, fit_target)


def pipeline(model_c, model_tst, train_c, test_c, train_tst, test_tst, y, sample, n_splits=10, pwc=None, pwt=None,
             random_state=None):
    """Cross-validate the carbon and temperature models and build a submission.

    For every shuffled K-fold split both models are refitted on the training
    part, scored with ``metric`` on the held-out part, and used to predict the
    test set; test predictions are averaged over the folds.

    Parameters
    ----------
    model_c, model_tst : estimator
        Models with ``fit``/``predict`` for the "C" and "TST" targets.
    train_c, test_c : numpy.ndarray
        Train / test feature matrices for the carbon model.
    train_tst, test_tst : numpy.ndarray
        Train / test feature matrices for the temperature model.
    y : pandas.DataFrame
        Targets with columns "C" and "TST", row-aligned with the train matrices.
    sample : pandas.DataFrame
        Submission frame. It is modified IN PLACE: columns "C" and "TST" are
        overwritten with the fold-averaged predictions.
    n_splits : int
        Number of CV folds.
    pwc, pwt : transformer with ``transform``/``inverse_transform``, optional
        Target transformers for "C" / "TST". When given, the model is trained
        in the transformed space and predictions are mapped back afterwards.
    random_state : int, optional
        Seed for the fold shuffling. ``None`` (default) keeps the previous
        behaviour of a different random split on every call.

    Returns
    -------
    (pandas.DataFrame, tuple)
        The updated ``sample`` and the mean
        ``(carbon, temperature, overall)`` hit rates over the folds.

    Notes
    -----
    For CatBoost models the held-out fold is also used as the early-stopping
    eval set, so the reported scores for those models are optimistic.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Float accumulators for the fold-averaged test predictions.
    sample["C"] = 0.0
    sample["TST"] = 0.0

    res_c, res_t, res = [], [], []
    for train_idx, test_idx in tqdm(kf.split(train_c), total=n_splits):
        cur_train_c, cur_eval_c = train_c[train_idx], train_c[test_idx]
        cur_train_tst, cur_eval_tst = train_tst[train_idx], train_tst[test_idx]
        cur_train_y, cur_eval_y = y.iloc[train_idx], y.iloc[test_idx]

        _fit_fold_model(model_c, cur_train_c, cur_train_y["C"], cur_eval_c, cur_eval_y["C"], pwc)
        # The TST eval targets are transformed with pwt (previously pwc was
        # used here by mistake, feeding early stopping targets on the wrong scale).
        _fit_fold_model(model_tst, cur_train_tst, cur_train_y["TST"], cur_eval_tst, cur_eval_y["TST"], pwt)

        eval_pred_c = model_c.predict(cur_eval_c)
        eval_pred_tst = model_tst.predict(cur_eval_tst)

        # Averaging happens in the (possibly transformed) prediction space;
        # the inverse transform is applied once after the loop.
        sample["C"] += model_c.predict(test_c) / n_splits
        sample["TST"] += model_tst.predict(test_tst) / n_splits

        hit_rate_c, hit_rate_t, hit_rate = metric(
            cur_eval_y["C"], cur_eval_y["TST"], eval_pred_c, eval_pred_tst, pwc, pwt
        )
        res_c.append(hit_rate_c)
        res_t.append(hit_rate_t)
        res.append(hit_rate)

    res_c = np.array(res_c)
    res_t = np.array(res_t)
    res = np.array(res)
    print(f"Carbon score: {res_c.mean()} ± {res_c.std()}")
    print(f"Temperature score: {res_t.mean()} ± {res_t.std()}")    
    print(f"Overall score: {res.mean()} ± {res.std()}") 

    # Map the averaged predictions back to the original target scale.
    if pwt is not None:
        sample["TST"] = pwt.inverse_transform(sample["TST"].values.reshape(-1, 1)).reshape(-1)
    if pwc is not None:
        sample["C"] = pwc.inverse_transform(sample["C"].values.reshape(-1, 1)).reshape(-1)

    # Submitted carbon values are restricted to the [0, 1] range.
    sample["C"] = sample["C"].clip(0, 1)

    return sample, (res_c.mean(), res_t.mean(), res.mean())

In [None]:
from sklearn.preprocessing import PowerTransformer

# Targets as (n_samples, 1) column vectors, the layout the transformers expect.
carbon_column = y['C'].values.reshape(-1, 1)
temperature_column = y['TST'].values.reshape(-1, 1)

# Box-Cox transformers for the two targets; fitting them also produces a
# transformed copy of the targets (y_exp).
pwrC = PowerTransformer('box-cox')
pwrT = PowerTransformer('box-cox')

y_exp = y.copy()
y_exp["C"] = pwrC.fit_transform(carbon_column)
y_exp["TST"] = pwrT.fit_transform(temperature_column)

# Alternative transformers using PowerTransformer's default method, fitted on
# the same targets.
pwrC2 = PowerTransformer().fit(carbon_column)
pwrT2 = PowerTransformer().fit(temperature_column)

In [None]:
def _read_gas_features(csv_path):
    """Read a gas feature table and index it by the "NPLV" column."""
    return pd.read_csv(csv_path).set_index("NPLV")


# Pre-computed gas features for the train set, one selection per target.
gas_filtered_for_C = _read_gas_features("../gas/gas_filtered_for_C.csv")
gas_filtered_for_TST = _read_gas_features("../gas/gas_filtered_for_TST.csv")

# Gas features for the test set (columns are narrowed to the train selections later).
gas_test_autofeatures = _read_gas_features("../gas/gas_test_autofeatures.csv")

In [None]:
# Train-side gas features are used as loaded (these are aliases, not copies).
train_gas_C, train_gas_TST = gas_filtered_for_C, gas_filtered_for_TST

# Test-side gas features: keep exactly the columns present in the matching
# train table, in the same order.
test_gas_C = gas_test_autofeatures[train_gas_C.columns]
test_gas_TST = gas_test_autofeatures[train_gas_TST.columns]

In [None]:
# Attach the gas features to the merged train/test frames. The gas tables are
# indexed by NPLV, so each row is matched on the frame's "NPLV" column.
train_C, train_TST = (
    train.join(gas_features, on="NPLV")
    for gas_features in (train_gas_C, train_gas_TST)
)
test_C, test_TST = (
    test.join(gas_features, on="NPLV")
    for gas_features in (test_gas_C, test_gas_TST)
)

In [None]:
# Extra per-NPLV feature tables for train and test.
res_train = pd.read_csv("final_train.csv")
res_test = pd.read_csv("final_test.csv")

# Index both tables by NPLV once, then attach them to every feature frame.
res_train_by_nplv = res_train.set_index("NPLV")
res_test_by_nplv = res_test.set_index("NPLV")

# NOTE(review): these assignments extend the frames built in the previous
# cell, so re-running this cell on its own presumably fails on overlapping
# column names -- re-run the previous cell first.
train_C = train_C.join(res_train_by_nplv, on="NPLV")
train_TST = train_TST.join(res_train_by_nplv, on="NPLV")
test_C = test_C.join(res_test_by_nplv, on="NPLV")
test_TST = test_TST.join(res_test_by_nplv, on="NPLV")

In [None]:
# CatBoostRegressor is used below (and by pipeline()) but was never imported
# in this notebook, which raised NameError on a fresh kernel.
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge

# Carbon model: CatBoost (gets early stopping inside pipeline()).
# Temperature model: a small XGBoost ensemble.
model_c = CatBoostRegressor(n_estimators=500, grow_policy="Depthwise", silent=True)
model_tst = XGBRegressor(n_estimators=40, n_jobs=-1)

# Linear baseline alternative:
# model_c = Ridge()
# model_tst = Ridge()

# 50-fold CV with both targets trained in Box-Cox space; `res` is the
# submission frame, `metr` the mean (carbon, temperature, overall) hit rates.
res, metr = pipeline(model_c, model_tst, train_C.values, test_C.values, train_TST.values, 
                     test_TST.values, y, sample, n_splits=50, pwc=pwrC, pwt=pwrT)

In [None]:
# Write the submission frame to disk without the DataFrame index column.
submission_path = "final_result.csv"
res.to_csv(submission_path, index=False)