In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to every matplotlib figure drawn in this kernel.
sns.set()

# max_columns=None removes the column limit, so wide DataFrames are displayed in full.
pd.set_option('display.max_columns', None)

%load_ext autoreload
%autoreload 2

In [2]:
# Read the target table and the sample-submission template from ../data.
target = pd.read_csv("../data/target_train.csv")
sample = pd.read_csv("../data/sample_submission.csv")

# Every source table below is read as a train/test pair with matching file names.
plavki_train = pd.read_csv("../data/plavki_train.csv")
plavki_test = pd.read_csv("../data/plavki_test.csv")

gas_train = pd.read_csv("../data/gas_train.csv")
gas_test = pd.read_csv("../data/gas_test.csv")

chugun_train = pd.read_csv("../data/chugun_train.csv")
chugun_test = pd.read_csv("../data/chugun_test.csv")

lom_train = pd.read_csv("../data/lom_train.csv")
lom_test = pd.read_csv("../data/lom_test.csv")

produv_train = pd.read_csv("../data/produv_train.csv")
produv_test = pd.read_csv("../data/produv_test.csv")

chronom_train = pd.read_csv("../data/chronom_train.csv")
chronom_test = pd.read_csv("../data/chronom_test.csv")

sip_train = pd.read_csv("../data/sip_train.csv")
sip_test = pd.read_csv("../data/sip_test.csv")

# merge_data lives in the project-local `pipeline` module, which is not shown in this notebook.
from pipeline import merge_data

# Per-table options passed to merge_data as its last argument.
# NOTE(review): what "bow_count" controls is defined inside pipeline.merge_data — confirm there.
params = {
    "chugun": {},
    "plavki": {"bow_count": 10}
}

# merge_data returns five values. Later cells use `train`/`test` via `.values` and `.columns`,
# and `y` via `.iloc` with "C" and "TST" columns.
# Presumably num_features / cat_features are lists of column names — verify against merge_data.
train, test, y, num_features, cat_features = merge_data(
    sample, target, plavki_train, plavki_test, gas_train, gas_test, chugun_train, chugun_test, 
    lom_train, lom_test, produv_train, produv_test, chronom_train, chronom_test, 
    sip_train, sip_test, params
)

  lom_train_transformed['ves_loma/ves_chuguna'] = lom_train_transformed['ves_loma'].values/chugun_train['VES'].values
  lom_test_transformed['ves_loma/ves_chuguna'] = lom_test_transformed['ves_loma'].values/chugun_test['VES'].values


In [10]:
from sklearn.model_selection import KFold, TimeSeriesSplit
from tqdm.auto import tqdm, trange

def metric(c_true, tst_true, c_pred, tst_pred):
    """Compute hit rates for the C and TST predictions.

    A prediction counts as a hit when its absolute error is strictly below
    0.02 for C and strictly below 20 for TST.

    Parameters
    ----------
    c_true, tst_true : array-like
        Ground-truth C and TST values.
    c_pred, tst_pred : array-like
        Predicted C and TST values, in the same order as the ground truth.

    Returns
    -------
    tuple of three floats
        ``(hit_rate_c, hit_rate_t, overall)`` where ``overall`` is the mean
        of the two per-sample hit indicators, averaged over all samples.
    """
    # Compare by position, never by index label: arithmetic between two pandas
    # Series aligns on the index, so a fold slice (labels taken from the full
    # frame) against a default-indexed Series would silently produce NaN and
    # count every sample as a miss. Converting to arrays also admits lists.
    c_true, c_pred = np.asarray(c_true), np.asarray(c_pred)
    tst_true, tst_pred = np.asarray(tst_true), np.asarray(tst_pred)

    delta_c = np.abs(c_true - c_pred)
    hit_rate_c = (delta_c < 0.02).astype(np.int64)

    delta_t = np.abs(tst_true - tst_pred)
    hit_rate_t = (delta_t < 20).astype(np.int64)

    N = c_pred.shape[0]
    return np.sum(hit_rate_c) / N, np.sum(hit_rate_t) / N, np.sum(hit_rate_c + hit_rate_t) / 2 / N

def _take_rows(data, idx):
    """Select rows of ``data`` by integer position for arrays and pandas objects alike."""
    # Plain ``data[idx]`` on a DataFrame would treat idx as column labels.
    return data.iloc[idx] if hasattr(data, "iloc") else data[idx]


def pipeline(model_c, model_tst, train_c, test_c, train_tst, test_tst, y, sample, n_splits=10, random_state=42):
    """Cross-validate two regressors and build a fold-averaged test prediction.

    For each shuffled K-fold split, ``model_c`` and ``model_tst`` are refit on
    the training part, scored on the held-out part with ``metric``, and used to
    predict the test set. The test predictions are averaged over the folds.
    Mean and standard deviation of the per-fold scores are printed.

    Parameters
    ----------
    model_c, model_tst : estimators exposing ``fit(X, y)`` and ``predict(X)``
        Refit in place on every fold; after the call they hold the fit from
        the last fold only.
    train_c, test_c : array-like or DataFrame
        Train/test feature matrices for the "C" model.
    train_tst, test_tst : array-like or DataFrame
        Train/test feature matrices for the "TST" model.
    y : DataFrame
        Targets with "C" and "TST" columns, row-aligned with the train matrices.
    sample : DataFrame
        Submission template with one row per test row. It is not modified.
    n_splits : int, default 10
        Number of folds.
    random_state : int or None, default 42
        Seed for the fold shuffle, so scores and the averaged prediction are
        reproducible for deterministic models. Pass ``None`` for a different
        split on every call.

    Returns
    -------
    DataFrame
        A copy of ``sample`` whose "C" and "TST" columns hold the fold-averaged
        predictions, with "C" clipped to the range [0, 1].
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Work on a copy so the caller's template is not overwritten as a side effect.
    sample = sample.copy()
    sample["C"] = 0.0
    sample["TST"] = 0.0

    res_c, res_t, res = [], [], []
    for train_idx, test_idx in tqdm(kf.split(train_c), total=n_splits):
        cur_train_c = _take_rows(train_c, train_idx)
        cur_eval_c = _take_rows(train_c, test_idx)

        cur_train_tst = _take_rows(train_tst, train_idx)
        cur_eval_tst = _take_rows(train_tst, test_idx)

        cur_train_y = y.iloc[train_idx]
        cur_eval_y = y.iloc[test_idx]

        model_c.fit(cur_train_c, cur_train_y["C"])
        model_tst.fit(cur_train_tst, cur_train_y["TST"])

        eval_pred_c = model_c.predict(cur_eval_c)
        eval_pred_tst = model_tst.predict(cur_eval_tst)

        # Each fold contributes 1/n_splits of the final test prediction.
        sample["C"] += model_c.predict(test_c) / n_splits
        sample["TST"] += model_tst.predict(test_tst) / n_splits

        hit_rate_c, hit_rate_t, hit_rate = metric(cur_eval_y["C"], cur_eval_y["TST"], eval_pred_c, eval_pred_tst)
        res_c.append(hit_rate_c)
        res_t.append(hit_rate_t)
        res.append(hit_rate)

    res_c = np.array(res_c)
    res_t = np.array(res_t)
    res = np.array(res)
    print(f"Carbon score: {res_c.mean()} ± {res_c.std()}")
    print(f"Temperature score: {res_t.mean()} ± {res_t.std()}")
    print(f"Overall score: {res.mean()} ± {res.std()}")

    sample["C"] = sample["C"].clip(0, 1)

    return sample

In [5]:
from pipeline import transform_for_linear

# transform_for_linear is a project-local helper (not shown here) that returns a train/test pair.
# NOTE(review): train_scaled / test_scaled are not used by any later cell visible in this
# notebook; the XGBoost run below is fed train.values / test.values instead.
train_scaled, test_scaled = transform_for_linear(train, num_features, cat_features, test)

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR

from xgboost import XGBRegressor

In [11]:
# Two XGBoost regressors with identical hyper-parameters: one for the "C" target, one for "TST".
model_c = XGBRegressor(n_estimators=20, max_depth=7)
model_tst = XGBRegressor(n_estimators=20, max_depth=7)

# Both models get the same feature matrices. `.values` drops the column names, so
# importances read from the models afterwards are positional.
# `res` is the submission frame returned by pipeline().
res = pipeline(model_c, model_tst, train.values, test.values, train.values, test.values, y, sample, n_splits=20)

  0%|          | 0/20 [00:00<?, ?it/s]

Carbon score: 0.6560072815533982 ± 0.04387345736163744
Temperature score: 0.6414395070948469 ± 0.04961485763430513
Overall score: 0.6487233943241225 ± 0.04153974605676401


In [15]:
# Top 20 features of model_c by importance. pipeline() refits model_c on every fold, so
# these values come from the last fold's fit only. Importances are paired with
# train.columns by position.
sorted(zip(model_c.feature_importances_, train.columns), reverse=True)[:20]

[(0.4214841, 'truncated_NMZ'),
 (0.052253712, 'gas_sum_volume_O2'),
 (0.042881526, 'VES'),
 (0.038848143, 'total_operations_опер_0'),
 (0.036443055, 'min_ratio'),
 (0.028542645, 'total_duration_опер_0'),
 (0.026982587, 'SI'),
 (0.021258013, 'unique_ratio'),
 (0.020968104, 'gas_sum_volume_CO2'),
 (0.015138853, 'gas_mean_volume_O2'),
 (0.014687493, 'P'),
 (0.011710698, 'O2_опер_0'),
 (0.011186087, 'durationproduv_'),
 (0.011016624, 'gas_std_volume_AR'),
 (0.010167464, 'max_duration_опер_0'),
 (0.009297233, 'TI'),
 (0.008904938, 'gas_std_V'),
 (0.008696275, 'gas_mean_O2_pressure'),
 (0.008431477, 'total_operations_межпл.прост._1'),
 (0.008256948, 'dayofmonth')]

In [56]:
# Write the submission frame to the working directory without the index column.
# NOTE(review): the file is named "forest.csv", but the only visible run that produces
# `res` uses XGBRegressor — confirm which model this file is meant to hold.
res.to_csv("forest.csv", index=False)