In [21]:
import pandas as pd 
import numpy as np
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVR
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
MAX_TOWERS = 6
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
np.random.seed(17)
import random
random.seed(17)
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

In [2]:
def get_train():
    train_main = pd.read_csv("../data/task1/train_1.7.csv", encoding="cp1251")
    train_aux_coords = pd.read_csv("../data/task1_additional/coords_train_1.1.csv", encoding="cp1251")
    train_aux_frac = pd.read_csv("../data/task1_additional/frac_train_1.csv", encoding="cp1251")
    train_aux_gdis = pd.read_csv("../data/task1_additional/gdis_train1.2.csv", encoding="cp1251")
    
    
    train_frac_main = pd.merge(train_main, train_aux_frac,how="left", left_on="Скважина", right_on="Скважина")
    all_recs = pd.merge(train_frac_main, train_aux_gdis,how="left", left_on="Скважина", right_on="Скважина")
    print(all_recs.shape)
    return all_recs

def get_test():
    test_main = pd.read_csv("../data/task1/test_1.9.csv", encoding="cp1251")
    test_aux_coords = pd.read_csv("../data/task1_additional/coords_train_1.1.csv", encoding="cp1251")
    test_aux_frac = pd.read_csv("../data/task1_additional/frac_test_1.csv", encoding="cp1251")
    test_aux_gdis = pd.read_csv("../data/task1_additional/gdis_test1.2.csv", encoding="cp1251")
    
    
    test_frac_main = pd.merge(test_main, test_aux_frac,how="left", left_on="Скважина", right_on="Скважина")
    all_recs = pd.merge(test_frac_main, test_aux_gdis,how="left", left_on="Скважина", right_on="Скважина")
    print(all_recs.shape)
    return all_recs

In [3]:
#sort by converted date and group
def convert_and_sort(df):
    df["Дата"] =  df["Дата"].apply(pd.to_datetime)
    return df.sort_values(by=["Скважина", "Дата"])

def get_non_useful(df):
    non_useful_columns = []
    for c in df.columns:
        null_columns = df[df[c].isnull()]
        if len(null_columns)== len(df):
            non_useful_columns.append(c)
    return non_useful_columns

def drop_non_useful(train, test):
    non_useful = set(get_non_useful(train)) |set(get_non_useful(test))
    print("%s dropped"% non_useful)
    return train.drop(list(non_useful), axis=1), test.drop(list(non_useful), axis=1)

def get_float(v):
    v = str(v)
    if v != "NaN":
        new = v.replace(",",".")
        return float(new)
    return v

def get_target(df, column="Нефть, т"):
    target = df[column]
    print("%s dropped"% column)
    return df.drop([column], axis=1), target.apply(get_float)

#drop non present columns in test
def drop_not_present(train, test):
    absent_columns = list(set(train.columns) - set(test.columns))
    print("%s dropped"% absent_columns)
    return train.drop(absent_columns, axis=1), test
    
def show_uniq_test_train(train, test):
    #check all values that have zero ans nan only
    for c in train.columns:
        un = train[c].unique()
        if len(un)<100:
            tun = test[c].unique()
            print("%s ;train: %s; test:%s"%(c, un, tun))

In [4]:
def common_data_pipeline(train, test):
    print(train.shape)
    print(test.shape)
    
    y = None
    train=convert_and_sort(train)
    train, test = drop_non_useful(train, test)
    #remove target from train
    train, y = get_target(train)
    train, test = drop_not_present(train, test)
    print(train.shape)
    print(test.shape)
    
    return train, test, y

In [5]:
def split_continious_date_categorical_text(df):
    group_id = ["Скважина"]
    text = ["Причина простоя",
            "Куст",
            "Состояние на конец месяца",
            "Причина простоя.1",
            "Мероприятия",
            "Проппант"]
    categorical = ["Тип испытания",
                   "Тип скважины",
                   "Неустановившийся режим",
                   "ГТМ",
                   "Метод",
                   "Характер работы",
                   "Состояние",
                   "Пласт МЭР", 
                   "Способ эксплуатации", 
                   "Тип насоса", 
                   "Состояние на конец месяца", 
                   "Номер бригады", 
                   "Фонтан через насос", 
                   "Нерентабельная",
                   "Назначение по проекту",
                   "Группа фонда",
                   "Тип дополнительного оборудования",
                   "Марка ПЭД",
                   "Тип ГЗУ",
                   "ДНС",
                   "КНС",
                   #useless potentially
                   "Диаметр плунжера",
                   "Природный газ, м3",
                   "Конденсат, т",
                   "Длина хода плунжера ШГН",
                   "Коэффициент подачи насоса",
                   "Дебит конденсата",
                   "Вязкость воды в пластовых условиях",
                   "Газ из газовой шапки, м3",
                   "Число качаний ШГН",
                   "Коэффициент сепарации",
                   "SKIN",
                   "КН закрепленный",
                   # radically different
                   "Время в работе",
                   "Радиус контура питания",
                   "Время в накоплении",
                   "Время накопления"
                   ]
    dates = ["Дата", 
             "Дата ГРП",
             "Время до псевдоуст-ся режима", 
             "Дата запуска после КРС", 
             "Дата пуска", 
             "Дата останова",
             "Дата ввода в эксплуатацию"]
    continious = list(set(df.columns) - set(dates) - set(categorical) - set(text) - set(group_id))
    return (df[group_id],df[continious], df[dates], df[categorical], df[text])

In [6]:
def get_object_columns(df):
    objects = []
    for c in df.columns:
        if df[c].dtype != pd.np.float:
            objects.append(c)
    return objects

def convert_locale_to_float(df):
    loc_float = get_object_columns(df)
    converted = df.copy()
    for c in loc_float:
        converted.loc[:,c] = df[c].apply(get_float)
    return converted
        
def fill_with_mean(train, test):
    means=train.mean()
    norm_train = train.fillna(means)
    norm_test = test.fillna(means)
    return norm_train, norm_test

# now we have clear non-normalized data, let's normalize first
def normalize(train, test):
    scaler = StandardScaler()
    norm_train = pd.DataFrame(scaler.fit_transform(train), columns=train.columns, index = train.index)
    norm_test = pd.DataFrame(scaler.transform(test), columns=test.columns, index = test.index)
    return norm_train, norm_test

In [7]:
def cont_transform_pipeline(train, test):
    train_f = convert_locale_to_float(train)
    test_f = convert_locale_to_float(test)
    train_cont, test_cont = fill_with_mean(train_f, test_f)
    train_cont, test_cont = normalize(train_cont, test_cont)
    print(train_cont.isnull().values.any() or test_cont.isnull().values.any())
    
    print(train_cont.shape)
    print(test_cont.shape)
    return train_cont, test_cont

In [8]:
def transform_cats_to_labels(train_cat, test_cat):
    transformed_df = train_cat.copy()
    trans_test = test_cat.copy()
    for c in train_cat.columns:
        encoder = LabelEncoder()
        column_train = train_cat[c].astype(str)
        column_test = test_cat[c].astype(str)
        combined = pd.concat([column_train, column_test])
        encoder.fit(combined)
        transformed_df[c] = encoder.transform(column_train).reshape(-1,1)
        trans_test[c] = encoder.transform(column_test).reshape(-1,1)
    return transformed_df, trans_test

In [9]:
def cat_transform_pipeline(train, test):
    train_cat, test_cat = transform_cats_to_labels(train, test)
    print(train_cat.shape)
    print(test_cat.shape)
    return train_cat, test_cat

In [10]:
def clean_non_targeted(train_array, y_train):
    clean_array = []
    train_array.append(y_train)
    #clear nans in target
    indexes_to_delete = y_train[y_train.isnull()].index
    for df in train_array:
        item = df.drop(index=indexes_to_delete)
        clean_array.append(item)
        print(item.shape)
    return clean_array

In [11]:
def get_preds_for_cats(train, test, y):
    cb_regressor = CatBoostRegressor()
    train_catboost_preds = cross_val_predict(cb_regressor, train, y)
    cb_regressor.fit(train,y=y)
    test_catboost_preds = pd.Series(cb_regressor.predict(test), index=test.index)
    return train_catboost_preds, test_catboost_preds

In [12]:
def get_cont_ensemble():
    ridge = Ridge()
    rtree = RandomForestRegressor(n_jobs=-1, n_estimators=50)
    svr = LinearSVR()
    return [ridge, rtree, svr]

def get_cont_ensemble_names():
    return ["ridge", "rtree", "svr"]

In [13]:
def get_meta_train_preds(X, y, train_mixture, mix_cols):
    predicts = []
    for cl in get_cont_ensemble():
        predicts.append(cross_val_predict(cl, X,y, n_jobs=-1))
    predicts.append(train_mixture)
    return pd.DataFrame(np.vstack(predicts).transpose(), index=y.index, columns=get_cont_ensemble_names()+mix_cols)

def get_meta_test_predict(X_train, y_train, X_test, test_mixture, mix_cols):
    test_predicts = []
    for cl in get_cont_ensemble():
        print(cross_val_score(cl, X_train, y_train, n_jobs=-1, scoring="neg_mean_absolute_error"))
        cl.fit(X_train, y_train)
        pr = cl.predict(X_test)
        test_predicts.append(pr)
    test_predicts.append(test_mixture)
    return  pd.DataFrame(np.vstack(test_predicts).transpose(), index=X_test.index, columns=get_cont_ensemble_names()+mix_cols)

def get_stacked_ensemble_predict(X_meta, y, X_test):
    regressor = xgb.XGBRegressor()
    regressor.fit(X_meta, y)
    return pd.DataFrame(regressor.predict(X_test), index=X_test.index)

In [14]:
def get_n_item_index(group_size, df, group):
    new_df = pd.concat([df, group], axis = 1)
    index = []
    group = new_df.groupby(["Скважина"])
    for name, group in group:
        if len(group)<group_size:
            continue
        for start in range(len(group.index) - group_size):
            gr =group.index[start:start+group_size]
            index.append(gr)
    return index

def get_timed_ds(meta_size, df, group, y):
    if meta_size >= 1:
        meta_indexes = get_n_item_index(meta_size, df, group)
        first_value_idx = []
        timed_ds = df.copy()
        metas = []
        columns = []
        for i in range(meta_size):
            columns.append("meta%s"%str(i))
        for a in meta_indexes:
            first_value_idx.append(a[0])
            metas.append(list(y.loc[a]))
        metas_df = pd.DataFrame.from_records(metas, index=first_value_idx, columns=columns)
        return pd.concat([timed_ds.loc[first_value_idx], metas_df], axis=1)
    elif meta_size == 0:
        return df
    
def get_n_tower_predictions(n, train, y, test, train_group, train_mix, test_mix, mix_col):
    X_meta_train = get_meta_train_preds(train, y, train_mixture=train_mix, mix_cols=mix_col)
    X_meta_test = get_meta_test_predict(train, y, test, test_mixture=test_mix, mix_cols=mix_col)
    test_predictions= []
    for i in range(0,n):
        train_timed_ds = get_timed_ds(i,X_meta_train, train_group, y)
        y_timed = y.loc[train_timed_ds.index]
        test_predict = get_stacked_ensemble_predict(train_timed_ds, y_timed,X_meta_test)
        test_predictions.append(test_predict)
        X_meta_test["meta%s"%i]=test_predict
    return pd.concat(test_predictions,axis=1)

In [15]:
def create_submission(fname, df, constant):
    print(701.4750 - np.mean(df))
    final_pred = df+ (constant - np.mean(df))
    final_pred = pd.Series(np.squeeze(final_pred))
    final_pred.to_csv(fname,header=["_VAL_"],index_label=["_ID_"])

In [32]:
def get_clean_data(train, test):
    train, test, y_train  = common_data_pipeline(train, test)
    train_group, train_cont, train_dat, train_cat, train_text = split_continious_date_categorical_text(train)
    test_group, test_cont, test_dat, test_cat, test_text = split_continious_date_categorical_text(test)
    train_cont, test_cont = cont_transform_pipeline(train_cont, test_cont)
    train_cat, test_cat = cat_transform_pipeline(train_cat, test_cat)
    train_cont, train_group, train_cat, y_train = clean_non_targeted([train_cont, train_group, train_cat], y_train)
    train_cat_preds, test_cat_preds = get_preds_for_cats(train_cat, test_cat, y_train)
    return train_cont, y_train, test_cont, train_group, train_cat_preds, test_cat_preds

In [35]:
def get_prediction(train, test):
    train_cont, y_train, test_cont, train_group, train_cat_preds, test_cat_preds = get_clean_data(train,test)
    time_serie_pred = get_n_tower_predictions(
        6, train_cont, y_train,test_cont, train_group, train_cat_preds, test_cat_preds, ["catboost"]
    ).values.reshape(-1,1)
    return time_serie_pred

In [36]:
preds = get_prediction(get_train(), get_test())

(5735, 147)
(319, 138)
(5735, 147)
(319, 138)
{'Агент закачки', 'Примечание', 'Станок-качалка', 'Тип газосепаратора', 'Фирма ГРП'} dropped
Нефть, т dropped
['ТП(ГРП) Дебит жидкости скорр-ый', 'ГП - Общий прирост Qн', 'Дебит жидкости', 'Жидкость, м3', 'ТП(ИДН) Дебит жидкости', 'ТП(ИДН) Дебит жидкости скорр-ый', 'Нефть, м3', 'ТП(ГРП) Дебит жидкости'] dropped
(5735, 133)
(319, 133)
False
(5735, 83)
(319, 83)
(5735, 37)
(319, 37)
(4764, 83)
(4764, 1)
(4764, 37)
(4764,)
0:	learn: 578.0507488	total: 11.1ms	remaining: 11.1s
1:	learn: 568.0717072	total: 15.8ms	remaining: 7.89s
2:	learn: 559.5670331	total: 20.5ms	remaining: 6.8s
3:	learn: 550.8867115	total: 24.5ms	remaining: 6.11s
4:	learn: 542.2041746	total: 31.3ms	remaining: 6.22s
5:	learn: 533.6993548	total: 36.1ms	remaining: 5.99s
6:	learn: 526.2889341	total: 41ms	remaining: 5.82s
7:	learn: 519.0656201	total: 44.9ms	remaining: 5.57s
8:	learn: 511.5704769	total: 49.1ms	remaining: 5.41s
9:	learn: 504.8806050	total: 53.5ms	remaining: 5.3s
10:	

157:	learn: 326.3427919	total: 756ms	remaining: 4.03s
158:	learn: 326.3162697	total: 762ms	remaining: 4.03s
159:	learn: 326.2833733	total: 771ms	remaining: 4.05s
160:	learn: 326.1395752	total: 781ms	remaining: 4.07s
161:	learn: 325.9605755	total: 787ms	remaining: 4.07s
162:	learn: 325.9343898	total: 790ms	remaining: 4.06s
163:	learn: 325.8113063	total: 795ms	remaining: 4.05s
164:	learn: 325.6673948	total: 800ms	remaining: 4.05s
165:	learn: 325.6584381	total: 803ms	remaining: 4.03s
166:	learn: 325.5397259	total: 808ms	remaining: 4.03s
167:	learn: 325.4267182	total: 812ms	remaining: 4.02s
168:	learn: 325.3793397	total: 816ms	remaining: 4.01s
169:	learn: 325.1070563	total: 820ms	remaining: 4s
170:	learn: 325.0451223	total: 824ms	remaining: 3.99s
171:	learn: 324.8802244	total: 828ms	remaining: 3.99s
172:	learn: 324.8096615	total: 832ms	remaining: 3.98s
173:	learn: 324.6821317	total: 837ms	remaining: 3.97s
174:	learn: 324.4491500	total: 841ms	remaining: 3.97s
175:	learn: 324.3265632	total: 

331:	learn: 309.4598886	total: 1.54s	remaining: 3.09s
332:	learn: 309.3072583	total: 1.55s	remaining: 3.1s
333:	learn: 309.2137002	total: 1.56s	remaining: 3.1s
334:	learn: 309.1693731	total: 1.56s	remaining: 3.1s
335:	learn: 309.0932885	total: 1.57s	remaining: 3.1s
336:	learn: 309.0639097	total: 1.57s	remaining: 3.1s
337:	learn: 308.8967620	total: 1.58s	remaining: 3.09s
338:	learn: 308.8734431	total: 1.58s	remaining: 3.08s
339:	learn: 308.7863866	total: 1.59s	remaining: 3.08s
340:	learn: 308.7453509	total: 1.59s	remaining: 3.08s
341:	learn: 308.5803325	total: 1.59s	remaining: 3.07s
342:	learn: 308.4293566	total: 1.6s	remaining: 3.06s
343:	learn: 308.3566900	total: 1.6s	remaining: 3.06s
344:	learn: 308.2587157	total: 1.61s	remaining: 3.06s
345:	learn: 308.2109780	total: 1.61s	remaining: 3.05s
346:	learn: 308.1166286	total: 1.62s	remaining: 3.05s
347:	learn: 308.1100431	total: 1.63s	remaining: 3.04s
348:	learn: 307.9722562	total: 1.63s	remaining: 3.04s
349:	learn: 307.9190612	total: 1.63

491:	learn: 300.9741858	total: 2.52s	remaining: 2.6s
492:	learn: 300.8851179	total: 2.52s	remaining: 2.6s
493:	learn: 300.8684003	total: 2.53s	remaining: 2.6s
494:	learn: 300.8682379	total: 2.54s	remaining: 2.59s
495:	learn: 300.8475337	total: 2.55s	remaining: 2.59s
496:	learn: 300.7915445	total: 2.55s	remaining: 2.58s
497:	learn: 300.7576440	total: 2.56s	remaining: 2.58s
498:	learn: 300.7509836	total: 2.56s	remaining: 2.57s
499:	learn: 300.7374454	total: 2.56s	remaining: 2.56s
500:	learn: 300.7342872	total: 2.57s	remaining: 2.56s
501:	learn: 300.7188443	total: 2.57s	remaining: 2.55s
502:	learn: 300.7028987	total: 2.58s	remaining: 2.55s
503:	learn: 300.6761386	total: 2.58s	remaining: 2.54s
504:	learn: 300.6553337	total: 2.58s	remaining: 2.53s
505:	learn: 300.6449347	total: 2.59s	remaining: 2.53s
506:	learn: 300.6448180	total: 2.59s	remaining: 2.52s
507:	learn: 300.6193080	total: 2.6s	remaining: 2.52s
508:	learn: 300.5802886	total: 2.6s	remaining: 2.51s
509:	learn: 300.5547249	total: 2.

665:	learn: 296.1063357	total: 3.49s	remaining: 1.75s
666:	learn: 296.1058594	total: 3.5s	remaining: 1.75s
667:	learn: 296.0938017	total: 3.51s	remaining: 1.74s
668:	learn: 296.0463357	total: 3.52s	remaining: 1.74s
669:	learn: 296.0308743	total: 3.53s	remaining: 1.74s
670:	learn: 295.9694306	total: 3.53s	remaining: 1.73s
671:	learn: 295.9401522	total: 3.54s	remaining: 1.73s
672:	learn: 295.9032281	total: 3.55s	remaining: 1.72s
673:	learn: 295.8687513	total: 3.56s	remaining: 1.72s
674:	learn: 295.8687232	total: 3.56s	remaining: 1.72s
675:	learn: 295.8616360	total: 3.57s	remaining: 1.71s
676:	learn: 295.8492535	total: 3.58s	remaining: 1.71s
677:	learn: 295.8239331	total: 3.59s	remaining: 1.71s
678:	learn: 295.8239070	total: 3.59s	remaining: 1.7s
679:	learn: 295.8044240	total: 3.6s	remaining: 1.69s
680:	learn: 295.8044147	total: 3.61s	remaining: 1.69s
681:	learn: 295.7769004	total: 3.61s	remaining: 1.68s
682:	learn: 295.7645255	total: 3.62s	remaining: 1.68s
683:	learn: 295.7371976	total: 

827:	learn: 293.2886943	total: 4.46s	remaining: 926ms
828:	learn: 293.2452818	total: 4.47s	remaining: 922ms
829:	learn: 293.2224120	total: 4.48s	remaining: 918ms
830:	learn: 293.2218047	total: 4.49s	remaining: 913ms
831:	learn: 293.2080476	total: 4.5s	remaining: 908ms
832:	learn: 293.1972794	total: 4.51s	remaining: 903ms
833:	learn: 293.1966806	total: 4.51s	remaining: 898ms
834:	learn: 293.1943501	total: 4.52s	remaining: 892ms
835:	learn: 293.1811378	total: 4.52s	remaining: 887ms
836:	learn: 293.1602135	total: 4.53s	remaining: 882ms
837:	learn: 293.1274760	total: 4.54s	remaining: 877ms
838:	learn: 293.1255426	total: 4.54s	remaining: 872ms
839:	learn: 293.1202139	total: 4.56s	remaining: 868ms
840:	learn: 293.0726459	total: 4.57s	remaining: 865ms
841:	learn: 293.0466055	total: 4.58s	remaining: 860ms
842:	learn: 293.0367474	total: 4.59s	remaining: 855ms
843:	learn: 292.9781635	total: 4.6s	remaining: 850ms
844:	learn: 292.9635143	total: 4.61s	remaining: 845ms
845:	learn: 292.9412215	total:

995:	learn: 290.9195057	total: 5.63s	remaining: 22.6ms
996:	learn: 290.9108381	total: 5.64s	remaining: 17ms
997:	learn: 290.8750448	total: 5.65s	remaining: 11.3ms
998:	learn: 290.8695542	total: 5.66s	remaining: 5.66ms
999:	learn: 290.8612196	total: 5.66s	remaining: 0us
0:	learn: 559.4979738	total: 3.7ms	remaining: 3.7s
1:	learn: 550.9790112	total: 8.39ms	remaining: 4.19s
2:	learn: 543.2702553	total: 11.3ms	remaining: 3.75s
3:	learn: 535.1415088	total: 15.2ms	remaining: 3.8s
4:	learn: 527.2962561	total: 19.7ms	remaining: 3.92s
5:	learn: 520.5822198	total: 22.6ms	remaining: 3.74s
6:	learn: 513.5694323	total: 27.3ms	remaining: 3.88s
7:	learn: 507.3851834	total: 31ms	remaining: 3.84s
8:	learn: 501.5412882	total: 35.1ms	remaining: 3.86s
9:	learn: 495.1134075	total: 38.5ms	remaining: 3.81s
10:	learn: 489.2020213	total: 41ms	remaining: 3.69s
11:	learn: 483.5324351	total: 44.7ms	remaining: 3.68s
12:	learn: 477.8815463	total: 49ms	remaining: 3.72s
13:	learn: 473.1278643	total: 54ms	remaining: 3

150:	learn: 342.8485007	total: 712ms	remaining: 4s
151:	learn: 342.5756199	total: 721ms	remaining: 4.02s
152:	learn: 342.4500750	total: 731ms	remaining: 4.05s
153:	learn: 342.1449082	total: 736ms	remaining: 4.04s
154:	learn: 342.0066453	total: 741ms	remaining: 4.04s
155:	learn: 341.8771891	total: 746ms	remaining: 4.04s
156:	learn: 341.7354010	total: 751ms	remaining: 4.03s
157:	learn: 341.6471542	total: 756ms	remaining: 4.03s
158:	learn: 341.5111398	total: 764ms	remaining: 4.04s
159:	learn: 341.2985195	total: 770ms	remaining: 4.04s
160:	learn: 341.1319373	total: 776ms	remaining: 4.04s
161:	learn: 340.9886338	total: 785ms	remaining: 4.06s
162:	learn: 340.8875652	total: 789ms	remaining: 4.05s
163:	learn: 340.6972908	total: 798ms	remaining: 4.07s
164:	learn: 340.5566528	total: 804ms	remaining: 4.07s
165:	learn: 340.4838357	total: 813ms	remaining: 4.09s
166:	learn: 340.3062130	total: 820ms	remaining: 4.09s
167:	learn: 340.1510146	total: 828ms	remaining: 4.1s
168:	learn: 340.0042385	total: 8

304:	learn: 326.1410579	total: 1.71s	remaining: 3.89s
305:	learn: 326.0150741	total: 1.72s	remaining: 3.89s
306:	learn: 325.8582949	total: 1.72s	remaining: 3.89s
307:	learn: 325.8370051	total: 1.73s	remaining: 3.89s
308:	learn: 325.7114492	total: 1.73s	remaining: 3.88s
309:	learn: 325.6089135	total: 1.74s	remaining: 3.87s
310:	learn: 325.4474648	total: 1.74s	remaining: 3.87s
311:	learn: 325.3342403	total: 1.75s	remaining: 3.86s
312:	learn: 325.2534963	total: 1.75s	remaining: 3.85s
313:	learn: 325.0526499	total: 1.76s	remaining: 3.84s
314:	learn: 324.9104175	total: 1.76s	remaining: 3.83s
315:	learn: 324.8458083	total: 1.77s	remaining: 3.82s
316:	learn: 324.8076452	total: 1.77s	remaining: 3.81s
317:	learn: 324.6838991	total: 1.77s	remaining: 3.81s
318:	learn: 324.6401226	total: 1.78s	remaining: 3.8s
319:	learn: 324.5705464	total: 1.78s	remaining: 3.79s
320:	learn: 324.4819560	total: 1.79s	remaining: 3.78s
321:	learn: 324.4363579	total: 1.79s	remaining: 3.77s
322:	learn: 324.3424140	total

488:	learn: 312.7218574	total: 2.68s	remaining: 2.81s
489:	learn: 312.6724400	total: 2.69s	remaining: 2.8s
490:	learn: 312.6297619	total: 2.7s	remaining: 2.8s
491:	learn: 312.6055748	total: 2.71s	remaining: 2.8s
492:	learn: 312.5322446	total: 2.72s	remaining: 2.79s
493:	learn: 312.4661498	total: 2.72s	remaining: 2.79s
494:	learn: 312.4540769	total: 2.72s	remaining: 2.78s
495:	learn: 312.4136884	total: 2.73s	remaining: 2.77s
496:	learn: 312.3480576	total: 2.73s	remaining: 2.77s
497:	learn: 312.2899096	total: 2.74s	remaining: 2.76s
498:	learn: 312.2699744	total: 2.74s	remaining: 2.75s
499:	learn: 312.2598923	total: 2.75s	remaining: 2.75s
500:	learn: 312.2480288	total: 2.75s	remaining: 2.74s
501:	learn: 312.2418617	total: 2.76s	remaining: 2.73s
502:	learn: 312.1197621	total: 2.76s	remaining: 2.73s
503:	learn: 312.0836320	total: 2.77s	remaining: 2.72s
504:	learn: 312.0547980	total: 2.77s	remaining: 2.72s
505:	learn: 311.9945042	total: 2.77s	remaining: 2.71s
506:	learn: 311.9597763	total: 2

648:	learn: 306.9951820	total: 3.67s	remaining: 1.99s
649:	learn: 306.9951081	total: 3.68s	remaining: 1.98s
650:	learn: 306.9586769	total: 3.69s	remaining: 1.98s
651:	learn: 306.9530586	total: 3.7s	remaining: 1.97s
652:	learn: 306.9398934	total: 3.7s	remaining: 1.97s
653:	learn: 306.9286185	total: 3.71s	remaining: 1.96s
654:	learn: 306.8615034	total: 3.71s	remaining: 1.96s
655:	learn: 306.8233570	total: 3.72s	remaining: 1.95s
656:	learn: 306.7924134	total: 3.72s	remaining: 1.94s
657:	learn: 306.7786811	total: 3.73s	remaining: 1.94s
658:	learn: 306.7744803	total: 3.73s	remaining: 1.93s
659:	learn: 306.7585636	total: 3.74s	remaining: 1.93s
660:	learn: 306.7390869	total: 3.74s	remaining: 1.92s
661:	learn: 306.6993956	total: 3.75s	remaining: 1.91s
662:	learn: 306.6846436	total: 3.75s	remaining: 1.91s
663:	learn: 306.6629291	total: 3.76s	remaining: 1.9s
664:	learn: 306.6399322	total: 3.76s	remaining: 1.9s
665:	learn: 306.6295135	total: 3.77s	remaining: 1.89s
666:	learn: 306.5850871	total: 3

829:	learn: 303.1791672	total: 4.65s	remaining: 952ms
830:	learn: 303.1711959	total: 4.65s	remaining: 946ms
831:	learn: 303.1145078	total: 4.66s	remaining: 942ms
832:	learn: 303.1099097	total: 4.67s	remaining: 937ms
833:	learn: 303.0949323	total: 4.68s	remaining: 932ms
834:	learn: 303.0728144	total: 4.68s	remaining: 926ms
835:	learn: 303.0409572	total: 4.7s	remaining: 921ms
836:	learn: 303.0290157	total: 4.7s	remaining: 915ms
837:	learn: 303.0260362	total: 4.7s	remaining: 909ms
838:	learn: 303.0052955	total: 4.71s	remaining: 904ms
839:	learn: 302.9860128	total: 4.71s	remaining: 898ms
840:	learn: 302.9389961	total: 4.72s	remaining: 893ms
841:	learn: 302.9267166	total: 4.73s	remaining: 887ms
842:	learn: 302.8971137	total: 4.74s	remaining: 882ms
843:	learn: 302.8864741	total: 4.74s	remaining: 877ms
844:	learn: 302.8635884	total: 4.75s	remaining: 871ms
845:	learn: 302.8599481	total: 4.75s	remaining: 865ms
846:	learn: 302.8019805	total: 4.76s	remaining: 859ms
847:	learn: 302.7884551	total: 

992:	learn: 300.6400173	total: 5.62s	remaining: 39.6ms
993:	learn: 300.6373819	total: 5.63s	remaining: 34ms
994:	learn: 300.5874758	total: 5.63s	remaining: 28.3ms
995:	learn: 300.5640521	total: 5.64s	remaining: 22.7ms
996:	learn: 300.5468769	total: 5.65s	remaining: 17ms
997:	learn: 300.5447970	total: 5.66s	remaining: 11.3ms
998:	learn: 300.5409400	total: 5.66s	remaining: 5.67ms
999:	learn: 300.5150806	total: 5.67s	remaining: 0us
0:	learn: 560.6206445	total: 3.43ms	remaining: 3.43s
1:	learn: 552.8616956	total: 9.29ms	remaining: 4.64s
2:	learn: 545.4910160	total: 14.6ms	remaining: 4.85s
3:	learn: 537.4945137	total: 18.7ms	remaining: 4.65s
4:	learn: 529.6300534	total: 23ms	remaining: 4.58s
5:	learn: 521.9252711	total: 27.7ms	remaining: 4.58s
6:	learn: 515.3714213	total: 31.4ms	remaining: 4.46s
7:	learn: 509.0297554	total: 34.4ms	remaining: 4.27s
8:	learn: 502.9170257	total: 38.7ms	remaining: 4.27s
9:	learn: 497.2840049	total: 41.5ms	remaining: 4.1s
10:	learn: 492.1109208	total: 46.8ms	rem

162:	learn: 324.2885747	total: 906ms	remaining: 4.65s
163:	learn: 323.8888844	total: 913ms	remaining: 4.65s
164:	learn: 323.6350246	total: 921ms	remaining: 4.66s
165:	learn: 323.4136936	total: 932ms	remaining: 4.68s
166:	learn: 323.1534095	total: 937ms	remaining: 4.67s
167:	learn: 323.0796871	total: 942ms	remaining: 4.66s
168:	learn: 322.9773630	total: 947ms	remaining: 4.66s
169:	learn: 322.8716866	total: 952ms	remaining: 4.65s
170:	learn: 322.5414768	total: 957ms	remaining: 4.64s
171:	learn: 322.3514370	total: 962ms	remaining: 4.63s
172:	learn: 322.1079338	total: 967ms	remaining: 4.62s
173:	learn: 321.9599888	total: 973ms	remaining: 4.62s
174:	learn: 321.8987859	total: 977ms	remaining: 4.61s
175:	learn: 321.6740259	total: 982ms	remaining: 4.6s
176:	learn: 321.5586761	total: 986ms	remaining: 4.58s
177:	learn: 321.4246930	total: 990ms	remaining: 4.57s
178:	learn: 321.1864962	total: 995ms	remaining: 4.56s
179:	learn: 321.0074904	total: 999ms	remaining: 4.55s
180:	learn: 320.9469155	total

324:	learn: 304.5940243	total: 1.68s	remaining: 3.5s
325:	learn: 304.2920254	total: 1.69s	remaining: 3.5s
326:	learn: 304.2546395	total: 1.7s	remaining: 3.5s
327:	learn: 304.1461682	total: 1.71s	remaining: 3.5s
328:	learn: 304.0266660	total: 1.71s	remaining: 3.49s
329:	learn: 303.8933999	total: 1.72s	remaining: 3.49s
330:	learn: 303.8518832	total: 1.72s	remaining: 3.48s
331:	learn: 303.8194995	total: 1.73s	remaining: 3.48s
332:	learn: 303.7298878	total: 1.73s	remaining: 3.47s
333:	learn: 303.6939227	total: 1.74s	remaining: 3.46s
334:	learn: 303.6429778	total: 1.74s	remaining: 3.45s
335:	learn: 303.4327323	total: 1.74s	remaining: 3.45s
336:	learn: 303.3406835	total: 1.75s	remaining: 3.44s
337:	learn: 303.3014281	total: 1.75s	remaining: 3.43s
338:	learn: 303.1315726	total: 1.75s	remaining: 3.42s
339:	learn: 303.1133889	total: 1.76s	remaining: 3.42s
340:	learn: 302.9760464	total: 1.76s	remaining: 3.41s
341:	learn: 302.8589990	total: 1.77s	remaining: 3.4s
342:	learn: 302.7754400	total: 1.7

484:	learn: 293.0967494	total: 2.47s	remaining: 2.62s
485:	learn: 293.0749010	total: 2.48s	remaining: 2.62s
486:	learn: 293.0424178	total: 2.48s	remaining: 2.62s
487:	learn: 292.9846929	total: 2.49s	remaining: 2.61s
488:	learn: 292.9800042	total: 2.49s	remaining: 2.6s
489:	learn: 292.9770001	total: 2.5s	remaining: 2.6s
490:	learn: 292.8683448	total: 2.5s	remaining: 2.59s
491:	learn: 292.8601249	total: 2.5s	remaining: 2.59s
492:	learn: 292.8583038	total: 2.51s	remaining: 2.58s
493:	learn: 292.7893549	total: 2.51s	remaining: 2.57s
494:	learn: 292.7840224	total: 2.52s	remaining: 2.57s
495:	learn: 292.7735660	total: 2.52s	remaining: 2.56s
496:	learn: 292.7706812	total: 2.52s	remaining: 2.55s
497:	learn: 292.7462514	total: 2.53s	remaining: 2.55s
498:	learn: 292.7461876	total: 2.53s	remaining: 2.54s
499:	learn: 292.6932910	total: 2.53s	remaining: 2.53s
500:	learn: 292.6701666	total: 2.54s	remaining: 2.53s
501:	learn: 292.6116420	total: 2.54s	remaining: 2.52s
502:	learn: 292.5097105	total: 2.

650:	learn: 286.1601770	total: 3.25s	remaining: 1.74s
651:	learn: 286.1187781	total: 3.26s	remaining: 1.74s
652:	learn: 286.0845681	total: 3.27s	remaining: 1.74s
653:	learn: 286.0792394	total: 3.27s	remaining: 1.73s
654:	learn: 286.0583869	total: 3.28s	remaining: 1.73s
655:	learn: 285.9751704	total: 3.28s	remaining: 1.72s
656:	learn: 285.9140022	total: 3.29s	remaining: 1.72s
657:	learn: 285.8198879	total: 3.29s	remaining: 1.71s
658:	learn: 285.7711893	total: 3.29s	remaining: 1.7s
659:	learn: 285.7116666	total: 3.3s	remaining: 1.7s
660:	learn: 285.6330532	total: 3.3s	remaining: 1.69s
661:	learn: 285.5842894	total: 3.31s	remaining: 1.69s
662:	learn: 285.5633490	total: 3.31s	remaining: 1.68s
663:	learn: 285.5586911	total: 3.31s	remaining: 1.68s
664:	learn: 285.5306905	total: 3.32s	remaining: 1.67s
665:	learn: 285.5196035	total: 3.32s	remaining: 1.67s
666:	learn: 285.5135577	total: 3.33s	remaining: 1.66s
667:	learn: 285.4454247	total: 3.33s	remaining: 1.66s
668:	learn: 285.3976919	total: 3

830:	learn: 281.4588344	total: 4.01s	remaining: 816ms
831:	learn: 281.4452345	total: 4.02s	remaining: 812ms
832:	learn: 281.3954922	total: 4.03s	remaining: 808ms
833:	learn: 281.3607098	total: 4.04s	remaining: 803ms
834:	learn: 281.3574229	total: 4.04s	remaining: 799ms
835:	learn: 281.3181249	total: 4.04s	remaining: 794ms
836:	learn: 281.3115876	total: 4.05s	remaining: 789ms
837:	learn: 281.2560258	total: 4.05s	remaining: 784ms
838:	learn: 281.2469123	total: 4.06s	remaining: 779ms
839:	learn: 281.2092272	total: 4.06s	remaining: 774ms
840:	learn: 281.1739302	total: 4.07s	remaining: 769ms
841:	learn: 281.1739200	total: 4.07s	remaining: 764ms
842:	learn: 281.1689830	total: 4.07s	remaining: 759ms
843:	learn: 281.1576808	total: 4.08s	remaining: 754ms
844:	learn: 281.1572989	total: 4.08s	remaining: 748ms
845:	learn: 281.1446185	total: 4.08s	remaining: 743ms
846:	learn: 281.1446110	total: 4.08s	remaining: 738ms
847:	learn: 281.1058041	total: 4.09s	remaining: 733ms
848:	learn: 281.1057947	tota

4:	learn: 533.7136720	total: 26ms	remaining: 5.17s
5:	learn: 525.7514583	total: 35.3ms	remaining: 5.85s
6:	learn: 518.5755199	total: 44.1ms	remaining: 6.26s
7:	learn: 511.4912598	total: 49.5ms	remaining: 6.14s
8:	learn: 504.4614864	total: 54.6ms	remaining: 6.01s
9:	learn: 497.8676350	total: 60.6ms	remaining: 6s
10:	learn: 491.4598223	total: 65.4ms	remaining: 5.88s
11:	learn: 485.2676586	total: 69.9ms	remaining: 5.75s
12:	learn: 479.1700590	total: 74.8ms	remaining: 5.68s
13:	learn: 473.7512762	total: 80.3ms	remaining: 5.65s
14:	learn: 468.7352114	total: 84.7ms	remaining: 5.56s
15:	learn: 464.0027444	total: 88.8ms	remaining: 5.46s
16:	learn: 459.0353839	total: 93.8ms	remaining: 5.42s
17:	learn: 454.1822993	total: 99ms	remaining: 5.4s
18:	learn: 449.6395660	total: 104ms	remaining: 5.36s
19:	learn: 445.6375278	total: 109ms	remaining: 5.35s
20:	learn: 441.7112234	total: 113ms	remaining: 5.28s
21:	learn: 437.4609536	total: 119ms	remaining: 5.27s
22:	learn: 433.8970112	total: 124ms	remaining:

193:	learn: 329.5574812	total: 1.01s	remaining: 4.21s
194:	learn: 329.4057531	total: 1.02s	remaining: 4.22s
195:	learn: 329.3971169	total: 1.03s	remaining: 4.23s
196:	learn: 329.2979137	total: 1.04s	remaining: 4.24s
197:	learn: 329.2186060	total: 1.04s	remaining: 4.23s
198:	learn: 329.0960015	total: 1.05s	remaining: 4.23s
199:	learn: 329.0179222	total: 1.05s	remaining: 4.22s
200:	learn: 328.9388213	total: 1.06s	remaining: 4.21s
201:	learn: 328.8404438	total: 1.06s	remaining: 4.2s
202:	learn: 328.7387763	total: 1.07s	remaining: 4.2s
203:	learn: 328.7020293	total: 1.07s	remaining: 4.19s
204:	learn: 328.5650995	total: 1.08s	remaining: 4.18s
205:	learn: 328.3911047	total: 1.08s	remaining: 4.18s
206:	learn: 328.3171720	total: 1.09s	remaining: 4.17s
207:	learn: 328.2818134	total: 1.09s	remaining: 4.16s
208:	learn: 328.2536757	total: 1.1s	remaining: 4.15s
209:	learn: 328.2178314	total: 1.1s	remaining: 4.14s
210:	learn: 328.0806617	total: 1.11s	remaining: 4.14s
211:	learn: 328.0338522	total: 1

372:	learn: 315.0483058	total: 1.99s	remaining: 3.35s
373:	learn: 315.0255567	total: 2s	remaining: 3.35s
374:	learn: 314.9616673	total: 2.01s	remaining: 3.35s
375:	learn: 314.9062971	total: 2.02s	remaining: 3.35s
376:	learn: 314.7242348	total: 2.02s	remaining: 3.35s
377:	learn: 314.6152539	total: 2.03s	remaining: 3.34s
378:	learn: 314.4573477	total: 2.04s	remaining: 3.33s
379:	learn: 314.3737362	total: 2.04s	remaining: 3.33s
380:	learn: 314.3538027	total: 2.05s	remaining: 3.32s
381:	learn: 314.2863336	total: 2.05s	remaining: 3.32s
382:	learn: 314.1802105	total: 2.06s	remaining: 3.31s
383:	learn: 314.1032636	total: 2.06s	remaining: 3.31s
384:	learn: 313.9294446	total: 2.07s	remaining: 3.3s
385:	learn: 313.9090752	total: 2.07s	remaining: 3.29s
386:	learn: 313.8842308	total: 2.08s	remaining: 3.29s
387:	learn: 313.8506096	total: 2.08s	remaining: 3.28s
388:	learn: 313.7224102	total: 2.08s	remaining: 3.28s
389:	learn: 313.6726608	total: 2.09s	remaining: 3.27s
390:	learn: 313.6646152	total: 2

551:	learn: 306.3520727	total: 2.99s	remaining: 2.42s
552:	learn: 306.3289409	total: 3s	remaining: 2.43s
553:	learn: 306.2877815	total: 3.02s	remaining: 2.43s
554:	learn: 306.2815303	total: 3.03s	remaining: 2.43s
555:	learn: 306.2336568	total: 3.05s	remaining: 2.43s
556:	learn: 306.2085649	total: 3.05s	remaining: 2.43s
557:	learn: 306.2008748	total: 3.06s	remaining: 2.42s
558:	learn: 306.0472237	total: 3.06s	remaining: 2.42s
559:	learn: 306.0426678	total: 3.07s	remaining: 2.41s
560:	learn: 306.0069712	total: 3.07s	remaining: 2.4s
561:	learn: 305.9951196	total: 3.08s	remaining: 2.4s
562:	learn: 305.8771715	total: 3.08s	remaining: 2.39s
563:	learn: 305.8070827	total: 3.09s	remaining: 2.39s
564:	learn: 305.7568422	total: 3.09s	remaining: 2.38s
565:	learn: 305.7279249	total: 3.1s	remaining: 2.38s
566:	learn: 305.6858902	total: 3.1s	remaining: 2.37s
567:	learn: 305.6388474	total: 3.11s	remaining: 2.36s
568:	learn: 305.6033232	total: 3.11s	remaining: 2.36s
569:	learn: 305.5985918	total: 3.12

713:	learn: 301.8382298	total: 3.96s	remaining: 1.59s
714:	learn: 301.8258410	total: 3.98s	remaining: 1.58s
715:	learn: 301.8202624	total: 3.99s	remaining: 1.58s
716:	learn: 301.7962533	total: 3.99s	remaining: 1.57s
717:	learn: 301.7919543	total: 4s	remaining: 1.57s
718:	learn: 301.7910539	total: 4s	remaining: 1.56s
719:	learn: 301.7653739	total: 4.01s	remaining: 1.56s
720:	learn: 301.7623017	total: 4.01s	remaining: 1.55s
721:	learn: 301.7398858	total: 4.02s	remaining: 1.55s
722:	learn: 301.7390578	total: 4.02s	remaining: 1.54s
723:	learn: 301.7311464	total: 4.03s	remaining: 1.53s
724:	learn: 301.7293330	total: 4.03s	remaining: 1.53s
725:	learn: 301.7169529	total: 4.04s	remaining: 1.52s
726:	learn: 301.6296523	total: 4.04s	remaining: 1.52s
727:	learn: 301.5947257	total: 4.05s	remaining: 1.51s
728:	learn: 301.5823226	total: 4.05s	remaining: 1.51s
729:	learn: 301.5772640	total: 4.06s	remaining: 1.5s
730:	learn: 301.4872835	total: 4.06s	remaining: 1.49s
731:	learn: 301.4738444	total: 4.07

899:	learn: 298.5441434	total: 4.95s	remaining: 549ms
900:	learn: 298.5390164	total: 4.95s	remaining: 544ms
901:	learn: 298.5346576	total: 4.96s	remaining: 539ms
902:	learn: 298.4726720	total: 4.97s	remaining: 534ms
903:	learn: 298.4652141	total: 4.98s	remaining: 529ms
904:	learn: 298.4633021	total: 4.98s	remaining: 523ms
905:	learn: 298.4570321	total: 4.99s	remaining: 518ms
906:	learn: 298.4327903	total: 4.99s	remaining: 512ms
907:	learn: 298.4314131	total: 5s	remaining: 507ms
908:	learn: 298.3889342	total: 5s	remaining: 501ms
909:	learn: 298.3885637	total: 5.01s	remaining: 495ms
910:	learn: 298.3558809	total: 5.01s	remaining: 490ms
911:	learn: 298.3503116	total: 5.02s	remaining: 484ms
912:	learn: 298.3502730	total: 5.02s	remaining: 479ms
913:	learn: 298.3493800	total: 5.03s	remaining: 473ms
914:	learn: 298.3251653	total: 5.03s	remaining: 467ms
915:	learn: 298.3235213	total: 5.04s	remaining: 462ms
916:	learn: 298.3137869	total: 5.04s	remaining: 456ms
917:	learn: 298.3100616	total: 5.0

In [None]:
create_submission("all_data_sub.csv", preds,constant=701.4750)

In [22]:
train = get_train()

(5735, 147)


In [27]:
def cross_validate(train, n_folds = 5)
    k_fold = KFold(n_splits=n_folds)
    for tr_ix, test_ix in k_fold.split(train):
        train_cv = train.iloc[tr_ix]
        test_cv = train[test_ix]
        test_cv, y_test = get_cleaned_test(test_cv)
        test_preds = get_prediction(train_cv, test_cv)
    

(4301, 147)
(1434, 147)
(4301, 147)
(1434, 147)
(4301, 147)
(1434, 147)
(4302, 147)
(1433, 147)


In [29]:
1434+4301

5735

In [38]:
preds

array([[3.4345138 ],
       [2.7221231 ],
       [3.2070298 ],
       ...,
       [2.825615  ],
       [0.79803133],
       [1.1112623 ]], dtype=float32)

In [None]:
def cross_validate(tra)