### Part of Features

#### Libraries

In [9]:
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from math import sqrt
from catboost import CatBoostRegressor, Pool
import pickle

# Raise pandas' 'display.max_columns' option to 500 so wide DataFrames are
# shown with all their columns instead of being elided.
pd.set_option('display.max_columns', 500)

#### Load Data

In [2]:
# Absolute locations used by the cells below.
# NOTE(review): these are hard-coded to one user's home directory and must be
# adjusted before the notebook is run on another machine.

# Raw files: train.csv and test.csv are read from here.
data_dir = '/Users/konstantinivanov/Documents/work/PIK/files-pik_digital_day'
# Prepared CSVs (full train, test, per-fold splits) and the CatBoost
# column-description file live here.
input_dir = '/Users/konstantinivanov/Documents/work/PIK/input'
# Not referenced by any of the visible cells.
output_dir = '/Users/konstantinivanov/Documents/work/PIK/output'
# Submission CSVs are written here by write_ans.
sub_dir = '/Users/konstantinivanov/Documents/work/PIK/sub'

In [3]:
# Training set: read the raw file, discard the columns that are not kept for
# modelling, and cache the trimmed frame for the later cells.
train_data = pd.read_csv(data_dir + "/train.csv").drop(
    ['start_square', 'plan_s', 'plan_m', 'plan_l', 'vid_0', 'vid_1', 'vid_2'],
    axis=1,
)
train_data.to_csv(input_dir + "/train_full.csv", index=False)

# Test set: add a placeholder 'value' column (all zeros) and write exactly the
# columns the trimmed training frame has, in the same order.
test_data = pd.read_csv(data_dir + "/test.csv").assign(value=0)
test_data.to_csv(
    input_dir + "/test_.csv",
    columns=train_data.columns,
    index=False,
)

#### Validation

In [4]:
# Reload the trimmed training set that the load cell cached as train_full.csv.
train_data = pd.read_csv(input_dir + "/train_full.csv")

# Number of cross-validation folds; also read by FitOnKFolds and used in the
# submission file name.
K = 20

# Shuffled split with a fixed seed so the fold files are reproducible.
kf = KFold(n_splits=K, random_state=42, shuffle=True)

print(kf)

# KFold.split yields *positional* row indices, so rows are selected with
# .iloc. (Label-based .loc gives the same rows only while the frame carries
# a default 0..n-1 index.)
for i, (train_index, valid_index) in enumerate(kf.split(train_data)):
    train = train_data.iloc[train_index]
    valid = train_data.iloc[valid_index]

    # One train/validation CSV pair per fold; FitOnKFolds reads them back.
    train.to_csv(input_dir + f"/train_{i}.csv", index=False)
    valid.to_csv(input_dir + f"/valid_{i}.csv", index=False)

KFold(n_splits=20, random_state=42, shuffle=True)


#### Fit and Predict

In [6]:
# Prepared test CSV (written by the load cell) and the column-description
# file that is handed to every CatBoost Pool in this notebook.
TEST_FILE = input_dir + '/test_.csv'
CD_FILE = input_dir + '/train_part_feat.txt'

# Test pool is built once here and reused for every fold's prediction.
test_pool = Pool(
    TEST_FILE,
    column_description=CD_FILE,
    has_header=True,
    delimiter=",",
)

In [7]:
# Load the prepared test CSV as a DataFrame; FitOnKFolds uses its length to
# size and reshape the test-prediction buffer.
test_data = pd.read_csv(TEST_FILE)

In [8]:
# CatBoost hyper-parameters. iterations, learning_rate and depth are also
# embedded in the submission file name by the driver cell.
iterations = 5000
learning_rate = 0.05
depth = 8
random_seed = 42

# Models to evaluate; the driver cell runs FitOnKFolds on each entry.
Regressors = [
    CatBoostRegressor(
        iterations=iterations,
        learning_rate=learning_rate,
        depth=depth,
        random_seed=random_seed,
    ),
]

In [10]:
def FitOnKFolds(model):
    """Fit `model` on each of the K pre-written folds and aggregate predictions.

    For every fold ``i`` the function builds CatBoost pools from
    ``train_{i}.csv`` and ``valid_{i}.csv``, calls ``model.fit`` with the
    validation pool as ``eval_set``, pickles the fitted model, and predicts
    both the shared test pool and the fold's validation rows.

    Relies on module-level state: ``K``, ``input_dir``, ``CD_FILE``,
    ``test_pool``, ``test_data`` and ``train_data``.

    Args:
        model: Object exposing ``fit(pool, eval_set=...)`` and
            ``predict(pool)``; the same object is fitted again for every fold.

    Returns:
        A tuple ``(y_test, rmse)``. ``y_test`` is a 1-D array of test
        predictions averaged over the K folds and clipped to the
        integer-truncated min/max of ``train_data.value``. ``rmse`` is the
        root-mean-squared error between ``train_data['value']`` and the
        out-of-fold predictions.
    """
    n_test = len(test_data)
    cum_y_test = np.zeros((n_test, 1))
    pred_y_train = np.zeros((len(train_data), 1))

    for i in range(K):
        TRAIN_FILE = input_dir + f'/train_{i}.csv'
        VAL_FILE = input_dir + f'/valid_{i}.csv'

        train_pool = Pool(TRAIN_FILE, column_description=CD_FILE, has_header=True, delimiter=",")
        val_pool = Pool(VAL_FILE, column_description=CD_FILE, has_header=True, delimiter=",")

        model.fit(train_pool, eval_set=val_pool)

        # Persist the fold's model; the context manager guarantees the file
        # handle is flushed and closed even if pickling fails.
        # NOTE(review): there is no '/' between input_dir and the file name,
        # so the file lands next to the input directory rather than inside
        # it. The path is kept as-is so previously saved models stay
        # loadable from the same location - confirm before changing.
        with open(input_dir + f'mod_part_feat_{i}.sav', 'wb') as model_file:
            pickle.dump(model, model_file)

        # Accumulate test predictions; they are averaged over K on return.
        cum_y_test += model.predict(test_pool).reshape((n_test, 1))

        # Store this fold's validation predictions at the rows named by the
        # 'id' column.
        # assumes 'id' holds 0-based row positions of train_data - TODO confirm
        df = pd.read_csv(VAL_FILE)
        pred_y_train[df.id] = model.predict(val_pool).reshape((len(df), 1))

    # Keep averaged predictions inside the target range seen in training.
    lower = int(min(train_data.value))
    upper = int(max(train_data.value))
    y_test = (cum_y_test / K).clip(lower, upper).reshape(-1)

    rmse = sqrt(mean_squared_error(list(train_data['value']), list(pred_y_train)))
    return y_test, rmse

In [None]:
def write_ans(fl_name, y_test):
    """Save predictions as a two-column submission CSV.

    Writes ``<sub_dir>/<fl_name>.csv`` (``sub_dir`` is a module-level path)
    with an ``id`` column numbered ``0 .. len(y_test) - 1`` and a ``value``
    column holding the items of ``y_test``; the DataFrame index is not
    written.

    Args:
        fl_name: File name without directory or extension.
        y_test: Sized iterable of predicted values.

    Returns:
        A status message string that includes ``fl_name``.
    """
    columns = {
        'id': list(range(len(y_test))),
        'value': list(y_test),
    }
    target_path = sub_dir + '/' + fl_name + '.csv'
    pd.DataFrame.from_dict(columns).to_csv(target_path, index=False)
    return 'Answer /%s succesfully writed' % fl_name

In [None]:
%%time

# One out-of-fold RMSE per entry of Regressors, in the same order.
vec_qual = []

for itr, model in enumerate(Regressors):
    # Cross-validate the model and get its fold-averaged test predictions.
    y_test, qual = FitOnKFolds(model)
    vec_qual.append(qual)

    # The submission name records the hyper-parameters, the fold count and
    # the model's position in Regressors.
    sub_name = f'subsub_ctb_itr={iterations}_lr={learning_rate}_depth={depth}_fld={K}_{itr}vers'
    write_ans(sub_name, y_test)

In [None]:
# Show the out-of-fold RMSE collected for each entry of Regressors.
print(vec_qual)
# Example (disabled): reload the model pickled for fold i, using the same path
# expression FitOnKFolds uses when saving.
#model = pickle.load(open(input_dir + f'mod_part_feat_{i}.sav', 'rb'))