In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from pathlib import Path
from dotenv import load_dotenv

from src.metrics import pearson_metric
from src.feature_engineering import compute_kmeans_clusters
from src.augmentations import mixup

# Load environment variables via python-dotenv. `dataset_dir` is read from
# os.environ when the data is loaded — presumably it is defined in a local
# .env file (TODO confirm).
load_dotenv()

True

In [2]:
%%time
# Resolve the dataset location from the environment (KeyError if it is unset),
# then read both CSV files from that directory.
dataset_dir = Path(os.environ['dataset_dir'])
full_data = pd.read_csv(dataset_dir.joinpath('train.csv'))
# The sample file is read with its first column as the index.
sample_data = pd.read_csv(dataset_dir.joinpath('train_sample.csv'), index_col=0)

CPU times: user 2min 38s, sys: 11.7 s, total: 2min 50s
Wall time: 2min 50s


In [3]:
# Frame used by the rest of the notebook.
# NOTE(review): `sample_data` is loaded but not used in the visible cells —
# presumably it can be swapped in here for quicker experiments (confirm).
data = full_data

In [4]:
# Index rows by `row_id`. Guarded so the cell can be re-run safely: once
# `row_id` is the index it is no longer a column, and calling set_index a
# second time would raise KeyError.
if data.index.name != 'row_id':
    data = data.set_index('row_id')

In [5]:
# Time-based split: rows with time_id below 1000 are used for training,
# rows with time_id of 1000 or above are held out for testing.
train = data.loc[data['time_id'] < 1000]
test = data.loc[data['time_id'] >= 1000]

# Features are every column except `target`; the label frames keep `time_id`
# next to `target`.
x_train = train.drop(columns='target')
y_train = train.loc[:, ['time_id', 'target']]
x_test = test.drop(columns='target')
y_test = test.loc[:, ['time_id', 'target']]

In [6]:
# Generate one million augmented samples from the training split with the
# project's `mixup` helper.
x_train_aug, y_train_aug = mixup(
    x_train,
    y_train,
    n_samples=1_000_000,
)

In [7]:
# Stack the original training rows on top of the mixup samples.
# NOTE: this cell overwrites its own inputs, so it is not idempotent —
# re-running it without first re-running the mixup cell prepends the original
# rows a second time.
x_train_aug = pd.concat([x_train, x_train_aug])
y_train_aug = pd.concat([y_train, y_train_aug])

In [8]:
from catboost import CatBoostRegressor, Pool
def evaluate_model(x_train, x_test, y_train, y_test):
    """Fit a CatBoost regressor on the training data and score it on the test data.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Feature frames. Both must contain an ``investment_id`` column, which is
        passed to CatBoost as a categorical feature. A ``time_id`` column, if
        present, is removed before the data reaches the model.
    y_train, y_test : pandas.DataFrame
        Frames exposing a ``target`` column. ``y_test`` is also handed
        unchanged to ``pearson_metric``.

    Returns
    -------
    tuple
        ``(rmse, pearson, model)``: the root-mean-squared error of the test
        predictions, the value returned by ``pearson_metric(y_test, y_pred)``,
        and the fitted ``CatBoostRegressor``.
    """
    cat_features = ['investment_id']

    # `time_id` is excluded from training, so exclude it from the test
    # features as well: both pools then expose the same feature columns.
    # errors='ignore' lets callers pass frames that already lack the column.
    train_features = x_train.drop(columns='time_id', errors='ignore')
    test_features = x_test.drop(columns='time_id', errors='ignore')

    train_pool = Pool(train_features, y_train.target, cat_features=cat_features)

    model = CatBoostRegressor(
        iterations=1000,
        random_state=0,
        cat_features=cat_features,
        verbose=200,
        task_type='GPU',
        devices='0:2',
        grow_policy='SymmetricTree',
        bootstrap_type="Bernoulli",
        max_depth=3,
    )
    model = model.fit(train_pool)

    test_pool = Pool(test_features, cat_features=cat_features)
    # Re-attach the test index so the predictions line up with `y_test` by
    # label in the subtraction below.
    y_pred = pd.DataFrame({'target': model.predict(test_pool)}, index=x_test.index)

    rmse = np.sqrt(np.mean((y_pred.target - y_test.target) ** 2))
    return rmse, pearson_metric(y_test, y_pred), model
    

In [9]:
# Score the model trained on the original rows plus the mixup samples.
rmse, pearson, model = evaluate_model(
    x_train=x_train_aug,
    x_test=x_test,
    y_train=y_train_aug,
    y_test=y_test,
)
print(f"RMSE: {rmse}, Pearson: {pearson}")

Learning rate set to 0.117352
0:	learn: 0.9233214	total: 32.8ms	remaining: 32.8s
200:	learn: 0.9140832	total: 8.91s	remaining: 35.4s
400:	learn: 0.9112620	total: 19.9s	remaining: 29.8s
600:	learn: 0.9091453	total: 31.8s	remaining: 21.1s
800:	learn: 0.9074304	total: 42s	remaining: 10.4s
999:	learn: 0.9059772	total: 50.8s	remaining: 0us
RMSE: 0.8975728572393166, Pearson: 0.12566607879912992


In [10]:
# Baseline: the same model trained on the original (non-augmented) rows only.
# Note that this overwrites `rmse`, `pearson` and `model` from the augmented run.
rmse, pearson, model = evaluate_model(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)
print(f"RMSE: {rmse}, Pearson: {pearson}")

Learning rate set to 0.112177
0:	learn: 0.9234837	total: 23.8ms	remaining: 23.8s
200:	learn: 0.9143276	total: 7.75s	remaining: 30.8s
400:	learn: 0.9116920	total: 14.5s	remaining: 21.7s
600:	learn: 0.9096672	total: 24.1s	remaining: 16s
800:	learn: 0.9080123	total: 32.4s	remaining: 8.05s
999:	learn: 0.9066162	total: 40.8s	remaining: 0us
RMSE: 0.8975013110027095, Pearson: 0.12520946281322876
