In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor, Pool
from tqdm import tqdm

import os
# Move to the parent directory so the relative 'data/performance_forecasting'
# paths used below resolve. A bare os.chdir('..') is not idempotent: every
# re-run of this cell would climb one more level. Only move when the data
# directory is not already reachable from the current working directory.
if not os.path.isdir('data/performance_forecasting'):
    os.chdir('..')

In [None]:
# List the dataset affixes available on disk, i.e. the <affix> part of every
# 'train_input_<affix>.<ext>' file name in the data directory.
#
# str.strip() removes a *set of characters* from both ends rather than a
# prefix/suffix, so the previous .strip('train_input').strip('.npy') could eat
# into the affix itself (an affix ending in 'n', 'p' or 'y' lost characters
# next to '.npy') and never removed a '.csv' extension. Slice the prefix off
# and let os.path.splitext drop whatever extension the file carries.
# The set + sorted() gives one entry per affix in a reproducible order.
sorted({
    os.path.splitext(p)[0][len('train_input_'):]
    for p in os.listdir('data/performance_forecasting/')
    if p.startswith('train_input_')
})

In [None]:
class Dataset():
    """Train/test/val input and output frames for one dataset variant.

    For every split in ('train', 'test', 'val') two CSV files are read with
    the first column as index:

        {data_dir}/{split}_input_{dataset_affix}.csv   -> x_{split}
        {data_dir}/{split}_output_{dataset_affix}.csv  -> y_{split}

    Parameters
    ----------
    dataset_affix : str
        Suffix that identifies the dataset variant in the file names.
    data_dir : str, optional
        Directory that holds the CSV files. Defaults to the location that was
        previously hard-coded, so existing calls behave exactly as before.

    Attributes
    ----------
    x_train, y_train, x_test, y_test, x_val, y_val : pandas.DataFrame
    """

    def __init__(self, dataset_affix, data_dir='data/performance_forecasting'):
        frames = {}

        for split in ('train', 'test', 'val'):
            # 'input' files become the x_* frames, 'output' files the y_* frames.
            for role, prefix in (('input', 'x'), ('output', 'y')):
                frames[f'{prefix}_{split}'] = pd.read_csv(
                    f'{data_dir}/{split}_{role}_{dataset_affix}.csv',
                    index_col=0
                )

        self.x_train, self.y_train = frames['x_train'], frames['y_train']
        self.x_test, self.y_test = frames['x_test'], frames['y_test']
        self.x_val, self.y_val = frames['x_val'], frames['y_val']

In [None]:
# Load the train/test/val CSV splits of the selected dataset variant.
dataset = Dataset(dataset_affix='bn_cat_NO10EMA_JFCF_UP21bn')

In [None]:
# Inspect the training targets as loaded from the train_output CSV.
dataset.y_train

In [None]:
# Names of all input columns that contain 'symbol'; later cells drop these
# columns before fitting. The bare name on the last line displays the list.
symbol = list(filter(lambda name: 'symbol' in name, dataset.x_train.columns))
symbol

In [None]:
# Columns handed to CatBoost as categorical features.
cat_inputs = [
    # 'symbol',
    'year',
    'years_since_public',
    'sector',
    'country',
    'industry',
    'exchange',
    'currency',
]

# Pools are built without the symbol column(s): `symbol` is the list of
# symbol-like input columns collected in an earlier cell, and the label frame
# drops its own 'symbol' column.
train_pool = Pool(
    dataset.x_train.drop(symbol, axis=1),
    label=dataset.y_train.drop('symbol', axis=1),
    cat_features=cat_inputs,
)
test_pool = Pool(
    dataset.x_test.drop(symbol, axis=1),
    label=dataset.y_test.drop('symbol', axis=1),
    cat_features=cat_inputs,
)

# Other candidates: 'Quantile', 'LogLinQuantile', 'Poisson'
loss_functions = ['MAE', 'MAPE', 'RMSE']

policy = 'Lossguide'  # alternatives: Depthwise, SymmetricTree


def _scatter_preds_vs_labels(fitted_model, pool, title):
    """Show a predictions-vs-labels scatter plot for one pool.

    Parameters
    ----------
    fitted_model : CatBoostRegressor
        Model that has already been fitted.
    pool : Pool
        Data to predict on; its labels are used for the y axis.
    title : str
        Figure title, so the plots of different losses/splits can be told apart.

    Returns
    -------
    pandas.DataFrame
        Frame with 'preds' and 'labels' columns, sorted by 'labels'.
    """
    frame = pd.DataFrame()
    frame['preds'] = fitted_model.predict(pool)
    # Read the labels through the documented Pool.get_label() accessor.
    frame['labels'] = pool.get_label()
    frame = frame.sort_values(by='labels')

    # Keep the figure in `fig`: assigning it to `plt` would overwrite the
    # matplotlib.pyplot alias imported at the top of the notebook.
    fig = px.scatter(frame, x='preds', y='labels', title=title)
    fig.show()
    return frame


model = None

# Fit one regressor per loss function and plot its fit on both pools.
for loss_name in loss_functions:

    model = CatBoostRegressor(
        iterations=3000, verbose=False, learning_rate=0.01,
        depth=15, l2_leaf_reg=2, loss_function=loss_name,
        grow_policy=policy, max_leaves=55
    )

    # NOTE(review): the test pool doubles as eval_set here while the loaded
    # validation split (dataset.x_val / dataset.y_val) is unused - confirm
    # this is intended, since the test plot below is then not a clean
    # hold-out check.
    model.fit(
        train_pool,
        plot=True,
        eval_set=test_pool
    )

    # `results` ends up holding the frame of the last plot (train), as before.
    results = _scatter_preds_vs_labels(model, test_pool, f'{loss_name} - test')
    results = _scatter_preds_vs_labels(model, train_pool, f'{loss_name} - train')

# # {'learning_rate': [0.01, 0.1],
# grid = {
#     'depth': [8, 15],
#     'l2_leaf_reg': [1, 5, 15]
# }

# grid_search_result = model.grid_search(
#     grid, X=train_pool, plot=True,  cv=3
# )

# Alternative configuration kept for reference. It used to be a bare string
# literal at the end of the cell, which the notebook echoed as cell output.
#     model = CatBoostRegressor(
#         iterations=900, verbose=False, learning_rate=0.01,
#         depth=6, l2_leaf_reg=5, loss_function=loss_name
#     )


In [None]:
#from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Decision-tree regressor tuned with a 4-fold grid search.
dtr = DecisionTreeRegressor(
    # min_samples_leaf=4,
    # min_samples_split=6,
    random_state=1221
)

# NOTE(review): scikit-learn 1.0 renamed the 'mse'/'mae' criteria to
# 'squared_error'/'absolute_error' and 1.2 removed the old names - switch
# them if this notebook is run against a newer scikit-learn.
param_grid = {
    "max_depth": [3],  # , 4, 5, 6, 9, 12
    "criterion": ['mse', 'mae'],  # , 'friedman_mse', 'poisson'
}

# Drop the same `symbol` column list the CatBoost cell drops (collected in an
# earlier cell) instead of only the literal 'symbol' column, so both models
# are fitted on the same set of input columns.
search = GridSearchCV(
    dtr, param_grid,
    cv=4,
    n_jobs=2
).fit(
    dataset.x_train.drop(symbol, axis=1),
    dataset.y_train['next_year_freeCashFlow_bn_scale_tg']
)