In [1]:
from pathlib import Path
import pandas as pd
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

### Data

In [2]:
df = (
    pd.read_csv(Path.cwd() / "data" / "final.csv")
    .sort_values(["_code", "_year", "_month"], ascending=True)
    .reset_index(drop=True)
)

In [3]:
gics = (
    pd.read_csv(Path.cwd() / "data" / "gics_list.csv")
)
gics.columns = ['gics_sector', 'gics_sector_desc']

df = pd.merge(df, gics, on='gics_sector', how='left').drop('gics_sector', axis=1)

df['country'] = df['country'].astype('category')
df['gics_sector_desc'] = df['gics_sector_desc'].astype('category')

### Train - Test Split

In [4]:
train_df = df[df['_year'] < 2023]
test_df = df[df['_year'] >= 2023]

### Model

##### Settings

In [5]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor

In [6]:
X_train = train_df.drop(['_code', '_year', '_month', 'monthly_rtn_1mf', 'monthly_start_high_rtn_1mf'], axis=1)
y_train = train_df['monthly_rtn_1mf']
X_test = test_df.drop(['_code', '_year', '_month', 'monthly_rtn_1mf', 'monthly_start_high_rtn_1mf'], axis=1)
y_test = test_df['monthly_rtn_1mf']

##### LightGBM

In [9]:
hyper_params = {
    'lgbr__boosting_type': ['gbdt', 'rf'],
    'lgbr__objective': ['regression'],
    'lgbr__metric': ['l1','l2'],
    'lgbr__learning_rate': [0.1, 0.05, 0.01],
    'lgbr__feature_fraction': [0.5, 0.75, 0.9],
    'lgbr__bagging_fraction': [0.5, 0.75, 0.9],
    'lgbr__bagging_freq': [5, 10],
    'lgbr__max_depth': [-1],
    'lgbr__num_leaves': [31, 63, 127, 255],  
    'lgbr__max_bin': [512],
    'lgbr__num_iterations': [50000, 100000],
    'lgbr__colsample_bytree': [1],
}

fit_params = {
    'lgbr__categorical_feature': ['country', 'gics_sector_desc'],
}

# hyper_params = {
#     'lgbr__boosting_type': ['gbdt'],
#     'lgbr__objective': ['regression'],
#     'lgbr__metric': ['l2'],
#     'lgbr__learning_rate': [0.01],
#     'lgbr__feature_fraction': [0.9],
#     'lgbr__bagging_fraction': [0.9],
#     'lgbr__bagging_freq': [5],
#     'lgbr__max_depth': [-1],
#     'lgbr__num_leaves': [31],  
#     'lgbr__max_bin': [512],
#     'lgbr__colsample_bytree': [1],
# }

In [10]:
pipeline = Pipeline([('lgbr', LGBMRegressor())])
gscv_lgbr = GridSearchCV(estimator = pipeline, param_grid = hyper_params, scoring ='neg_mean_squared_error', cv = KFold(n_splits=8), refit=True, n_jobs=1, verbose=2)
gscv_lgbr.fit(X_train, y_train, **fit_params)

Fitting 8 folds for each of 1728 candidates, totalling 13824 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012963 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20029
[LightGBM] [Info] Number of data points in the train set: 137769, number of used features: 43
[LightGBM] [Info] Start training from score -0.001530


KeyboardInterrupt: 

In [None]:
gscv_lgbr.predict(test_df.drop(['_code', '_year', '_month', 'monthly_rtn_1mf', 'monthly_start_high_rtn_1mf'], axis=1))