In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

In [2]:
# Per-month statistic columns. Each one is fed to the model twice: once for
# the current month and once as its "1mb_" (one month back) counterpart.
_MONTHLY_STATS = [
    'monthly_rtn',
    'monthly_start_high_rtn',
    'monthly_high_low_rtn',
    'monthly_high_end_rtn',
    'monthly_mdd',
    'monthly_vola',
    'monthly_dvola',
    'monthly_rtn_davg',
    'monthly_high_low_rtn_davg',
    'monthly_high_end_rtn_davg',
    'monthly_start_high_rtn_davg',
]

# Feature columns, lagged block first. The order matters: later cells split
# features from the target positionally with iloc.
# The categorical columns 'country' and 'gics_sector' are intentionally left out.
VAR_X = ['1mb_' + stat for stat in _MONTHLY_STATS] + _MONTHLY_STATS

# Target column ("1mf_" = one month forward). Alternative target: '1mf_monthly_rtn'.
VAR_Y = '1mf_monthly_start_high_rtn'

In [3]:
# Universe filters applied to `meta` in a later cell.
# Setting either one to None skips the corresponding filter.
COUNTRY = 'US'  # matched against meta['country']
SECTOR = None  # matched against meta['gics_sector']

In [4]:
# Security metadata, read from ./data relative to the current working directory.
# `first_include` is parsed as a date using the explicit YYYY-MM-DD format.
meta_path = Path.cwd() / "data" / "meta.csv"
meta = pd.read_csv(meta_path, parse_dates=["first_include"], date_format="%Y-%m-%d")

In [5]:
# Narrow the metadata to the configured universe; a None setting means "no filter".
if COUNTRY is not None:
    country_match = meta['country'] == COUNTRY
    meta = meta[country_match].reset_index(drop=True)

if SECTOR is not None:
    sector_match = meta['gics_sector'] == SECTOR
    meta = meta[sector_match].reset_index(drop=True)

In [6]:
# Monthly statistics per security. Rows with any missing value are discarded,
# then the frame is ordered by security and calendar month so that later
# row-wise shifts within a security follow time order.
historical_path = Path.cwd() / "data" / "historical_prices_monthly_stat.csv"
historical = pd.read_csv(historical_path).dropna()
historical = historical.sort_values(by=["_code", "_year", "_month"]).reset_index(drop=True)

In [7]:
# Attach metadata to each monthly row; the inner join keeps only securities
# that survived the COUNTRY / SECTOR filters on `meta`.
df = historical.merge(meta, how="inner", on="_code")

# Build a month-start timestamp from the separate year / month columns
# (month is left-padded to two digits so "%Y%m" parses unambiguously).
year_text = df["_year"].astype(str)
month_text = df["_month"].astype(str).str.rjust(2, "0")
df["ym"] = pd.to_datetime(year_text + month_text, format="%Y%m")

# Drop months that precede the security's `first_include` date.
on_or_after_inclusion = df["ym"] >= df["first_include"]
df = df[on_or_after_inclusion].reset_index(drop=True)

In [8]:
# Add one-month-forward ("1mf_") and one-month-back ("1mb_") copies of every
# non-key column, computed per security.
#
# NOTE: groupby.shift moves by *row position*, not by calendar month. Rows were
# removed upstream (dropna on the raw statistics, the first_include filter), so
# a security may have gaps in its monthly history. Without a check, the
# "next month" row could really be two or more months away, which would
# silently mislabel the target and the lag features. We therefore keep only
# rows whose neighbours are exactly one calendar month apart.
#
# This cell overwrites `df`; re-running it without re-running the previous
# cell would stack prefixes ("1mf_1mf_..."), so run the notebook top to bottom.
by_code = df.groupby("_code", as_index=False)
next_month = by_code.shift(-1).rename(columns=lambda c: "1mf_" + c)
prev_month = by_code.shift(1).rename(columns=lambda c: "1mb_" + c)

# dropna removes the first / last month of each security (no neighbour) as
# well as any row with a missing value in any column.
panel = pd.concat([df, next_month, prev_month], axis=1).dropna()

one_month = pd.DateOffset(months=1)
has_adjacent_months = (
    (panel["1mf_ym"] == panel["ym"] + one_month)
    & (panel["1mb_ym"] == panel["ym"] - one_month)
)
df = panel[has_adjacent_months].reset_index(drop=True)

In [9]:
# Restrict the sample to months whose `monthly_high_end_rtn` lies in
# [-0.4, -0.1] (both bounds inclusive).
# NOTE(review): presumably this selects months that closed 10-40% below their
# high; confirm the column definition and consider naming the thresholds.
high_end_rtn = df['monthly_high_end_rtn']
df_total = df[high_end_rtn.between(-0.4, -0.1)].reset_index(drop=True)

In [10]:
# Chronological split on the month column:
#   train: everything up to and including 2023-02
#   test : 2023-03 through 2024-02
# (A separate validation window, 2022-03 .. 2023-02, is currently disabled;
# validation rows are instead drawn at random from the training period later.)
model_columns = VAR_X + [VAR_Y]  # target stays last; later cells rely on that

month = df_total['ym']
in_train_period = month <= '2023-02-01'
in_test_period = (month >= '2023-03-01') & (month <= '2024-02-01')

df_train = df_total[in_train_period][model_columns]
df_test = df_total[in_test_period][model_columns]

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math

In [12]:
# Split the training period into fit (80%) and validation (20%) parts.
# The last column of df_train is the target; everything before it is a feature.
train_features = df_train.iloc[:, :-1]
train_target = df_train.iloc[:, -1]

x_t, x_v, y_t, y_v = train_test_split(
    train_features, train_target, test_size=0.2, random_state=0
)

In [13]:
# Hyper-parameter grid explored by the grid search (4 x 3 x 3 combinations).
params = dict(
    n_estimators=(100, 200, 300, 500),
    max_features=(5, 10, 15),
    max_depth=(5, 10, 15),
)

In [14]:
# Template forest (fixed seed for reproducibility) and a 2-fold grid search
# over `params`. No `scoring` is given, so the estimator's default score is
# used to rank the candidates.
rf_run = RandomForestRegressor(
    oob_score=False,
    n_jobs=-1,
    random_state=1,
)
grid_cv = GridSearchCV(
    estimator=rf_run,
    param_grid=params,
    cv=2,
    n_jobs=-1,
)

In [15]:
# Run the grid search on the fit split only; x_v / y_v are not seen here and
# remain available for validation afterwards.
grid_cv.fit(x_t, y_t)

In [16]:
# Report the winning hyper-parameters ("최적 하이퍼 파라미터") and the
# corresponding mean cross-validated score ("최적 예측 정확도").
# No `scoring` was passed to GridSearchCV, so best_score_ is the estimator's
# default score rather than an RMSE.
best_params = grid_cv.best_params_
best_score = grid_cv.best_score_
print(f'최적 하이퍼 파라미터: {best_params}')
print(f'최적 예측 정확도: {best_score:.4f}')

최적 하이퍼 파라미터: {'max_depth': 15, 'max_features': 5, 'n_estimators': 500}
최적 예측 정확도: 0.1548


In [17]:
# GridSearchCV refits the best configuration on the full (x_t, y_t) data by
# default (refit=True), using a clone of the template estimator and therefore
# the same random_state. Building and fitting a second forest with
# **best_params_ would only repeat that training, and single-threaded at that,
# because the hand-built copy dropped n_jobs. Reuse the already-fitted model.
rf_run = grid_cv.best_estimator_
rf_run


In [18]:
def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    Arguments follow scikit-learn's (y_true, y_pred) convention.
    """
    return math.sqrt(mean_squared_error(y_true, y_pred))


# Train RMSE: in-sample error on the rows the forest was fitted on.
train_predict = rf_run.predict(x_t)
print("Train RMSE:{}".format(_rmse(y_t, train_predict)))

# Validation RMSE: random 20% hold-out from the training period.
valid_predict = rf_run.predict(x_v)
print("Validate RMSE:{}".format(_rmse(y_v, valid_predict)))

# Test RMSE: out-of-time period. The last column of df_test is the target,
# everything before it is a feature. `test_predict` is reused by the plot below.
test_predict = rf_run.predict(df_test.iloc[:, :-1])
print("Test RMSE:{}".format(_rmse(df_test.iloc[:, -1], test_predict)))

Train RMSE:0.050671440540055956
Validate RMSE':0.09787953162631637
Test RMSE':0.09974763600540207


In [19]:
# Pair each importance reported by the fitted forest with its feature name
# (the feature columns are every column of df_test except the last), then keep
# the 20 largest.
feature_names = df_test.iloc[:, :-1].columns
ftr_importances_values = rf_run.feature_importances_
ftr_importances = pd.Series(data=ftr_importances_values, index=feature_names)
ftr_top = ftr_importances.sort_values(ascending=False).head(20)

In [20]:
# Display the top feature importances, largest first.
ftr_top

monthly_vola                       0.083395
1mb_monthly_vola                   0.075962
1mb_monthly_mdd                    0.057155
1mb_monthly_dvola                  0.057114
monthly_high_low_rtn               0.055704
1mb_monthly_rtn                    0.048073
1mb_monthly_rtn_davg               0.048009
1mb_monthly_high_low_rtn_davg      0.046112
1mb_monthly_start_high_rtn         0.044626
monthly_mdd                        0.043349
1mb_monthly_high_end_rtn_davg      0.042884
1mb_monthly_high_low_rtn           0.042169
monthly_dvola                      0.040895
monthly_high_end_rtn_davg          0.038640
1mb_monthly_start_high_rtn_davg    0.037731
monthly_high_low_rtn_davg          0.035562
monthly_rtn_davg                   0.035295
monthly_high_end_rtn               0.034877
monthly_rtn                        0.034833
1mb_monthly_high_end_rtn           0.033701
dtype: float64

In [21]:
import plotly.express as px
import plotly.graph_objects as go
# Predicted vs. actual target on the test period.
# The extra "slope" trace plots the actual values against themselves, i.e. the
# diagonal on which a perfect prediction would fall.
actual_test = df_test.iloc[:, -1]

fig = px.scatter(x=actual_test, y=test_predict)
diagonal = go.Scatter(
    x=actual_test,
    y=actual_test,
    name="slope",
    line_shape='linear',
)
fig.add_trace(diagonal)
fig.show()