In [1]:
from pathlib import Path
from tqdm import tqdm
import numpy as np
import pandas as pd
import plotly.graph_objects as go

### Data

##### Filter

In [2]:
# Universe filters applied to the metadata table further down.
# COUNTRY is compared against meta['country'], SECTOR against
# meta['gics_sector']; a value of None skips that filter.
COUNTRY, SECTOR = 'US', None

##### Meta

In [3]:
# Load the stock metadata from <cwd>/data/meta.csv, parsing the
# `first_include` column as a %Y-%m-%d date.
meta = pd.read_csv(
    Path.cwd().joinpath("data", "meta.csv"),
    date_format="%Y-%m-%d",
    parse_dates=["first_include"],
)

In [26]:
# Scratch check: the meta.csv location built from Path.cwd() is a pathlib.Path.
# NOTE(review): leftover debugging cell (it was executed out of order, as
# In [26]); nothing below depends on it, so it can probably be deleted.
isinstance(Path.cwd().joinpath("data", "meta.csv"), Path)

True

In [4]:
# Narrow the metadata to the configured universe. Each filter is applied
# only when its constant is not None, and the index is renumbered afterwards.
if COUNTRY is not None:
    meta = meta.loc[meta['country'].eq(COUNTRY)].reset_index(drop=True)
if SECTOR is not None:
    meta = meta.loc[meta['gics_sector'].eq(SECTOR)].reset_index(drop=True)

##### Historical Prices (Monthly)

In [5]:
# Load the monthly price statistics from
# <cwd>/data/historical_prices_monthly_stat.csv and order the rows by
# code, then year, then month (ascending), with a fresh 0..n-1 index.
historical = pd.read_csv(
    Path.cwd().joinpath("data", "historical_prices_monthly_stat.csv")
)
historical = historical.sort_values(by=["_code", "_year", "_month"]).reset_index(drop=True)

##### Merge Meta & Historical Prices

In [6]:
# Attach the metadata columns to every monthly row. The inner join keeps
# only codes that are present in both tables.
df = historical.merge(meta, on="_code", how="inner")

# Month key: "<year><zero-padded 2-digit month>" parsed with format %Y%m.
df["ym"] = pd.to_datetime(
    df["_year"].astype(str) + df["_month"].astype(str).str.rjust(2, "0"),
    format="%Y%m",
)

# Keep only the months at or after each code's `first_include` date, so that
# price history from before inclusion is not used (avoids a survival effect).
df = df.loc[df["ym"].ge(df["first_include"])].reset_index(drop=True)

In [7]:
# Keep rows whose `monthly_high_end_rtn` is <= 0; rows where it is NaN fail
# the comparison and are dropped as well.
# NOTE(review): presumably a sanity filter on the high-to-month-end return,
# which should never be positive — confirm the intent.
df = df.loc[df['monthly_high_end_rtn'].le(0.0)]

### Features

##### Add Features

In [8]:
# Ratio of `monthly_start_high_nbdays` to `monthly_nbdays`.
# Presumably how far into the month the high was reached — TODO confirm.
df['monthly_high_position'] = df['monthly_start_high_nbdays'].div(df['monthly_nbdays'])

In [9]:
# Binary label: 1 when `monthly_rtn` is strictly positive, otherwise 0
# (NaN compares as False and therefore maps to 0).
# Disabled alternative: label relative to the per-(_year, _month) median
# (or 0.75 quantile) of `monthly_rtn`, merged back as `monthly_rtn_median`.
df['monthly_rtn_class'] = df['monthly_rtn'].gt(0).astype(int)

##### 1 Month After

In [10]:
# Append "1mf_"-prefixed copies of three columns holding the NEXT row's value
# within the same `_code` group (shift(-1)); the last row of each group gets NaN.
# Assumes rows are in chronological order within each code with no missing
# months, so that "next row" means "next month" — TODO confirm.
# A symmetric backward shift ("1mb_" prefix, shift(1)) is currently disabled.
df = pd.concat(
    [
        df,
        df.groupby("_code", as_index=False)[
            ['monthly_rtn', 'monthly_start_high_rtn', 'monthly_rtn_class']
        ]
        .shift(-1)
        .rename(columns=lambda col: "1mf_" + col),
    ],
    axis=1,
)
df = df.reset_index(drop=True)

In [11]:
# Identifier columns carried along with each sample (not used as features).
VAR_INFO = ['_code', '_year', '_month']

# Model features: statistics of the current month.
# Candidates currently disabled:
#   - previous-month ("1mb_") versions of: monthly_rtn, monthly_start_high_rtn,
#     monthly_high_low_rtn, monthly_high_end_rtn, monthly_mdd, monthly_vola,
#     monthly_dvola, and the *_davg variants of the four return columns
#   - monthly_high_low_rtn
#   - monthly_rtn_davg, monthly_high_low_rtn_davg, monthly_high_end_rtn_davg,
#     monthly_start_high_rtn_davg
#   - country, gics_sector
VAR_X = [
    'monthly_rtn',
    'monthly_start_high_rtn',
    'monthly_high_end_rtn',
    'monthly_mdd',
    'monthly_vola',
    'monthly_dvola',
    'monthly_high_position',
]

# Prediction target: the one-step-forward ("1mf_") start-to-high return.
# Disabled alternatives: '1mf_monthly_rtn_class', '1mf_monthly_rtn'.
VAR_Y = ['1mf_monthly_start_high_rtn']

In [12]:
# Keep only the identifier, feature and target columns, in that order.
df = df[[*VAR_INFO, *VAR_X, *VAR_Y]]

In [13]:
# Drop every row that has a missing value in any remaining column (this
# includes the last row of each code, whose forward target is NaN).
# The result is rebound instead of using inplace=True: `df` was just produced
# by a column selection, and mutating such a frame in place can raise
# chained-assignment warnings and makes the cell harder to reason about.
df = df.dropna(how='any')

### Train - Validate - Test Split

In [14]:
# Hold out a quarter of the rows as the test set; all remaining rows train.
#
# The rows are drawn WITHOUT replacement and with a fixed seed, so the test
# set contains exactly len(df) // 4 distinct rows and the split is identical
# on every Restart & Run All. (Drawing positions with np.random.choice uses
# replace=True by default, which yields duplicated test rows and an unseeded,
# non-reproducible split.)
#
# NOTE(review): this is a row-level random split, so rows of the same code and
# of adjacent months can land on both sides — confirm this is acceptable for
# the evaluation, or switch to a time-based split.
SPLIT_SEED = 42
test_df = df.sample(n=len(df) // 4, replace=False, random_state=SPLIT_SEED).sort_index()
train_df = df.loc[~df.index.isin(test_df.index)]

### Model

##### Settings

##### DataSet

In [15]:
# Split each partition into its feature frame (VAR_X columns) and its target
# frame (VAR_Y columns). The targets stay DataFrames because VAR_Y is a list.
train_x, train_y = train_df[VAR_X], train_df[VAR_Y]
test_x, test_y = test_df[VAR_X], test_df[VAR_Y]

##### LightGBM

In [16]:
# LightGBM settings; unpacked into LGBMRegressor(**hyper_params) below.
# A small learning rate is paired with a very large iteration budget.
hyper_params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric=['l1', 'l2'],
    learning_rate=0.005,
    feature_fraction=0.9,
    bagging_fraction=0.7,
    bagging_freq=10,
    verbose=0,
    max_depth=8,
    num_leaves=128,
    max_bin=512,
    num_iterations=100000,
)

In [17]:
from lightgbm import LGBMRegressor

# Build the regressor from `hyper_params` and fit it on the training split.
# The test split is passed as the evaluation set, scored with the L1 metric.
# NOTE(review): no callbacks are passed to fit(), so nothing here stops
# training before the configured number of iterations — confirm whether
# early stopping (on a separate validation split) was intended.
lgbr = LGBMRegressor(**hyper_params)
lgbr.fit(X=train_x, y=train_y, eval_metric='l1', eval_set=[(test_x, test_y)])

# Predict on the test features with the fitted model.
y_pred = lgbr.predict(X=test_x)





In [18]:
# One-row table of the fitted model's feature importances, with one column
# per training feature (same order as train_x.columns).
feature_importance = pd.DataFrame(
    lgbr.feature_importances_[np.newaxis, :],
    index=['feature_importance'],
    columns=train_x.columns,
)
feature_importance

Unnamed: 0,monthly_rtn,monthly_start_high_rtn,monthly_high_end_rtn,monthly_mdd,monthly_vola,monthly_dvola,monthly_high_position
feature_importance,693857,708668,662958,800265,833308,830023,635495
