In [1]:
from pathlib import Path
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import plotly.graph_objects as go

### Data

##### Filter

In [2]:
COUNTRY = 'US'
SECTOR = None

##### Meta

In [3]:
meta = pd.read_csv(
    Path.cwd() / "data" / "meta.csv",
    parse_dates=["first_include"],
    date_format="%Y-%m-%d",
)

In [4]:
if COUNTRY is not None:
    meta = meta[meta['country'] == COUNTRY].reset_index(drop=True)
if SECTOR is not None:
    meta = meta[meta['gics_sector'] == SECTOR].reset_index(drop=True)

##### Historical Prices (Monthly)

In [5]:
historical = (
    pd.read_csv(Path.cwd() / "data" / "historical_prices_monthly_stat.csv")
    .sort_values(["_code", "_year", "_month"], ascending=True)
    .reset_index(drop=True)
)

##### Merge Meta & Historical Prices

In [6]:
df = pd.merge(historical, meta, how="inner", on="_code")
df["ym"] = pd.to_datetime(
    df["_year"].astype(str) + df["_month"].astype(str).str.rjust(2, "0"), 
    format="%Y%m"
)
# Only use historical price data to remove survival effect
df = df[df["ym"] >= df["first_include"]].reset_index(drop=True)

##### 1 Month After

In [7]:
df = pd.concat(
    [
        df, 
        df.groupby("_code", as_index=False)[['monthly_rtn', 'monthly_start_high_rtn']].shift(-1).rename(columns={c: "1mf_" + c for c in df.columns}),
        # df.groupby("_code", as_index=False).shift(1).rename(columns={c: "1mb_" + c for c in df.columns}) 
    ],
    axis=1,
).reset_index(drop=True)

In [8]:
df['monthly_high_position'] = df['monthly_start_high_nbdays'] / df['monthly_nbdays']

In [9]:
df = df[df['monthly_high_end_rtn'] <= -0.3]

### Features

In [10]:
VAR_INFO = [
    '_code',
    '_year',
    '_month',
]

VAR_X = [
    # '1mb_monthly_rtn',
    # '1mb_monthly_start_high_rtn',
    # '1mb_monthly_high_low_rtn',
    # '1mb_monthly_high_end_rtn',
    # '1mb_monthly_mdd',
    # '1mb_monthly_vola',
    # '1mb_monthly_dvola',
    # '1mb_monthly_rtn_davg',
    # '1mb_monthly_high_low_rtn_davg',
    # '1mb_monthly_high_end_rtn_davg',
    # '1mb_monthly_start_high_rtn_davg',
    'monthly_rtn',
    'monthly_start_high_rtn',
    # 'monthly_high_low_rtn',
    'monthly_high_end_rtn',
    'monthly_mdd',
    'monthly_vola',
    'monthly_dvola',
    'monthly_high_position',
    # 'monthly_rtn_davg',
    # 'monthly_high_low_rtn_davg',
    # 'monthly_high_end_rtn_davg',
    # 'monthly_start_high_rtn_davg',
    # 'country',
    # 'gics_sector',
 ]

VAR_Y = [
    '1mf_monthly_rtn',
] # '1mf_monthly_rtn' # 

In [11]:
df = df[VAR_INFO+VAR_X+VAR_Y]

In [12]:
df.dropna(how='any', inplace=True)

### Train - Validate - Test Split

In [13]:
train_df = df[df['_year'] <= 2022]
test_df = df[df['_year'] > 2022]

### Model

##### Settings

In [14]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
NUM_BATCH = 100
NUM_EPOCH = 10000
LR = 0.005

##### DataSet

In [15]:
train_x = torch.from_numpy(train_df[VAR_X].to_numpy()).float() * 100
train_y = torch.from_numpy(train_df[VAR_Y].to_numpy()).float() * 100

##### MLP

In [16]:
class MLP(nn.Module):

    def __init__(self, xdim: int = 4, ydim: int = 1,):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(xdim, xdim * 2),
            # nn.Dropout(dropout),
            # nn.BatchNorm1d(xdim * 2),
            nn.ReLU(),
            nn.Linear(xdim * 2, xdim * 2),
            # nn.BatchNorm1d(xdim * 2),
            nn.ReLU(),
            nn.Linear(xdim * 2, ydim),
        )
    
    def forward(self, x):
        return self.mlp(x)

In [17]:
model = MLP(len(VAR_X), len(VAR_Y)).to(DEVICE)
loss_fn = nn.MSELoss().to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr = LR)

##### Train

In [18]:
tl_process = []

for epoch in tqdm(range(NUM_EPOCH)):
    batch_loss = []
    model.train()
    for _ in range(train_x.shape[0] // NUM_BATCH):
        idx = np.random.choice(train_x.shape[0], size=NUM_BATCH, replace=False)
        out = model(train_x[idx].to(DEVICE))
        loss = loss_fn(out, train_y[idx].to(DEVICE))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_loss.append(loss.item())
    train_loss = np.mean(batch_loss)

    if (epoch+1) % 1000 == 0:
        print(f'epoch {epoch+1}: train loss = {train_loss:.6f}')
    tl_process.append(train_loss)

 10%|█         | 1011/10000 [00:14<02:10, 68.75it/s]

epoch 1000: train loss = 747.771037


 20%|██        | 2012/10000 [00:29<01:58, 67.55it/s]

epoch 2000: train loss = 652.531601


 30%|███       | 3010/10000 [00:43<01:41, 68.64it/s]

epoch 3000: train loss = 750.138112


 40%|████      | 4012/10000 [00:58<01:23, 71.92it/s]

epoch 4000: train loss = 714.804850


 50%|█████     | 5008/10000 [01:13<01:13, 68.32it/s]

epoch 5000: train loss = 681.810486


 60%|██████    | 6013/10000 [01:28<00:56, 70.03it/s]

epoch 6000: train loss = 649.687714


 70%|███████   | 7007/10000 [01:43<00:44, 67.94it/s]

epoch 7000: train loss = 654.309519


 80%|████████  | 8006/10000 [01:58<00:31, 63.72it/s]

epoch 8000: train loss = 722.185076


 90%|█████████ | 9010/10000 [02:14<00:14, 70.59it/s]

epoch 9000: train loss = 750.429899


100%|██████████| 10000/10000 [02:29<00:00, 66.97it/s]

epoch 10000: train loss = 556.881615





In [19]:
fig = go.Figure()
fig.add_trace(go.Scatter(y=tl_process, mode='lines', name='train loss', line_color='rgb(4, 59, 114)'))

fig.update_layout(width=800, height=400, title="Simple MLP Loss Graph", yaxis_title="loss", xaxis_title="epoch",)
fig.show()

In [20]:
model.eval()
with torch.no_grad():
    result = model(train_x)
    print(f"total loss: {loss_fn(result, train_y)}")

total loss: 625.3265380859375


In [21]:
import plotly.express as px
fig = px.scatter(x=result.squeeze().numpy()/100, y=train_y.squeeze().numpy()/100,
              title='custom tick labels')
fig.show()

In [22]:
test_x = torch.from_numpy(test_df[VAR_X].to_numpy()).float()
test_y = torch.from_numpy(test_df[VAR_Y].to_numpy()).float()

In [23]:
model.eval()
with torch.no_grad():
    result = model(test_x)
    print(f"total loss: {loss_fn(result, test_y)}")

total loss: 1158.1597900390625


In [24]:
import plotly.express as px
fig = px.scatter(x=result.squeeze().numpy()/100, y=test_y.squeeze().numpy()/100,
              title='custom tick labels')
fig.show()