<h1 align=center> FIA45001 Data Science for Finance Final Project Outline</h1>

<h6 align=right>20215174 JunPyoPark</h6>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#-FIA45001-Data-Science-for-Finance-Final-Project-Outline" data-toc-modified-id="-FIA45001-Data-Science-for-Finance-Final-Project-Outline-1"> FIA45001 Data Science for Finance Final Project Outline</a></span><ul class="toc-item"><li><span><a href="#Process-Data" data-toc-modified-id="Process-Data-1.1">Process Data</a></span><ul class="toc-item"><li><span><a href="#Read-pkl" data-toc-modified-id="Read-pkl-1.1.1">Read pkl</a></span></li></ul></li><li><span><a href="#Naive-Strategies" data-toc-modified-id="Naive-Strategies-1.2">Naive Strategies</a></span></li><li><span><a href="#Momentum-Strategies" data-toc-modified-id="Momentum-Strategies-1.3">Momentum Strategies</a></span></li><li><span><a href="#LSTM-model" data-toc-modified-id="LSTM-model-1.4">LSTM model</a></span><ul class="toc-item"><li><span><a href="#Process-Dataset" data-toc-modified-id="Process-Dataset-1.4.1">Process Dataset</a></span></li><li><span><a href="#Load-Model" data-toc-modified-id="Load-Model-1.4.2">Load Model</a></span></li><li><span><a href="#Check-LSTM-Model-Performance" data-toc-modified-id="Check-LSTM-Model-Performance-1.4.3">Check LSTM Model Performance</a></span></li></ul></li></ul></li></ul></div>

## Process Data

In [362]:
import torch
import torch.nn as nn
from torch.autograd import Variable

In [363]:
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta
import warnings
import matplotlib.pyplot as plt
from utils import *
warnings.filterwarnings('ignore')

### Read pkl

In [3]:
# Load the CRSP returns table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only read pickles you created yourself.
crsp_returns = pd.read_pickle('crsp_returns.pkl')

In [4]:
# Put the rows in index order, then keep the label range that starts at '1975'.
crsp_returns = crsp_returns.sort_index().loc['1975':]

In [6]:
# Run every entry of the RET column through convert_to_float and write the result back.
# NOTE(review): convert_to_float is not defined in this notebook - presumably it comes from
# `from utils import *`; its handling of non-numeric entries should be confirmed there.
returns = crsp_returns['RET']
crsp_returns['RET'] = returns.apply(convert_to_float)

In [7]:
# Drop every row that has a missing value in any column.
crsp_returns = crsp_returns.dropna()

In [8]:
# Load the mark_cap table (presumably market capitalisation - confirm) and drop rows with missing values.
# NOTE(review): unpickling can execute arbitrary code - only read pickles you created yourself.
mark_cap = pd.read_pickle('mark_cap.pkl').dropna()

In [9]:
# Create the market calendar used as the datetime reference: the distinct
# index values of the return panel in ascending order. Later cells look
# dates up by position (calendar.iloc[k]).
calendar = pd.Series(crsp_returns.index.unique()).sort_values()
# Keep dates strictly after 1975-01-01 and relabel 0..n-1 so that label-based
# and position-based access agree.
calendar = calendar[calendar > '1975'].reset_index(drop=True)

## Naive Strategies

```python
# Equal-weighted and value-weighted portfolios over all stocks.
# Delisted stocks are assigned a return of -1.
ret_eq = []
ret_value = []
for t_months in range(12, len(calendar)): # entire period
# for t_months in range(12, 20): # a specific sub-period
    dt = calendar.iloc[t_months] # date at position t_months (index 12 is 1976-01-30)
    st = calendar.iloc[t_months-12]
    et = calendar.iloc[t_months-1] 
    
    perm_list, past_returns, current_markcap = filter_perm(st, et) # past one year of information as of the current date
    
    # Equal-weighted portfolio over all stocks
    eq_return = get_port_return(perm_list, weighting_method='Equal_Weighted')
    ret_eq.append(eq_return)
    
    # Value-weighted portfolio over the same stocks
    val_return = get_port_return(perm_list, weighting_method='Value_Weighted')
    ret_value.append(val_return)
```

## Momentum Strategies
* PR1YR
* Acceleration

In [432]:
def get_pr1yr_return(returns_12m):
    """
    Compute PR1YR: the compounded return over the look-back window,
    excluding the most recent row.

    Parameters
    ----------
    returns_12m : pandas.DataFrame
        Look-back window with a 'RET' column of simple returns. Rows are
        assumed to be ordered oldest to newest (the last row is dropped).

    Returns
    -------
    float
        Compounded return of all rows except the last, or NaN when the
        window has fewer than two rows (nothing is left to compound).
    """
    # Drop the latest row by position so the result does not depend on the index type.
    past = returns_12m['RET'].iloc[:-1]
    if past.empty:
        return float('nan')

    cumulative_return = (past + 1).cumprod() - 1
    # .iloc[-1] is positional; a bare [-1] is a label look-up and raises
    # KeyError when the index holds integers.
    return cumulative_return.iloc[-1]

In [407]:
def get_acc_momentum(returns_12m):
    """
    Acceleration momentum under a simple +1 / -1 weighting scheme.

    The first six rows of 'RET' get weight +1 and the last six get weight -1;
    the weighted returns are summed into a single score.
    """
    signs = np.repeat([1, -1], 6)
    weighted = returns_12m['RET'] * signs
    return weighted.sum()

In [None]:
pr1yr_returns = {}
acc_returns = {}  # must exist before the loop stores into it

# Walk every date that has a full 12-month look-back window (entire sample).
# To restrict the run to the post-2009 test period, start the range at 420.
for t_months in range(12, len(calendar)):
    dt = calendar.iloc[t_months]       # portfolio date (index 12 is 1976-01-30)
    st = calendar.iloc[t_months - 12]  # first date of the look-back window
    et = calendar.iloc[t_months - 1]   # last date of the look-back window

    # Past one year of information as of dt.
    # NOTE(review): filter_perm / filtr / calc_decile_return are not defined in this
    # notebook - presumably they come from `from utils import *`.
    perm_list, past_returns, current_markcap = filter_perm(st, et)

    # Score every stock on both signals from its own look-back returns.
    pr1yr_scores = []
    acc_scores = []
    for perm in perm_list:
        returns_12m = filtr(past_returns, col='PERMNO', isin=[perm])
        pr1yr_scores.append(get_pr1yr_return(returns_12m))
        acc_scores.append(get_acc_momentum(returns_12m))

    # Explicit float dtype keeps the Series numeric even when perm_list is empty.
    pr1yr_momentum = pd.Series(pr1yr_scores, index=perm_list, dtype=float)
    acc_momentum = pd.Series(acc_scores, index=perm_list, dtype=float)
    pr1yr_momentum = pr1yr_momentum.sort_values(ascending=False)
    acc_momentum = acc_momentum.sort_values(ascending=False)

    # Returns of decile portfolios Q1..Q10 for each signal.
    pr1yr_returns[dt] = np.array(
        [calc_decile_return(pr1yr_momentum, decile) for decile in range(1, 11)],
        dtype=float,
    )
    acc_returns[dt] = np.array(
        [calc_decile_return(acc_momentum, decile) for decile in range(1, 11)],
        dtype=float,
    )

In [202]:
# Turn each {date: decile-return array} dict into a date-indexed frame
# whose columns are labelled Q1..Q10.
pr1yr_returns = pd.DataFrame.from_dict(pr1yr_returns, orient='index')
pr1yr_returns.columns = ['Q%d' % decile for decile in range(1, 11)]

acc_returns = pd.DataFrame.from_dict(acc_returns, orient='index')
acc_returns.columns = ['Q%d' % decile for decile in range(1, 11)]

# save results
# acc_returns.to_pickle('acc_return.pkl')
# pr1yr_returns.to_pickle('pr1yr_return.pkl')

## LSTM model

In [30]:
# Before 2010 -> Train, After 2010 -> Test
# Train on roughly the first 75% of the sample, then use 2010 onward as the test set.
# NOTE(review): `results` is not defined in any cell visible in this notebook - presumably a
# date-indexed frame left over from an earlier session; define or load it before this cell.

len(results.loc[:'2009']) / len(results)

0.7555555555555555

In [31]:
# First date with a full 12-month look-back window (the loops in this notebook start at index 12).
calendar.iloc[12]

Timestamp('1976-01-30 00:00:00')

In [32]:
# Last date covered by the training loop, which iterates over range(12, 420).
calendar.iloc[419]

Timestamp('2009-12-31 00:00:00')

### Process Dataset

```python
trainX = []
trainY = []

for t_months in range(12, 420):  # training period (calendar index 419 is 2009-12-31)
    dt = calendar.iloc[t_months]       # target date whose return is the label
    st = calendar.iloc[t_months - 12]  # first date of the look-back window
    et = calendar.iloc[t_months - 1]   # last date of the look-back window

    # Past one year of information as of dt.
    perm_list, past_returns, current_markcap = filter_perm(st, et)

    for perm in perm_list:
        # Look-back returns of this stock form the input sequence.
        returns_12m = filtr(past_returns, col='PERMNO', isin=[perm])
        trainX.append(returns_12m['RET'].values)

        # Return realised on the target date is the label.
        _y = filtr(crsp_returns.loc[dt], col='PERMNO', isin=[perm])['RET'].values
        # Test for emptiness with .size: `if _y:` checks the VALUE of a one-element
        # array, so a genuine 0.0 return would be mistaken for a missing one.
        if _y.size == 0:
            _y = np.array([-1])  # no observation on the target date: treated as delisted
        trainY.append(_y)
```

In [33]:
# Use the first CUDA GPU when one is available; otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda:0


In [34]:
# Show the GPU model in use. On a machine without CUDA the bare call raises,
# which would stop a full re-run, so fall back to a plain label there.
torch.cuda.get_device_name() if torch.cuda.is_available() else 'cpu'

'GeForce RTX 3080'

MLP 해봤는데 잘 안돼서 버림 (I also tried an MLP, but it performed poorly, so I dropped it.)

``` python
# Build NN
class Net(nn.Module):
    """Fully connected regressor: 12 inputs -> 512 -> 256 -> 128 -> 64 -> 1, ReLU between layers."""

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()

        # Widths of the input and hidden layers; the final projection to one output follows.
        widths = (12, 512, 256, 128, 64)
        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages.append(nn.Linear(fan_in, fan_out))
            stages.append(nn.ReLU())
        stages.append(nn.Linear(widths[-1], 1))

        # Same module order as listing the layers by hand, so state_dict keys are unchanged.
        self.layer = nn.Sequential(*stages)

    def forward(self, x):
        flat = self.flatten(x)
        return self.layer(flat)
```

In [154]:
class LSTM(nn.Module):
    """
    LSTM encoder followed by a linear head.

    The final hidden state of the last LSTM layer is passed through a
    fully connected layer to produce `num_classes` outputs per sequence.

    Parameters
    ----------
    num_classes : int
        Number of outputs of the linear head.
    input_size : int
        Number of features per time step.
    hidden_size : int
        Size of the LSTM hidden state.
    num_layers : int
        Number of stacked LSTM layers.
    seq_length : int, optional
        Stored for reference only; forward() accepts any sequence length.
    """

    def __init__(self, num_classes, input_size, hidden_size, num_layers, seq_length=None):
        super().__init__()

        self.num_classes = num_classes
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Taken as an argument rather than read from an undefined global name.
        self.seq_length = seq_length

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)

        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map x of shape (batch, seq, input_size) to (batch, num_classes)."""
        # nn.LSTM starts from zero hidden/cell states when none are given,
        # created on the input's own device, so no global `device` is needed.
        _, (h_n, _) = self.lstm(x)
        # h_n is (num_layers, batch, hidden). Take the last layer only, so the
        # batch dimension stays intact when num_layers > 1.
        return self.fc(h_n[-1])

In [155]:
# Training hyper-parameters.
num_epochs = 1
num_layers = 1
learning_rate = 0.01
hidden_size = 256
batch_size = 256

# Names the model cells use but that were never assigned in this notebook.
input_size = 1    # one feature per time step; the loaded model prints LSTM(1, 256, ...)
num_classes = 1   # single output; the loaded model prints fc out_features=1
seq_length = 12   # look-back window length used to build every input sequence

### Load Model

* 2010년 이전의 데이터(1975-01 ~ 2009-12)로 학습된 모델
* Model trained with data prior to 2010 (1975-01 to 2009-12)

In [417]:
lstm = LSTM(num_classes, input_size, hidden_size, num_layers).to(device)
# Other checkpoints tried: 'lstm_model_epoch300' and 'lstm_model_epoch100'; epoch 150 gave the best result.
# map_location remaps the saved tensors onto `device`, so a checkpoint written
# on a GPU machine also loads on a CPU-only one.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
lstm.load_state_dict(torch.load('lstm_model_epoch150', map_location=device))
lstm.eval()

LSTM(
  (lstm): LSTM(1, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

```python
mlp = Net().to(device)
# map_location remaps the saved tensors onto `device`, so a checkpoint written
# on a GPU machine also loads on a CPU-only one.
mlp.load_state_dict(torch.load('mlp_model_epoch300', map_location=device))
mlp.eval()
```

In [418]:
def predict_model_return(model, returns_12m):
    """
    Estimate a stock's future return with a trained model.

    The 'RET' column is compounded into a wealth series, the model predicts
    the next wealth level from it, and that prediction is converted back to
    a return relative to the last observed wealth.

    Parameters
    ----------
    model : torch.nn.Module
        Trained model taking a (1, seq, 1) float tensor and returning a
        single-element tensor.
    returns_12m : pandas.DataFrame
        Look-back window with a 'RET' column of simple returns.

    Returns
    -------
    float
        (predicted wealth - last wealth) / last wealth.
    """
    wealth = (returns_12m['RET'].values + 1).cumprod()  # change to wealth series

    # Put the input on the model's own device instead of relying on a global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    x = torch.as_tensor(wealth, dtype=torch.float32, device=target_device)
    x = x.reshape(1, len(wealth), 1)

    # Pure inference: skip building the autograd graph for every call.
    with torch.no_grad():
        predicted_wealth = float(model(x))

    last_wealth = float(x[0, -1, 0])
    return (predicted_wealth - last_wealth) / last_wealth

### Check LSTM Model Performance

In [None]:
test_lstm_returns = {}

# NOTE: this range covers every date with a full 12-month look-back window,
# which includes the months the model was trained on (calendar indices 12-419).
# Start the range at 420 to evaluate on the post-2009 test period only.
for t_months in range(12, len(calendar)):
    dt = calendar.iloc[t_months]       # portfolio date (index 12 is 1976-01-30)
    st = calendar.iloc[t_months - 12]  # first date of the look-back window
    et = calendar.iloc[t_months - 1]   # last date of the look-back window

    # Past one year of information as of dt.
    perm_list, past_returns, current_markcap = filter_perm(st, et)

    # Predicted return of every stock from its own look-back returns.
    # (The abandoned MLP can be scored the same way by passing `mlp` instead of `lstm`.)
    predictions = []
    for perm in perm_list:
        returns_12m = filtr(past_returns, col='PERMNO', isin=[perm])
        predictions.append(predict_model_return(lstm, returns_12m))

    # Explicit float dtype keeps the Series numeric even when perm_list is empty.
    predicted_lstm_return = pd.Series(predictions, index=perm_list, dtype=float)
    predicted_lstm_return = predicted_lstm_return.sort_values(ascending=False)

    # Returns of decile portfolios Q1..Q10 formed on the predicted return.
    test_lstm_returns[dt] = np.array(
        [calc_decile_return(predicted_lstm_return, decile) for decile in range(1, 11)],
        dtype=float,
    )

In [421]:
# One row per date, one column per decile portfolio (Q1..Q10).
lstm_returns = pd.DataFrame.from_dict(test_lstm_returns, orient='index')
lstm_returns.columns = ['Q%d' % decile for decile in range(1, 11)]

In [422]:
# Save the LSTM decile-return table to disk as a pickle.
lstm_returns.to_pickle('lstm_return_epoch150.pkl')