[![GitHub](https://img.shields.io/badge/Github-hibana2077-blue?style=plastic-square&logo=github)](https://github.com/hibana2077)
[![Colab](https://img.shields.io/badge/Colab-Open%20in%20Colab-blue?style=plastic-square&logo=googlecolab)](https://colab.research.google.com/github/hibana2077/hibana2077/blob/master/train/train.ipynb)

如果要訓練這份資料集會需要安裝talib套件，請參考[這裡](https://www.lfd.uci.edu/~gohlke/pythonlibs/#ta-lib)下載對應的版本，並使用pip安裝。

In [None]:
!pip install -U pytorch

In [1]:
from ccxt import binance
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch import optim
from torch.nn import functional as F
from talib import abstract
import pandas as pd
import numpy as np
import torch
import os
import sys

# 環境整理

In [2]:
ls_dir = os.listdir(path="..")
if "data" not in ls_dir:
    os.mkdir(path="../data")
if "model" not in ls_dir:
    os.mkdir(path="../model")
if "data" in ls_dir:
    ls_dir = os.listdir(path="../data")
    #remove all files in data folder
    for file in ls_dir:os.remove(path="../data/"+file)

# 下載資料集

In [3]:
# Binance BTC/USDT 1h candles from 2020-01-01 to 2021-01-01

binance = binance()
symbol = 'BTC/USDT'
timeframe = '1h'
start = binance.parse8601('2020-01-01T00:00:00Z')
end = binance.parse8601('2022-01-01T00:00:00Z')
cnt_time = start
data = []
while cnt_time < end:
    ohlcv = binance.fetch_ohlcv(symbol, timeframe, cnt_time)
    data += ohlcv
    cnt_time = ohlcv[-1][0] + 3600000 # 1h in ms    
df = pd.DataFrame(data, columns=['time', 'open', 'high', 'low', 'close', 'volume'])
df['time'] = pd.to_datetime(df['time'], unit='ms')
df.to_csv('../data/btc_usdt_1h.csv', index=False)

# 讀取資料集

In [4]:
#如果有下载好的數據，可以直接讀取
data_file = '../data/btc_usdt_1h.csv'
df = pd.read_csv(data_file)

# 數據處理

- 計算RSI
- 計算MACD
- 計算OBV
- 計算CCI
- 改成變化百分比 -> 標準化

關於技術指標的說明可以參考[這裡](https://www.investopedia.com/terms/t/technicalindicator.asp)，或是google。

In [5]:
df['RSI'] = abstract.RSI(df, timeperiod=14)
df['MACD'] = abstract.MACD(df, fastperiod=12, slowperiod=26, signalperiod=9)['macd'] #只取MACD
df['OBV'] = abstract.OBV(df, timeperiod=14)
df['CCI'] = abstract.CCI(df, timeperiod=14)
df['OPEN_percent'] = df['open'].pct_change()
df['CLOSE_percent'] = df['close'].pct_change()
df['HIGH_percent'] = df['high'].pct_change()
df['LOW_percent'] = df['low'].pct_change()
df['VOLUME_percent'] = df['volume'].pct_change()
#由於RSI MACD OBV CCI 他們已經是標準化的，所以不需要再標準化

# 設定買賣點

將買賣點分為下跌、上漲、不動，並將數據轉成one-hot編碼。

In [6]:
df['UP'] = df['CLOSE_percent'].apply(lambda x: 1 if x > 0 else 0)
df['DOWN'] = df['CLOSE_percent'].apply(lambda x: 1 if x < 0 else 0)
df['UP'] = df['UP'].shift(-1) #shift UP DOWN 一個單位，因為我們要預測的是下一個時間點的漲跌
df['DOWN'] = df['DOWN'].shift(-1)

df = df.dropna()

df

Unnamed: 0,time,open,high,low,close,volume,RSI,MACD,OBV,CCI,OPEN_percent,CLOSE_percent,HIGH_percent,LOW_percent,VOLUME_percent,UP,DOWN
33,2020-01-02 09:00:00,7153.57,7165.00,7135.36,7162.01,840.001328,46.653946,-18.772873,-1085.293960,-35.190579,0.005989,0.001069,0.001051,0.003692,-0.246517,0.0,1.0
34,2020-01-02 10:00:00,7162.18,7180.00,7153.33,7161.83,1446.219984,46.612835,-17.585034,-2531.513944,-4.322773,0.001204,-0.000025,0.002094,0.002518,0.721688,0.0,1.0
35,2020-01-02 11:00:00,7161.89,7168.67,7139.03,7139.79,761.156570,41.760421,-18.212168,-3292.670514,-32.074422,-0.000040,-0.003077,-0.001578,-0.001999,-0.473692,1.0,0.0
36,2020-01-02 12:00:00,7139.73,7163.40,7139.03,7158.29,794.030497,46.769498,-17.020182,-2498.640017,-11.448699,-0.003094,0.002591,-0.000735,0.000000,0.043189,0.0,1.0
37,2020-01-02 13:00:00,7158.86,7163.35,7107.43,7131.15,1566.280693,41.174606,-18.057343,-4064.920710,-69.799501,0.002679,-0.003791,-0.000007,-0.004426,0.972570,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17994,2022-01-21 01:00:00,40894.81,40929.76,39730.00,39762.99,4172.351850,25.308075,-334.514225,697285.662621,-152.190082,0.005258,-0.027676,-0.004142,-0.016682,0.284850,1.0,0.0
17995,2022-01-21 02:00:00,39763.00,40280.00,39267.86,39830.52,4354.458167,26.469820,-445.353256,701640.120788,-137.983834,-0.027676,0.001698,-0.015875,-0.011632,0.043646,0.0,1.0
17996,2022-01-21 03:00:00,39835.99,39944.45,38220.00,38465.65,9508.823455,19.775068,-635.996220,692131.297333,-150.716081,0.001836,-0.034267,-0.008330,-0.026685,1.183698,1.0,0.0
17997,2022-01-21 04:00:00,38465.65,39070.00,38437.15,38842.03,3091.420650,25.379860,-748.087930,695222.717983,-126.902770,-0.034400,0.009785,-0.021892,0.005682,-0.674889,1.0,0.0


# 儲存資料

In [7]:
df.to_csv('../data/btc_usdt_1h.csv', index=False)

# 分割成X、y

In [8]:
X,y = list(),list()
ref_bar = 10

for i in range(len(df)-ref_bar):
    X.append(df.iloc[i:i+ref_bar, 1:11].values) # i to i+ref_bar-1
    y.append(df.iloc[i+ref_bar-1, 11:13].values) # i+ref_bar-1

X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

  X = torch.tensor(X, dtype=torch.float32)


In [9]:
print(f"X shape: {X.shape} , y shape: {y.shape}")

X shape: torch.Size([17956, 10, 10]) , y shape: torch.Size([17956, 2])


In [10]:
X[0]

tensor([[ 7.1536e+03,  7.1650e+03,  7.1354e+03,  7.1620e+03,  8.4000e+02,
          4.6654e+01, -1.8773e+01, -1.0853e+03, -3.5191e+01,  5.9893e-03],
        [ 7.1622e+03,  7.1800e+03,  7.1533e+03,  7.1618e+03,  1.4462e+03,
          4.6613e+01, -1.7585e+01, -2.5315e+03, -4.3228e+00,  1.2036e-03],
        [ 7.1619e+03,  7.1687e+03,  7.1390e+03,  7.1398e+03,  7.6116e+02,
          4.1760e+01, -1.8212e+01, -3.2927e+03, -3.2074e+01, -4.0490e-05],
        [ 7.1397e+03,  7.1634e+03,  7.1390e+03,  7.1583e+03,  7.9403e+02,
          4.6769e+01, -1.7020e+01, -2.4986e+03, -1.1449e+01, -3.0942e-03],
        [ 7.1589e+03,  7.1634e+03,  7.1074e+03,  7.1311e+03,  1.5663e+03,
          4.1175e+01, -1.8057e+01, -4.0649e+03, -6.9799e+01,  2.6794e-03],
        [ 7.1312e+03,  7.1571e+03,  7.1223e+03,  7.1356e+03,  9.5630e+02,
          4.2389e+01, -1.8310e+01, -3.1086e+03, -4.6120e+01, -3.8665e-03],
        [ 7.1356e+03,  7.1520e+03,  7.1200e+03,  7.1310e+03,  8.6243e+02,
          4.1433e+01, -1.8667e+0

# 建立資料集類別

- 要繼承torch.utils.data.Dataset

In [None]:
class MyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# 建立模型

- 要繼承torch.nn.Module

In [None]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        #input_size => input feature size
        #hidden_size => hidden state size (number of neurons)
        #num_layers => number of layers
        #batch_first => True: (batch, seq, feature) False: (seq, batch, feature)
        self.fc = nn.Linear(hidden_size, num_classes) # fc = fully connected
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    def forward(self, x):# x 是
        if self.device == torch.device('cuda'):
            h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).cuda()
            c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).cuda()
        else:
            h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
            c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out


## Note

1) model
    - `__init__`: define [layers](https://pytorch.org/docs/stable/nn.html)
    - forward: forward pass -> compute prediction
2) loss and optimizer
    - lr: learning rate [default=0.001]
    - momentum: momentum for optimizer [default=0.9]
    - criterion: loss function [in torch.nn] eg.nn.BCELoss()
    - optimizer: optimizer [in torch.optim] eg.torch.optim.SGD()
        - eg. optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)
3) training loop
    - forward pass: compute prediction and loss

        ```python
        output = model(data)
        loss = criterion(output, target)
        ```
        
    - backward pass: loss.backward()
    - update weights: optimizer.step()
    - zero the gradients: optimizer.zero_grad()

# TODO

- 資料集切割
    - train, valid, test
- 寫訓練方法
    - using dataloader
        - batch and epoch
- 寫驗證方法
    - using model.eval()


In [None]:
def train(model,opt,loss,x,y,eps):
    for Epoch in range(eps):
        y_pred = model(x)
        loss_value = loss(y_pred, y)
        opt.zero_grad()
        loss_value.backward()
        opt.step()
        if Epoch % 100 == 0:
            print('Epoch: {} | Loss: {:.4f}'.format(Epoch, loss_value.item()))

In [None]:
model = LSTMClassifier(input_size=10, hidden_size=64, num_layers=2, num_classes=2)