[![GitHub](https://img.shields.io/badge/Github-hibana2077-blue?style=plastic-square&logo=github)](https://github.com/hibana2077)
[![Colab](https://img.shields.io/badge/Colab-Open%20in%20Colab-blue?style=plastic-square&logo=googlecolab)](https://colab.research.google.com/github/hibana2077/hibana2077/blob/master/train/train.ipynb)

如果要訓練這份資料集會需要安裝talib套件，請參考[這裡](https://www.lfd.uci.edu/~gohlke/pythonlibs/#ta-lib)下載對應的版本，並使用pip安裝。

In [None]:
!pip install -U pytorch

In [1]:
from ccxt import binance
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch import optim
from torch.nn import functional as F
from talib import abstract
import pandas as pd
import numpy as np
import torch
import os
import sys

# 環境整理

In [2]:
ls_dir = os.listdir(path="..")
if "data" not in ls_dir:
    os.mkdir(path="../data")
if "model" not in ls_dir:
    os.mkdir(path="../model")
if "data" in ls_dir:
    ls_dir = os.listdir(path="../data")
    #remove all files in data folder
    for file in ls_dir:os.remove(path="../data/"+file)

# 下載資料集

In [3]:
# Binance BTC/USDT 1h candles from 2020-01-01 to 2021-01-01

binance = binance()
symbol = 'BTC/USDT'
timeframe = '1h'
file_name = f"../data/{symbol.replace('/', '_')}_{timeframe}.csv"
start = binance.parse8601('2020-01-01T00:00:00Z')
end = binance.parse8601('2022-01-01T00:00:00Z')
cnt_time = start
data = []
while cnt_time < end:
    ohlcv = binance.fetch_ohlcv(symbol, timeframe, cnt_time)
    data += ohlcv
    cnt_time = ohlcv[-1][0] + 3600000 # 1h in ms    
df = pd.DataFrame(data, columns=['time', 'open', 'high', 'low', 'close', 'volume'])
df['time'] = pd.to_datetime(df['time'], unit='ms')
df.to_csv(file_name, index=False)

# 讀取資料集

In [2]:
#如果有下载好的數據，可以直接讀取
data_file = '../data/btc_usdt_1h.csv' #-> 可以自行更換
df = pd.read_csv(data_file)

# 數據處理

- 計算RSI
- 計算MACD
- 計算OBV
- 計算CCI
- 改成變化百分比 -> 標準化

關於技術指標的說明可以參考[這裡](https://www.investopedia.com/terms/t/technicalindicator.asp)，或是google。

In [4]:
df['RSI'] = abstract.RSI(df, timeperiod=14)
df['MACD'] = abstract.MACD(df, fastperiod=12, slowperiod=26, signalperiod=9)['macd'] #只取MACD
df['OBV'] = abstract.OBV(df, timeperiod=14)
df['CCI'] = abstract.CCI(df, timeperiod=14)
df['OPEN_percent'] = df['open'].pct_change()
df['CLOSE_percent'] = df['close'].pct_change()
df['HIGH_percent'] = df['high'].pct_change()
df['LOW_percent'] = df['low'].pct_change()
df['VOLUME_percent'] = df['volume'].pct_change()
#由於RSI MACD OBV CCI 他們已經是標準化的，所以不需要再標準化

# 設定買賣點

將買賣點分為下跌、上漲、不動，並將數據轉成one-hot編碼。

In [5]:
df['UP'] = df['CLOSE_percent'].apply(lambda x: 1 if x > 0 else 0)
df['DOWN'] = df['CLOSE_percent'].apply(lambda x: 1 if x < 0 else 0)
df['UP'] = df['UP'].shift(-1) #shift UP DOWN 一個單位，因為我們要預測的是下一個時間點的漲跌
df['DOWN'] = df['DOWN'].shift(-1)

df = df.dropna()


In [3]:
df

Unnamed: 0,time,open,high,low,close,volume,RSI,MACD,OBV,CCI,OPEN_percent,CLOSE_percent,HIGH_percent,LOW_percent,VOLUME_percent,UP,DOWN
0,2020-01-02 09:00:00,7153.57,7165.00,7135.36,7162.01,840.001328,46.653946,-18.772873,-1085.293960,-35.190579,0.005989,0.001069,0.001051,0.003692,-0.246517,0.0,1.0
1,2020-01-02 10:00:00,7162.18,7180.00,7153.33,7161.83,1446.219984,46.612835,-17.585034,-2531.513944,-4.322773,0.001204,-0.000025,0.002094,0.002518,0.721688,0.0,1.0
2,2020-01-02 11:00:00,7161.89,7168.67,7139.03,7139.79,761.156570,41.760421,-18.212168,-3292.670514,-32.074422,-0.000040,-0.003077,-0.001578,-0.001999,-0.473692,1.0,0.0
3,2020-01-02 12:00:00,7139.73,7163.40,7139.03,7158.29,794.030497,46.769498,-17.020182,-2498.640017,-11.448699,-0.003094,0.002591,-0.000735,0.000000,0.043189,0.0,1.0
4,2020-01-02 13:00:00,7158.86,7163.35,7107.43,7131.15,1566.280693,41.174606,-18.057343,-4064.920710,-69.799501,0.002679,-0.003791,-0.000007,-0.004426,0.972570,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17961,2022-01-21 01:00:00,40894.81,40929.76,39730.00,39762.99,4172.351850,25.308075,-334.514225,697285.662621,-152.190082,0.005258,-0.027676,-0.004142,-0.016682,0.284850,1.0,0.0
17962,2022-01-21 02:00:00,39763.00,40280.00,39267.86,39830.52,4354.458167,26.469820,-445.353256,701640.120788,-137.983834,-0.027676,0.001698,-0.015875,-0.011632,0.043646,0.0,1.0
17963,2022-01-21 03:00:00,39835.99,39944.45,38220.00,38465.65,9508.823455,19.775068,-635.996220,692131.297333,-150.716081,0.001836,-0.034267,-0.008330,-0.026685,1.183698,1.0,0.0
17964,2022-01-21 04:00:00,38465.65,39070.00,38437.15,38842.03,3091.420650,25.379860,-748.087930,695222.717983,-126.902770,-0.034400,0.009785,-0.021892,0.005682,-0.674889,1.0,0.0


In [4]:
df['UP'].value_counts()

1.0    9186
0.0    8780
Name: UP, dtype: int64

In [5]:
df['DOWN'].value_counts()

0.0    9189
1.0    8777
Name: DOWN, dtype: int64

看起來數據蠻平衡的

# 儲存資料

In [9]:
df.to_csv(file_name, index=False)

# 分割成X、y

In [6]:
'''
Author: hibana2077 hibana2077@gmail.com
Date: 2022-12-23 16:28:55
LastEditors: hibana2077 hibana2077@gmail.com
LastEditTime: 2022-12-29 12:33:45
FilePath: \OOP-independent-study\train\train.ipynb
Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
'''
X,y = list(),list()
ref_bar = 10

for i in range(len(df)-ref_bar):
    #df.iloc.values 會回傳一個numpy array
    X.append(df.iloc[i:i+ref_bar, 6:15].values) # i to i+ref_bar-1
    y.append(df.iloc[i+ref_bar-1, 15:].values) # i+ref_bar-1



In [7]:
X[0]#這裡會出現10個array，每個array裡面有9個數字，分別是RSI,MACD,OBV,CCI,OPEN_percent,CLOSE_percent,HIGH_percent,LOW_percent,VOLUME_percent

array([[ 4.66539459e+01, -1.87728730e+01, -1.08529396e+03,
        -3.51905789e+01,  5.98932918e-03,  1.06927803e-03,
         1.05064911e-03,  3.69244533e-03, -2.46517407e-01],
       [ 4.66128349e+01, -1.75850337e+01, -2.53151394e+03,
        -4.32277334e+00,  1.20359485e-03, -2.51326094e-05,
         2.09351012e-03,  2.51844336e-03,  7.21687735e-01],
       [ 4.17604212e+01, -1.82121677e+01, -3.29267051e+03,
        -3.20744220e+01, -4.04904652e-05, -3.07742574e-03,
        -1.57799443e-03, -1.99906897e-03, -4.73692399e-01],
       [ 4.67694980e+01, -1.70201825e+01, -2.49864002e+03,
        -1.14486985e+01, -3.09415531e-03,  2.59111262e-03,
        -7.35143339e-04,  0.00000000e+00,  4.31894413e-02],
       [ 4.11746059e+01, -1.80573430e+01, -4.06492071e+03,
        -6.97995011e+01,  2.67937303e-03, -3.79140828e-03,
        -6.97992573e-06, -4.42637165e-03,  9.72569944e-01],
       [ 4.23888154e+01, -1.83099634e+01, -3.10861979e+03,
        -4.61201187e+01, -3.86653741e-03,  6.226204

In [8]:
y[0]

array([1.0, 0.0], dtype=object)

In [9]:
X = np.array(X)
X = torch.tensor(X, dtype=torch.float32)

In [10]:
y = np.array(y,dtype=np.float32)
y = torch.tensor(y, dtype=torch.float32)
y = y.view(y.shape[0],1,2)

In [11]:
y

tensor([[[1., 0.]],

        [[0., 1.]],

        [[1., 0.]],

        ...,

        [[0., 1.]],

        [[1., 0.]],

        [[1., 0.]]])

In [12]:
print(f"X shape: {X.shape} , y shape: {y.shape}")

X shape: torch.Size([17956, 10, 9]) , y shape: torch.Size([17956, 1, 2])


In [13]:
X[0]

tensor([[ 4.6654e+01, -1.8773e+01, -1.0853e+03, -3.5191e+01,  5.9893e-03,
          1.0693e-03,  1.0506e-03,  3.6924e-03, -2.4652e-01],
        [ 4.6613e+01, -1.7585e+01, -2.5315e+03, -4.3228e+00,  1.2036e-03,
         -2.5133e-05,  2.0935e-03,  2.5184e-03,  7.2169e-01],
        [ 4.1760e+01, -1.8212e+01, -3.2927e+03, -3.2074e+01, -4.0490e-05,
         -3.0774e-03, -1.5780e-03, -1.9991e-03, -4.7369e-01],
        [ 4.6769e+01, -1.7020e+01, -2.4986e+03, -1.1449e+01, -3.0942e-03,
          2.5911e-03, -7.3514e-04,  0.0000e+00,  4.3189e-02],
        [ 4.1175e+01, -1.8057e+01, -4.0649e+03, -6.9799e+01,  2.6794e-03,
         -3.7914e-03, -6.9799e-06, -4.4264e-03,  9.7257e-01],
        [ 4.2389e+01, -1.8310e+01, -3.1086e+03, -4.6120e+01, -3.8665e-03,
          6.2262e-04, -8.7250e-04,  2.0978e-03, -3.8944e-01],
        [ 4.1433e+01, -1.8667e+01, -3.9710e+03, -5.9638e+01,  6.1841e-04,
         -6.4606e-04, -7.1258e-04, -3.2854e-04, -9.8161e-02],
        [ 2.9291e+01, -2.5009e+01, -7.5755e+03, 

In [14]:
y[0]

tensor([[1., 0.]])

# 建立資料集類別

- 要繼承torch.utils.data.Dataset
- 要實作`__len__`、`__getitem__`
- 後面要用DataLoader取用

In [15]:
#用sklearn來分成train val test
# train:val:test = 6:3:1
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.75, random_state=42)

In [17]:
print(f"X_train shape: {X_train.shape} , y_train shape: {y_train.shape}")

X_train shape: torch.Size([10773, 10, 9]) , y_train shape: torch.Size([10773, 1, 2])


In [18]:
print(f"X_test shape: {X_test.shape} , y_test shape: {y_test.shape}")

X_test shape: torch.Size([5388, 10, 9]) , y_test shape: torch.Size([5388, 1, 2])


In [19]:
print(f"X_val shape: {X_val.shape} , y_val shape: {y_val.shape}")

X_val shape: torch.Size([1795, 10, 9]) , y_val shape: torch.Size([1795, 1, 2])


In [20]:
class TrainDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [21]:
class ValDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [22]:
class TestDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [23]:
train_dataset = TrainDataset(X_train, y_train)
val_dataset = ValDataset(X_val, y_val)
test_dataset = TestDataset(X_test, y_test)

In [24]:
train_dataset[0]

(tensor([[ 2.6485e+01, -1.6131e+02,  5.9201e+04, -9.2721e+01,  1.1383e-04,
          -1.5751e-02, -5.2585e-03, -1.8805e-02,  2.6932e+00],
         [ 3.0662e+01, -1.6142e+02,  6.5494e+04, -1.0451e+02, -1.5854e-02,
           5.4725e-03, -8.8063e-03, -9.8619e-03,  2.4279e-01],
         [ 3.1172e+01, -1.5920e+02,  6.8492e+04, -8.2316e+01,  5.4911e-03,
           6.5906e-04,  3.3058e-03,  1.3949e-02, -5.2356e-01],
         [ 3.4758e+01, -1.5249e+02,  7.0689e+04, -6.9617e+01,  5.6474e-04,
           4.5702e-03,  2.4989e-03, -1.1561e-03, -2.6727e-01],
         [ 3.3365e+01, -1.4784e+02,  6.8939e+04, -6.4270e+01,  4.6649e-03,
          -3.3857e-03, -1.2640e-03,  3.1921e-03, -2.0327e-01],
         [ 3.8114e+01, -1.3833e+02,  7.1194e+04, -5.3180e+00, -3.3903e-03,
           6.0401e-03,  3.4302e-03,  3.6942e-03,  2.8836e-01],
         [ 4.1996e+01, -1.2563e+02,  7.4509e+04,  6.5914e+01,  6.1618e-03,
           5.2347e-03,  6.0992e-03,  5.7934e-03,  4.7046e-01],
         [ 4.2460e+01, -1.1382e+02

In [25]:
test_dataset[0]

(tensor([[ 5.8582e+01,  1.1959e+01,  1.7403e+05,  1.2495e+02,  2.3374e-03,
           1.8267e-04, -5.4229e-04,  4.1859e-04, -3.5869e-01],
         [ 6.3649e+01,  1.6000e+01,  1.7615e+05,  1.6399e+02,  1.8372e-04,
           3.4994e-03,  3.7772e-03,  7.5628e-04,  7.1435e-01],
         [ 5.8219e+01,  1.7091e+01,  1.7406e+05,  1.4761e+02,  3.5025e-03,
          -2.4690e-03,  1.7672e-03,  2.1793e-03, -1.4666e-02],
         [ 4.7602e+01,  1.3153e+01,  1.7200e+05, -4.7687e+00, -2.4721e-03,
          -6.0084e-03, -4.5180e-03, -6.8919e-03, -8.9839e-03],
         [ 4.8285e+01,  1.0227e+01,  1.7353e+05, -5.5034e+01, -6.0084e-03,
           4.0697e-04, -4.2758e-03, -9.4518e-04, -2.5954e-01],
         [ 4.8247e+01,  7.8008e+00,  1.7251e+05, -3.7415e+01,  4.3843e-04,
          -2.3066e-05,  1.9157e-04,  1.6272e-03, -3.3478e-01],
         [ 5.3201e+01,  7.9776e+00,  1.7349e+05, -1.3241e+01, -5.4518e-05,
           2.8477e-03,  1.6464e-03, -1.6246e-03, -3.4308e-02],
         [ 5.4422e+01,  8.5884e+00

In [26]:
val_dataset[0]

(tensor([[ 5.1127e+01, -1.1306e+01,  1.4610e+05,  1.1628e+02,  5.5154e-03,
          -2.1892e-03, -1.4525e-04,  6.5048e-03, -3.8472e-02],
         [ 5.4208e+01, -6.8719e+00,  1.4772e+05,  1.3500e+02, -2.0385e-03,
           2.6357e-03,  3.1875e-04, -1.2827e-03, -1.8354e-01],
         [ 5.3234e+01, -3.8597e+00,  1.4493e+05,  1.5453e+02,  2.5641e-03,
          -7.0847e-04,  4.9459e-03,  2.8703e-03,  7.2001e-01],
         [ 5.1449e+01, -2.4242e+00,  1.4352e+05,  8.2549e+01, -7.4507e-04,
          -1.2713e-03, -5.5099e-03, -1.6517e-03, -4.9351e-01],
         [ 5.1561e+01, -1.2097e+00,  1.4503e+05,  7.0754e+01, -1.3289e-03,
           8.1788e-05, -5.2268e-04,  3.1912e-04,  6.8788e-02],
         [ 5.6285e+01,  2.4527e+00,  1.4686e+05,  9.7540e+01,  1.3947e-04,
           3.5449e-03,  2.4443e-03,  7.7761e-04,  2.1341e-01],
         [ 5.1317e+01,  2.8077e+00,  1.4526e+05,  6.2449e+01,  3.5145e-03,
          -3.2566e-03, -2.4832e-04, -6.9731e-04, -1.2636e-01],
         [ 5.1451e+01,  3.1264e+00

# 建立模型

- 要繼承torch.nn.Module
- 可能要多建立不同的模型，到時候看結果再調整 -> 先讓數據流得通，再去看成績做調整。

- ver1 -> CNN+MLP
- ver2 -> CNN+LSTM+MLP
- ver3 -> CNN+GRU+MLP
- ver4 -> CNN+LSTM+GRU+MLP

In [71]:
class SelectItem(torch.nn.Module):#這是用來取出多個輸出其中一個的輸出，如果不用sequential的話，就可以不用這個
    def __init__(self, item_index):
        super(SelectItem, self).__init__()
        self._name = 'selectitem'
        self.item_index = item_index

    def forward(self, inputs):
        return inputs[self.item_index]

In [None]:
class crypto_classfier_ver1(nn.Module): #CNN+MLP
    def __init__(self):
        super(crypto_classfier_ver1, self).__init__()
        self.net = nn.Sequential(
            torch.nn.Conv1d(10, 20, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(1, stride=8),
            torch.nn.Conv1d(20, 40, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(1, stride=1),
            torch.nn.Conv1d(40, 1, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.Linear(2,16),
            torch.nn.ReLU(),
            torch.nn.Linear(16,2),
            torch.nn.Softmax(dim=1))
    def forward(self, x):
        x = self.net(x)
        return x

In [None]:
class crypto_classfier_ver2(nn.Module): #CNN+LSTM+MLP
    def __init__(self):
        super(crypto_classfier_ver2, self).__init__()
        self.net = nn.Sequential(
            torch.nn.Conv1d(10, 20, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(1, stride=8),
            torch.nn.Conv1d(20, 40, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(1, stride=1),
            torch.nn.Conv1d(40, 1, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.LSTM(2, 16, 25),
            SelectItem(0),
            torch.nn.Linear(16,2),
            torch.nn.Softmax(dim=1))
    def forward(self, x):
        x = self.net(x)
        return x

In [None]:
class crypto_classfier_ver3(nn.Module):#CNN+GRU+MLP
    def __init__(self):
        super(crypto_classfier_ver3, self).__init__()
        self.net = nn.Sequential(
            torch.nn.Conv1d(10, 20, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(1, stride=1),
            torch.nn.Conv1d(20, 40, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(1, stride=1),
            torch.nn.Conv1d(40, 1, 3, stride=1, padding=1),
            torch.nn.Linear(9,64),
            torch.nn.Linear(64,128),
            torch.nn.GRU(128, 64, 25),
            SelectItem(0),
            torch.nn.Linear(64,2),
            torch.nn.Softmax(dim=1))
    def forward(self, x):
        x = self.net(x)
        return x

In [69]:
lab_tensor = train_dataset[0][0]
lab_tensor.shape

torch.Size([10, 9])

In [58]:
from torchsummary import summary

In [85]:
lab_models = torch.nn.Sequential(
            torch.nn.Conv1d(10, 20, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(1, stride=1),
            torch.nn.Conv1d(20, 40, 3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(1, stride=1),
            torch.nn.Conv1d(40, 1, 3, stride=1, padding=1),
            torch.nn.Linear(9,64),
            torch.nn.Linear(64,128),
            torch.nn.GRU(128, 64, 25),
            SelectItem(0),
            torch.nn.Linear(64,2),
            torch.nn.Softmax(dim=1))
out = lab_models(lab_tensor)
print(out)
print(out.shape)
# summary(lab_models, (10, 9))

tensor([[0.5201, 0.4799]], grad_fn=<SoftmaxBackward0>)
torch.Size([1, 2])


## Note

1) model
    - `__init__`: define [layers](https://pytorch.org/docs/stable/nn.html)
    - forward: forward pass -> compute prediction
2) loss and optimizer
    - lr: learning rate [default=0.001]
    - momentum: momentum for optimizer [default=0.9]
    - criterion: loss function [in torch.nn] eg.nn.BCELoss()
    - optimizer: optimizer [in torch.optim] eg.torch.optim.SGD()
        - eg. optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)
3) training loop
    - forward pass: compute prediction and loss

        ```python
        output = model(data)
        loss = criterion(output, target)
        ```
        
    - backward pass: loss.backward()
    - update weights: optimizer.step()
    - zero the gradients: optimizer.zero_grad()

# TODO

- 寫訓練方法
    - using dataloader
        - batch and epoch
- 寫驗證方法
    - using model.eval()
