In [1]:
import sys
import joblib
import tensorflow as tf
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

# Report the interpreter and TensorFlow versions this notebook was run with.
for label, version in (("Python", sys.version), ("TensorFlow", tf.__version__)):
    print(f"{label} version: {version}")

Python version: 3.10.11 (tags/v3.10.11:7d4cc5a, Apr  5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]
TensorFlow version: 2.12.0


- `train_test_split` 是一個用於將數據集分割為訓練集和測試集的函數。通常在機器學習模型訓練前會使用。
- `StandardScaler` 是一種數據預處理技術，用於將特徵標準化為平均值為0，標準差為1的數據集，這也被稱為z-score標準化。
- `confusion_matrix` 是一種特定格式的表格，用於視覺化一個分類模型的效能。矩陣的每一列（row）代表實際類別，每一行（column）代表預測類別。
- `accuracy_score` 函數用於計算分類模型的準確度，即模型正確預測的樣本數與總樣本數的比例。
- `precision_score` 函數用於計算分類模型的精度，即模型正確預測的正樣本數與模型預測為正的總樣本數的比例。
- `recall_score` 函數用於計算分類模型的召回率，即模型正確預測的正樣本數與實際正樣本數的比例。
- `f1_score` 函數用於計算分類模型的F1分數，這是精度和召回率的調和平均值。
- `roc_curve`、`auc` 與 `roc_auc_score` 這些函數用於計算和繪製ROC（Receiver Operating Characteristic）曲線，以及計算AUC（Area Under the Curve）值。ROC曲線是用於評估分類模型在所有可能的分類閾值下的效能，AUC值則是ROC曲線下的面積，用於衡量模型的整體性能。
- `classification_report` 函數生成一個包含主要分類指標（如精度、召回率、F1分數等）的報告。
- `precision_recall_curve` 函數用於計算精度-召回率曲線。這是一種工具，用於視覺化在不同閾值下模型的精度和召回率之間的權衡。

In [2]:
# Constants
# Path of the input CSV that the loading and plotting cells pass to pd.read_csv.
DATA_LOC = 'MEXC_ETHUSDTP1m_iso.csv'

In [3]:
# Load the CSV, clean it, and derive 0/1 bullish / bearish / neutral label columns.
# NOTE(review): the displayed head is newest-first after the reversal below; confirm
# this is the row order the downstream windowing and chronological split should see.
bullish_sources = ["Regular Bullish", "Hidden Bullish", "Regular Bullish Label", "Hidden Bullish Label"]
bearish_sources = ["Regular Bearish", "Hidden Bearish", "Regular Bearish Label", "Hidden Bearish Label"]

df = (
    pd.read_csv(DATA_LOC)
    .dropna(subset=['RSI'])  # discard rows that have no RSI value
    .fillna(0)               # every remaining NaN becomes 0
    .iloc[::-1]              # reverse the row order
    .set_index('time')
)

# A row is bullish (bearish) when any of its four bullish (bearish) source columns is positive;
# it is neutral when it is neither.
is_bullish = df[bullish_sources].gt(0).any(axis=1)
is_bearish = df[bearish_sources].gt(0).any(axis=1)

df = (
    df.assign(
        bullish=is_bullish.astype(int),
        bearish=is_bearish.astype(int),
        neutral=(~(is_bullish | is_bearish)).astype(int),
    )
    # the raw divergence columns are no longer needed once the labels exist
    .drop(columns=bullish_sources + bearish_sources)
)

df.head()

Unnamed: 0_level_0,open,high,low,close,RSI,bullish,bearish,neutral
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-06-22T07:36:00+08:00,1896.75,1897.23,1896.74,1897.0,57.717497,0,0,1
2023-06-22T07:35:00+08:00,1895.85,1896.75,1895.8,1896.74,56.783005,0,0,1
2023-06-22T07:34:00+08:00,1895.8,1895.86,1895.3,1895.86,53.557043,0,0,1
2023-06-22T07:33:00+08:00,1896.0,1896.0,1895.8,1895.81,53.373414,0,0,1
2023-06-22T07:32:00+08:00,1896.32,1896.32,1895.99,1895.99,54.088309,0,0,1


In [4]:
# Summarise the frame: shape, columns, dtypes, then per-label counts and fractions.
separator = "=" * 40
label_names = ('bullish', 'bearish', 'neutral')

print(f"Data shape: {df.shape}")
print(separator)
print(f"Data columns: {df.columns}")
print(separator)
print(f"Data types: {df.dtypes}")
print(separator)

# Label distribution
print(f"{'Ground Truth':=^40}")
label_totals = {name: df[name].sum() for name in label_names}
for name, total in label_totals.items():
    print(f"{name.capitalize()} count: {total}")

# The "percent" lines are fractions of all rows, printed with four decimals.
row_count = df.shape[0]
for name, total in label_totals.items():
    print(f"{name.capitalize()} percent: {total/row_count:.4f}")
print(separator)

Data shape: (24434, 8)
Data columns: Index(['open', 'high', 'low', 'close', 'RSI', 'bullish', 'bearish', 'neutral'], dtype='object')
Data types: open       float64
high       float64
low        float64
close      float64
RSI        float64
bullish      int32
bearish      int32
neutral      int32
dtype: object
Bullish count: 1695
Bearish count: 1715
Neutral count: 21024
Bullish percent: 0.0694
Bearish percent: 0.0702
Neutral percent: 0.8604


# 標準化

- Z-Score標準化：這種方法將數據轉換為均值為0，標準差為1的數據集。在Python中，可以使用`sklearn.preprocessing.StandardScaler`來實現。

- 最小最大標準化：這種方法將數據轉換到一個指定的範圍內，例如[0,1]。這對於神經網絡來說非常有用，因為它們通常喜歡在固定範圍內工作。在Python中，可以使用`sklearn.preprocessing.MinMaxScaler`來實現。

- Log轉換：對於高度偏態的數據，我們可以對其進行log轉換以使其更接近常態分佈。這在處理股價和其他金融數據時非常有用，因為它們的分佈往往具有厚尾（heavy tail）。可以使用`np.log`函數來實現。

- 差分化（Differencing）：在時間序列分析中，我們有時會使用差分來消除時間序列數據中的趨勢和季節性成分。這通常適用於股價數據，因為我們更關注價格變化而不是實際價格。

In [None]:
# Separate the feature columns from the three label columns.
LABEL_COLUMNS = ['bullish', 'bearish', 'neutral']
# Fraction of trailing rows reserved for testing; keep in sync with `test_size`
# in the train_test_split cell.
TEST_FRACTION = 0.2

features = df.drop(columns=LABEL_COLUMNS)
y = df[LABEL_COLUMNS]

# Fit the scaler on the leading (training) rows only, then apply it to every row.
# Fitting on the whole frame would leak the mean/variance of the held-out rows
# into the training features.
n_fit_rows = int(len(features) * (1 - TEST_FRACTION))
scaler = StandardScaler()
scaler.fit(features.iloc[:n_fit_rows])

# Column names come from the feature frame itself rather than assuming the labels
# are the last three columns of `df`.
X = pd.DataFrame(scaler.transform(features), columns=features.columns)
X.head()

In [5]:

def create_sequences(data, time_steps):
    """Stack every run of `time_steps` consecutive rows of `data` into one array.

    Window k is `data[k:k + time_steps].values`, so `data` must support row
    slicing and expose `.values` (e.g. a pandas DataFrame).

    Args:
        data: Row-sliceable table of features.
        time_steps: Number of consecutive rows per window.

    Returns:
        np.ndarray built from the `len(data) - time_steps + 1` windows, in order.
    """
    window_count = len(data) - time_steps + 1
    return np.array([data[start:start + time_steps].values for start in range(window_count)])
def create_y_sequences(data, time_steps):
    """Collect the target row that lines up with the last row of each window.

    For window k (rows k .. k + time_steps - 1) the target is
    `data.iloc[k + time_steps - 1].values`.

    Args:
        data: pandas object supporting positional `.iloc` row access.
        time_steps: Window length used for the matching feature sequences.

    Returns:
        np.ndarray with one target row per window, in window order.
    """
    last_row_positions = range(time_steps - 1, len(data))
    return np.array([data.iloc[pos].values for pos in last_row_positions])


# Convert the feature and label tables into fixed-length time-step sequences.
# The window length is defined once so features and labels cannot drift apart.
# NOTE(review): the stored output of this cell reports 15 time steps, so it predates
# the current value; re-run the cell to refresh it.
TIME_STEPS = 55

X = create_sequences(X, TIME_STEPS)
y = create_y_sequences(y, TIME_STEPS)
# Every feature window must have exactly one label row.
assert len(X) == len(y), "feature windows and label rows are misaligned"

print("{:=^40}".format("Shape"))
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print("{:=^40}".format(""))

X shape: (24420, 15, 5)
y shape: (24420, 3)


In [6]:
# Hold out 20% of the windows as the test set; shuffle=False keeps the existing
# row order, so the split is a contiguous head/tail cut rather than a random sample.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

# 模型建立

In [7]:
# Model: five stacked LSTM layers (128-256-512-256-128 units), each followed by
# dropout, ending in a 3-way softmax over the bullish / bearish / neutral labels.
window_shape = (X_train.shape[1], X_train.shape[2])
dropout_rate = 0.2

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.LSTM(128, input_shape=window_shape, return_sequences=True))
model.add(tf.keras.layers.Dropout(dropout_rate))
# Intermediate recurrent layers keep the full sequence for the next LSTM.
for units in (256, 512, 256):
    model.add(tf.keras.layers.LSTM(units, return_sequences=True))
    model.add(tf.keras.layers.Dropout(dropout_rate))
# The last recurrent layer returns only its final state for the classifier head.
model.add(tf.keras.layers.LSTM(128, return_sequences=False))
model.add(tf.keras.layers.Dropout(dropout_rate))
model.add(tf.keras.layers.Dense(3, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 15, 128)           68608     
                                                                 
 dropout (Dropout)           (None, 15, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 15, 256)           394240    
                                                                 
 dropout_1 (Dropout)         (None, 15, 256)           0         
                                                                 
 lstm_2 (LSTM)               (None, 15, 512)           1574912   
                                                                 
 dropout_2 (Dropout)         (None, 15, 512)           0         
                                                                 
 lstm_3 (LSTM)               (None, 15, 256)           7

In [8]:
# Train
# Inverse-frequency class weights, computed from the training labels only so the
# held-out test labels do not influence training. A class with no training samples
# gets a finite weight instead of a division by zero.
train_class_counts = y_train.sum(axis=0)
class_weights = {
    class_idx: float(y_train.shape[0] / max(int(count), 1))
    for class_idx, count in enumerate(train_class_counts)
}

# Stop once validation loss stops improving and keep the best weights, so the
# 100-epoch budget is an upper bound rather than something to interrupt by hand.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    class_weight=class_weights,
    callbacks=[early_stopping],
)

Epoch 1/100
 41/489 [=>............................] - ETA: 1:21 - loss: 3.3346 - accuracy: 0.3491

KeyboardInterrupt: 

In [None]:
# Plot training and validation loss against the epoch number.
fig = go.Figure()
for series_name in ('loss', 'val_loss'):
    fig.add_trace(go.Scatter(x=history.epoch, y=history.history[series_name], name=series_name))
fig.update_layout(title='Loss', xaxis_title='Epoch', yaxis_title='Loss')
fig.show()

In [None]:
# Plot: confusion matrix on the held-out test set
CLASS_NAMES = ['bullish', 'bearish', 'neutral']

# Keep the raw probabilities under their own name; y_pred holds class indices.
y_pred_proba = model.predict(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)

# Convert the one-hot targets to class indices only once, so re-running this cell
# does not fail on an already-converted 1-D y_test.
if np.ndim(y_test) > 1:
    y_test = np.argmax(y_test, axis=1)

# Passing `labels` explicitly guarantees a 3x3 matrix even when some class occurs
# in neither the targets nor the predictions.
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(CLASS_NAMES))))
cm = pd.DataFrame(cm, index=CLASS_NAMES, columns=CLASS_NAMES)

fig = px.imshow(cm, color_continuous_scale='Blues')
fig.update_layout(title='Confusion Matrix', xaxis_title='Predicted Class', yaxis_title='Actual Class')
fig.show()

In [None]:
# Plot: candlestick chart with the ground-truth bullish / bearish markers
bullish_sources = ["Regular Bullish", "Hidden Bullish", "Regular Bullish Label", "Hidden Bullish Label"]
bearish_sources = ["Regular Bearish", "Hidden Bearish", "Regular Bearish Label", "Hidden Bearish Label"]

# Re-read the CSV (rows are neither reversed nor filtered on RSI here) and rebuild the labels.
df = pd.read_csv(DATA_LOC).fillna(0).set_index('time')
is_bullish = df[bullish_sources].gt(0).any(axis=1)
is_bearish = df[bearish_sources].gt(0).any(axis=1)
df = (
    df.assign(
        bullish=is_bullish.astype(int),
        bearish=is_bearish.astype(int),
        neutral=(~(is_bullish | is_bearish)).astype(int),
    )
    .drop(columns=bullish_sources + bearish_sources)
)
# Keep only the price columns and the labels.
df = df[['open', 'high', 'low', 'close', 'bullish', 'bearish', 'neutral']]

candles = go.Candlestick(x=df.index, open=df['open'], high=df['high'], low=df['low'], close=df['close'])
fig = go.Figure(data=[candles])
fig.update_layout(title='Candlestick', xaxis_title='Time', yaxis_title='Price')

# Overlay a marker at the close price of every bullish / bearish row.
bullish = df[df['bullish'] == 1]
bearish = df[df['bearish'] == 1]
for trace_name, subset, colour in (('Bullish', bullish, 'green'), ('Bearish', bearish, 'red')):
    fig.add_trace(
        go.Scatter(
            x=subset.index,
            y=subset['close'],
            mode='markers',
            name=trace_name,
            marker=dict(color=colour),
        )
    )
fig.show()

In [None]:
# Plot: candlestick chart with predicted markers
# NOTE(review): this cell only rebuilds the labelled frame; nothing is plotted and the
# model's predictions are not used yet. It looks unfinished.
bullish_sources = ["Regular Bullish", "Hidden Bullish", "Regular Bullish Label", "Hidden Bullish Label"]
bearish_sources = ["Regular Bearish", "Hidden Bearish", "Regular Bearish Label", "Hidden Bearish Label"]

df = pd.read_csv(DATA_LOC).fillna(0).set_index('time')
is_bullish = df[bullish_sources].gt(0).any(axis=1)
is_bearish = df[bearish_sources].gt(0).any(axis=1)
df = (
    df.assign(
        bullish=is_bullish.astype(int),
        bearish=is_bearish.astype(int),
        neutral=(~(is_bullish | is_bearish)).astype(int),
    )
    .drop(columns=bullish_sources + bearish_sources)
)

In [None]:
# Evaluation
CLASS_NAMES = ['bullish', 'bearish', 'neutral']

# classification_report needs class indices. Accept a still one-hot y_test as well,
# so this cell does not depend on the confusion-matrix cell having converted it.
y_true = np.argmax(y_test, axis=1) if np.ndim(y_test) > 1 else y_test

print("{:=^40}".format("Evaluation"))
# `labels` pins the report to all three classes; without it, target_names raises
# when a class is missing from both the targets and the predictions.
print(
    classification_report(
        y_true,
        y_pred,
        labels=list(range(len(CLASS_NAMES))),
        target_names=CLASS_NAMES,
    )
)
print("{:=^40}".format(""))

# 評估指標

- **精確度（Precision）**：被預測為正樣本並且預測正確的樣本數占被預測為正樣本的樣本數的比例。這是一個評價模型準確性的指標。

- **召回率（Recall）**：也叫真正率（True Positive Rate, TPR），被預測為正樣本並且預測正確的樣本數占所有真實正樣本的比例。這是一個評價模型覆蓋率的指標。

- **F1 分數**：精確度和召回率的調和平均數，這是一個綜合考慮了精確度和召回率的指標。如果你同時關心精確度和召回率，那麼 F1 分數是一個很好的評價指標。

- **準確率（Accuracy）**：預測正確的樣本數占總樣本數的比例。這是一個評價模型整體性能的指標。

- **支持度（Support）**：實際的每個類別的樣本數。

- **宏平均（Macro Avg）**：先對每個類別分別計算指標，然後取平均。這種方式不考慮類別不平衡。

- **加權平均（Weighted Avg）**：先對每個類別分別計算指標，然後按照每個類別的樣本數加權平均。這種方式考慮了類別不平衡。

In [None]:
# Save the trained model, and with it the fitted scaler: inference inputs must be
# standardised with the same statistics used for training.
model.save('model_1m_ETH.h5')
joblib.dump(scaler, 'scaler_1m_ETH.joblib')

# 優化建議

- 重新採樣：對訓練數據進行重新採樣，使各類別的樣本數量更為均衡。這可以通過過採樣（增加少數類別的樣本數量），或者欠採樣（減少多數類別的樣本數量）來實現。

- 使用合成樣本：例如，`SMOTE (Synthetic Minority Over-sampling Technique)` 方法可以生成少數類別的合成樣本，從而提升模型在這些類別上的預測性能。

- 調整類別權重：在模型訓練過程中，增大少數類別的權重，以便讓模型在誤分類這些類別時，承受更大的損失。

- 嘗試不同的模型：某些模型可能對不平衡數據更為`Robust`，例如，集成方法(陳冠霖的專題)或深度學習模型。

- 嘗試其他評價指標：比如 `ROC AUC`、`PR AUC`，這些指標更關注模型在各類別之間的區分度，而不僅僅是簡單的分類精度。

- 改變問題設定：如果可能的話，可以考慮將問題轉化為二分類或者排序問題。