In [1]:
import datetime

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras import Model, Sequential

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import MeanAbsoluteError

from tensorflow.keras.layers import Dense, Conv1D, LSTM, Lambda, Reshape, RNN, LSTMCell,ConvLSTM2D

import warnings
warnings.filterwarnings('ignore')

In [2]:
plt.rcParams['figure.figsize'] = (10, 7.5)
plt.rcParams['axes.grid'] = False

In [3]:
print(tf.__version__)

2.12.0


In [4]:
tf.random.set_seed(42)
np.random.seed(42)

In [5]:
df = pd.read_csv('FinBERT/Data_final/BTC_final.csv').drop(['CionScore'],axis=1)
df = df.drop(['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,CoinDate,Open,High,Low,Close,Volume
0,2014-10-01,387.427002,391.378998,380.779999,383.61499,26229400.0
1,2014-10-02,383.988007,385.497009,372.946014,375.071991,21777700.0
2,2014-10-03,375.181,377.695007,357.859009,359.511993,30901200.0
3,2014-10-04,359.891998,364.487,325.885986,328.865997,47236500.0
4,2014-10-05,328.915985,341.800995,289.29599,320.51001,83308096.0


In [7]:
#將時間轉換為數值，以方便後續可以運算
timestamp_s = pd.to_datetime(df['CoinDate']).map(datetime.datetime.timestamp)
df['Coin_timestamp'] = timestamp_s
df = df.drop(['CoinDate'], axis=1)
df.head()

Unnamed: 0,Open,High,Low,Close,Volume,Coin_timestamp
0,387.427002,391.378998,380.779999,383.61499,26229400.0,1412093000.0
1,383.988007,385.497009,372.946014,375.071991,21777700.0,1412179000.0
2,375.181,377.695007,357.859009,359.511993,30901200.0,1412266000.0
3,359.891998,364.487,325.885986,328.865997,47236500.0,1412352000.0
4,328.915985,341.800995,289.29599,320.51001,83308096.0,1412438000.0


In [8]:
df.shape

(3073, 6)

In [9]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Open,3073.0,13082.73,16097.32,176.897,715.555,7296.165,17813.64,67549.73
High,3073.0,13409.83,16507.74,211.731,736.452,7447.272,18393.95,68789.62
Low,3073.0,12720.14,15623.94,171.51,705.26,7153.306,17382.55,66382.06
Close,3073.0,13088.7,16093.27,178.103,716.411,7302.089,17899.7,67566.83
Volume,3073.0,16565350000.0,19851800000.0,5914570.0,112354000.0,8660880000.0,28066360000.0,350968000000.0
Coin_timestamp,3073.0,1544803000.0,76657800.0,1412093000.0,1478448000.0,1544803000.0,1611158000.0,1677514000.0


In [10]:
n = len(df)

# Split 70:20:10 (train:validation:test)
train_df = df[0:int(n*0.7)]
val_df = df[int(n*0.7):int(n*0.9)]
test_df = df[int(n*0.9):]

train_df.shape, val_df.shape, test_df.shape

((2151, 6), (614, 6), (308, 6))

In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(train_df)

train_df[train_df.columns] = scaler.transform(train_df[train_df.columns])
val_df[val_df.columns] = scaler.transform(val_df[val_df.columns])
test_df[test_df.columns] = scaler.transform(test_df[test_df.columns])

In [12]:
train_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Open,2151.0,0.215505,0.212028,0.0,0.013279,0.177392,0.395715,1.0
High,2151.0,0.213083,0.211984,0.0,0.011472,0.173223,0.394385,1.0
Low,2151.0,0.21522,0.210212,0.0,0.01361,0.179136,0.394656,1.0
Close,2151.0,0.215487,0.211927,0.0,0.013218,0.177176,0.39572,1.0
Volume,2151.0,0.104343,0.155503,0.0,0.000735,0.025612,0.165412,1.0
Coin_timestamp,2151.0,0.5,0.288877,0.0,0.25,0.5,0.75,1.0


In [14]:
# train_df.to_csv('data/train.csv')
# val_df.to_csv('data/val.csv')
# test_df.to_csv('data/test.csv')

In [13]:
# train_df = pd.read_csv('../data/train.csv', index_col=0)
# val_df = pd.read_csv('../data/val.csv', index_col=0)
# test_df = pd.read_csv('../data/test.csv', index_col=0)

print(train_df.shape, val_df.shape, test_df.shape)

(2151, 6) (614, 6) (308, 6)


In [14]:
class DataWindow():
    def __init__(self, input_width, label_width, shift, 
                 train_df=train_df, val_df=val_df, test_df=test_df, 
                 label_columns=None):
        
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        
        self.label_columns = label_columns
        if label_columns is not None:
            self.label_columns_indices = {name: i for i, name in enumerate(label_columns)}
        self.column_indices = {name: i for i, name in enumerate(train_df.columns)}
        
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift
        
        self.total_window_size = input_width + shift
        
        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]
        
        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]
    
    def split_to_inputs_labels(self, features):
        inputs = features[:, self.input_slice, :]
        labels = features[:, self.labels_slice, :]
        if self.label_columns is not None:
            labels = tf.stack(
                [labels[:,:,self.column_indices[name]] for name in self.label_columns],
                axis=-1
            )
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])
        
        return inputs, labels
    
    def plot(self, model=None, plot_col='Close', max_subplots=3):
        inputs, labels = self.sample_batch
        
        plt.figure(figsize=(12, 8))
        plot_col_index = self.column_indices[plot_col]
        max_n = min(max_subplots, len(inputs))
        
        for n in range(max_n):
            plt.subplot(3, 1, n+1)
            plt.ylabel(f'{plot_col} [scaled]')
            plt.plot(self.input_indices, inputs[n, :, plot_col_index],
                     label='Inputs', marker='.', zorder=-10)

            if self.label_columns:
              label_col_index = self.label_columns_indices.get(plot_col, None)
            else:
              label_col_index = plot_col_index

            if label_col_index is None:
              continue

            plt.scatter(self.label_indices, labels[n, :, label_col_index],
                        edgecolors='k', marker='s', label='Labels', c='green', s=64)
            if model is not None:
              predictions = model(inputs)
              plt.scatter(self.label_indices, predictions[n, :, label_col_index],
                          marker='X', edgecolors='k', label='Predictions',
                          c='red', s=64)

            if n == 0:
              plt.legend()

        plt.xlabel('Time (h)')
        
    def make_dataset(self, data):
        data = np.array(data, dtype=np.float32)
        ds = tf.keras.preprocessing.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            shuffle=True,
            batch_size=32
        )
        
        ds = ds.map(self.split_to_inputs_labels)
        return ds
    
    @property
    def train(self):
        return self.make_dataset(self.train_df)
    
    @property
    def val(self):
        return self.make_dataset(self.val_df)
    
    @property
    def test(self):
        return self.make_dataset(self.test_df)
    
    @property
    def sample_batch(self):
        result = getattr(self, '_sample_batch', None)
        if result is None:
            result = next(iter(self.train))
            self._sample_batch = result
        return result

In [15]:
def compile_and_fit(model, window, patience=3, max_epochs=50):
    early_stopping = EarlyStopping(monitor='val_loss',
                                   patience=patience,
                                   mode='min')
    
    model.compile(loss=MeanSquaredError(),
                  optimizer=Adam(),
                  metrics=[MeanAbsoluteError()])
    
    history = model.fit(window.train,
                       epochs=max_epochs,
                       validation_data=window.val,
                       callbacks=[early_stopping])
    
    return history

In [16]:
column_indices = {name: i for i, name in enumerate(train_df.columns)}

## LSTM

### Multi-step model 

In [17]:
multi_window = DataWindow(input_width=7, label_width=7, shift=1, label_columns=['Close'])

In [18]:
ms_mae_val = []
ms_mae_test = []

In [19]:
for i in range(10):
    ms_lstm_model = Sequential([
        LSTM(64, return_sequences=True),
        Dense(units=1,kernel_initializer=tf.initializers.zeros),
    ])

    history = compile_and_fit(ms_lstm_model, multi_window)

    ms_val_performance = {}
    ms_performance = {}

    ms_val_performance['LSTM'] = ms_lstm_model.evaluate(multi_window.val)
    ms_performance['LSTM'] = ms_lstm_model.evaluate(multi_window.test, verbose=0)

    print(f"驗證集的MSE {ms_val_performance['LSTM'][1]}")
    print(f"測試集的MSE {ms_performance['LSTM'][1]}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
驗證集的MSE 0.2604219317436218
測試集的MSE 0.055513009428977966
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
驗證集的MSE 0.2629072368144989
測試集的MSE 0.052390649914741516
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
驗證集的MSE 0.2655491232872009
測試集的MSE 0.05317193269729614
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
驗證集的MSE 0.2858068346977234
測試集的MSE 0.05314365401864052
Epoch 1/50
Epoch 2/50
