In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from keras.models import Sequential
from keras.layers import Dense, Input, LSTM
from keras.callbacks import EarlyStopping
from keras_tuner import RandomSearch, HyperParameters
import tensorflow as tf
import plotly.graph_objects as go

In [26]:
# Read the raw price export and print a summary of its columns and dtypes.
csv_path = 'tsmc_data.csv'
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1762 entries, 0 to 1761
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Price   1762 non-null   object
 1   Close   1761 non-null   object
 2   High    1761 non-null   object
 3   Low     1761 non-null   object
 4   Open    1761 non-null   object
 5   Volume  1761 non-null   object
dtypes: object(6)
memory usage: 82.7+ KB


In [27]:
# Remove the rows labelled 0 and 1, then count the missing values left per column.
# NOTE(review): presumably these two rows are extra header lines from the export
# (the 'Price' column holds dates afterwards) — confirm against the CSV.
df = df.drop(labels=[0, 1], axis=0)
missing_val = df.isna().sum()
missing_val

Price     0
Close     0
High      0
Low       0
Open      0
Volume    0
dtype: int64

In [28]:
# Handle missing values: carry the last known observation forward.
# Reassign instead of mutating with inplace=True so the cell produces its result
# from its input and can be re-run safely.
df = df.ffill()
df.head()

Unnamed: 0,Price,Close,High,Low,Open,Volume
2,2018-01-02,34.1712760925293,34.1962662572868,33.67145055282769,33.74642422489328,4984000
3,2018-01-03,34.74608612060547,34.82939199967066,34.52116501382098,34.6461222435216,6963200
4,2018-01-04,34.56280899047852,34.987657721727544,34.52948664902551,34.92101303882153,4876600
5,2018-01-05,35.3708610534668,35.462495918950445,34.812726040493885,34.896028730318704,5330800
6,2018-01-08,35.35420227050781,35.38752461980848,35.14594235406982,35.32088309900127,3538200


In [29]:
# Use feature to predict target
feature = ['High', 'Low', 'Open', 'Volume']
target = 'Close'

# Every column was read as object dtype (see the df.info() output), so the raw
# values are strings. Cast to float64 explicitly rather than relying on a
# downstream library to coerce them; a non-numeric entry fails here, at the source.
X = df[feature].astype('float64').to_numpy()
y = df[target].astype('float64').to_numpy().reshape(-1, 1)

In [30]:
# Split into train / validation / test as contiguous slices in row order:
# the first 70% of rows train, the next 15% validate, the remainder tests.
n = len(X)
train_size = int(n * 0.7)
val_size = int(n * 0.15)
val_end = train_size + val_size  # first row of the test slice

X_train, y_train = X[:train_size], y[:train_size]
X_val, y_val = X[train_size:val_end], y[train_size:val_end]
X_test, y_test = X[val_end:], y[val_end:]

In [31]:
# Normalize with min-max scaling. Each scaler is fitted on the training slice
# only and then applied unchanged to the validation and test slices.

# feature
feature_scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    feature_scaler.transform(part) for part in (X_train, X_val, X_test)
)

# target
target_scaler = MinMaxScaler().fit(y_train)
y_train_scaled, y_val_scaled, y_test_scaled = (
    target_scaler.transform(part) for part in (y_train, y_val, y_test)
)

In [32]:
# Create sequences
def create_sequence(X, y, look_back=7, foresight=1):
    """Slice aligned series into sliding input windows and their future targets.

    Sample ``i`` pairs the window ``X[i : i + look_back]`` with the target
    ``y[i + look_back + foresight - 1]``, i.e. the value ``foresight`` steps
    after the last row of the window.

    Args:
        X: Sliceable sequence of inputs (list or array), one entry per time step.
        y: Indexable sequence of targets aligned with ``X``.
        look_back: Number of consecutive steps in each input window.
        foresight: How many steps past the end of the window the target lies.

    Returns:
        A tuple ``(X_seq, y_seq)`` of two lists of equal length. Both are empty
        when the series is shorter than ``look_back + foresight``.
    """
    # The last valid start index is len(X) - look_back - foresight (its target is
    # the final element of y), so the number of samples is that value plus one.
    n_samples = len(X) - look_back - foresight + 1

    X_seq, y_seq = [], []
    for start in range(n_samples):
        window_end = start + look_back
        X_seq.append(X[start:window_end])
        y_seq.append(y[window_end + foresight - 1])
    return X_seq, y_seq

In [33]:
# Window each scaled split separately (so no window spans two splits) and
# convert the resulting lists to NumPy arrays in one step.
X_train_seq, y_train_seq = map(np.array, create_sequence(X_train_scaled, y_train_scaled))
X_val_seq, y_val_seq = map(np.array, create_sequence(X_val_scaled, y_val_scaled))
X_test_seq, y_test_seq = map(np.array, create_sequence(X_test_scaled, y_test_scaled))

In [34]:
# LSTM model with hyper parameter tuning 

def LSTM_tunning(hp):
    """Build and compile one candidate LSTM regressor for the tuner.

    The network is Input(7, 4) -> LSTM -> Dense(relu) -> Dense(1, linear).
    The tuned hyperparameters are 'lstm_unit', 'dense_unit', 'optimizer' and 'lr'.

    Args:
        hp: Hyperparameter container supplied by the tuner; queried through
            ``hp.Int`` and ``hp.Choice``.

    Returns:
        The compiled model, using MAE as both loss and reported metric.
    """
    # Layer widths are sampled in the same order the layers are stacked.
    lstm_units = hp.Int('lstm_unit', min_value=32, max_value=128, step=32)
    dense_units = hp.Int('dense_unit', min_value=16, max_value=64, step=16)

    model = Sequential()
    model.add(Input(shape=(7, 4)))
    model.add(LSTM(units=lstm_units, return_sequences=False))
    model.add(Dense(units=dense_units, activation='relu'))
    # Single linear unit: one regression output per window
    model.add(Dense(1, activation='linear'))

    optimizer_name = hp.Choice('optimizer', ['adam', 'rmsprop'])
    step_size = hp.Choice('lr', [1e-2, 1e-3, 5e-4])

    # Anything other than 'adam' falls through to RMSprop.
    optimizer_cls = (
        tf.keras.optimizers.Adam
        if optimizer_name == 'adam'
        else tf.keras.optimizers.RMSprop
    )
    model.compile(
        optimizer=optimizer_cls(learning_rate=step_size),
        loss='mae',
        metrics=['mae'],
    )
    return model

# Random search over the LSTM_tunning space, ranked by validation MAE.
# Trial state is stored under tuning/lstm_stock_forecast.
lstm_tuner = RandomSearch(
    LSTM_tunning,
    objective='val_mae',
    max_trials=30,
    executions_per_trial=2,
    directory='tuning',
    project_name='lstm_stock_forecast',
)

# Stop a run after 10 epochs without improvement and roll back to the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

search_kwargs = dict(
    epochs=100,
    validation_data=(X_val_seq, y_val_seq),
    batch_size=32,
    callbacks=[early_stop],
)
lstm_tuner.search(X_train_seq, y_train_seq, **search_kwargs)

Reloading Tuner from tuning\lstm_stock_forecast\tuner0.json


In [35]:
# Fetch the top-ranked hyperparameter set and print each tuned value.
best_lsmt_parameter = lstm_tuner.get_best_hyperparameters(num_trials=1)[0]

print('Best Hyper tuning for LSTM:')
for label, hp_name in (
    ('LSTM units', 'lstm_unit'),
    ('Dense units', 'dense_unit'),
    ('Optimizer', 'optimizer'),
    ('Learning rate', 'lr'),
):
    print(f"{label}: {best_lsmt_parameter.get(hp_name)}")

Best Hyper tuning for LSTM:
LSTM units: 64
Dense units: 32
Optimizer: adam
Learning rate: 0.01


In [36]:
# Check whether the best model's validation error is acceptable.
best_lstm_model = lstm_tuner.get_best_models(num_models=1)[0]

# Undo the target scaling on both ground truth and predictions so the error
# is expressed in the original price units.
y_val_true = target_scaler.inverse_transform(y_val_seq)
y_val_pred_scaled = best_lstm_model(X_val_seq)
y_val_pred = target_scaler.inverse_transform(y_val_pred_scaled)

mae_dollor = mean_absolute_error(y_true=y_val_true, y_pred=y_val_pred)
print(f"Validation MAE in dollars: ${mae_dollor:.2f}")

Validation MAE in dollars: $1.40



Skipping variable loading for optimizer 'adam', because it has 2 variables whereas the saved optimizer has 16 variables. 



In [53]:
# Collapse the (n, 1) column arrays to 1-D for plotting.
y_val_true, y_val_pred = y_val_true.flatten(), y_val_pred.flatten()

In [64]:
# Dates that belong to the validation targets.
# The validation slice starts at row `train_size`, and the first target of a
# windowed split sits look_back + foresight - 1 rows after the split's start.
# Starting the slice at 0 would label the validation curve with training dates.
seq_look_back, seq_foresight = 7, 1  # must match the create_sequence() call used for validation
val_start = train_size + seq_look_back + seq_foresight - 1
val_dates = df['Price'].iloc[val_start:val_start + len(y_val_true)]

In [None]:
# Plot the performance on the validation set: true vs. predicted values over the dates
x_val = list(range(len(y_val_true)))  # positional index; not read by the traces below
fig_val = go.Figure(
    data=[
        go.Scatter(x=val_dates, y=series, mode="lines", name=label)
        for label, series in (("True", y_val_true), ("Prediction", y_val_pred))
    ]
)
fig_val.update_layout(
    title_text='Validation Comparison',
    xaxis_title='Dates',
    yaxis_title='Price',
)
fig_val.show()
