In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, Dataset

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from preprocessing_data.utils import format_Dataframes, preprocessing_dataframe, create_sliding_windows

# Preprocessing Data

In [2]:
df = format_Dataframes(data_path="D:\-DSP391m-Forecasting-Financial-Time-Series-With-Transformer\craw_data\FPT_stock.xlsx",
                                 type_file="xlsx")
df

Unnamed: 0,Tên,Ngày,Đóng cửa,Điều chỉnh,Thay đổi,Thay đổi 1,%,Khối lượng (Khớp lệnh),Giá trị (Khớp lệnh),Khối lượng (Thỏa thuận),Giá trị (Thỏa thuận),Mở cửa,Cao nhất,Thấp nhất
0,FPT,2024-05-21,136.0,--,3(2.26,3(2.26%),%),,467.06,734.400,99.45,134.0,136.0,133.5
1,FPT,2024-05-20,133.0,--,-1.5(-1.12,-1.5(-1.12%),%),,270.50,,152.86,135.2,136.1,133.0
2,FPT,2024-05-17,134.5,--,-0.9(-0.66,-0.9(-0.66%),%),,310.27,642.601,86.43,135.4,136.0,133.7
3,FPT,2024-05-16,135.4,--,1.3(0.97,1.3(0.97%),%),,423.78,924.900,119.98,135.6,137.3,134.2
4,FPT,2024-05-15,134.1,--,3.6(2.76,3.6(2.76%),%),,485.07,,139.80,131.5,134.5,131.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4338,FPT,2006-12-19,486.0,2024-02-16 00:00:00,23(4.97,23(4.97%),%),137.52,0.07,0.000,0.00,486.0,486.0,486.0
4339,FPT,2006-12-18,463.0,15.44,22(4.99,22(4.99%),%),215.79,0.10,10.000,0.00,463.0,463.0,463.0
4340,FPT,2006-12-15,441.0,2024-07-14 00:00:00,21(5.00,21(5.00%),%),265.30,0.12,30.000,0.01,441.0,441.0,441.0
4341,FPT,2006-12-14,420.0,14,20(5.00,20(5.00%),%),280.71,0.12,96.000,0.04,420.0,420.0,420.0


In [3]:
data_FPT = preprocessing_dataframe(df)
data_FPT.head(10)

Unnamed: 0_level_0,Đóng cửa,Khối lượng (Khớp lệnh),Giá trị (Khớp lệnh),Khối lượng (Thỏa thuận),Giá trị (Thỏa thuận),Mở cửa,Cao nhất,Thấp nhất
Ngày,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2006-12-13,0.58116,0.083545,3.5e-05,0.0,0.0,0.581226,0.574601,0.581292
2006-12-14,0.612771,0.280761,0.000142,0.096823,4.9e-05,0.612832,0.605881,0.612893
2006-12-15,0.645962,0.265348,0.000142,0.030257,1.2e-05,0.646018,0.638724,0.646074
2006-12-18,0.680733,0.215829,0.000118,0.010086,0.0,0.680784,0.673131,0.680834
2006-12-19,0.717086,0.137545,8.3e-05,0.0,0.0,0.71713,0.709102,0.717175
2006-12-20,0.755018,0.22016,0.00013,0.0,0.0,0.755057,0.746637,0.755096
2006-12-21,0.715505,0.156428,9.4e-05,0.0,0.0,0.740834,0.732562,0.715595
2006-12-22,0.677572,0.143526,8.3e-05,0.010086,0.0,0.677623,0.670003,0.677674
2006-12-25,0.64122,0.022144,1.2e-05,0.030257,1.2e-05,0.641277,0.634032,0.641334
2006-12-26,0.674411,0.300204,0.000153,0.020171,1.2e-05,0.608091,0.666875,0.608153


In [4]:
data_FPT.isnull().sum()

Đóng cửa                   0
Khối lượng (Khớp lệnh)     0
Giá trị (Khớp lệnh)        0
Khối lượng (Thỏa thuận)    0
Giá trị (Thỏa thuận)       0
Mở cửa                     0
Cao nhất                   0
Thấp nhất                  0
dtype: int64

# Data Exploration

In [5]:
data_FPT.head(10)

Unnamed: 0_level_0,Đóng cửa,Khối lượng (Khớp lệnh),Giá trị (Khớp lệnh),Khối lượng (Thỏa thuận),Giá trị (Thỏa thuận),Mở cửa,Cao nhất,Thấp nhất
Ngày,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2006-12-13,0.58116,0.083545,3.5e-05,0.0,0.0,0.581226,0.574601,0.581292
2006-12-14,0.612771,0.280761,0.000142,0.096823,4.9e-05,0.612832,0.605881,0.612893
2006-12-15,0.645962,0.265348,0.000142,0.030257,1.2e-05,0.646018,0.638724,0.646074
2006-12-18,0.680733,0.215829,0.000118,0.010086,0.0,0.680784,0.673131,0.680834
2006-12-19,0.717086,0.137545,8.3e-05,0.0,0.0,0.71713,0.709102,0.717175
2006-12-20,0.755018,0.22016,0.00013,0.0,0.0,0.755057,0.746637,0.755096
2006-12-21,0.715505,0.156428,9.4e-05,0.0,0.0,0.740834,0.732562,0.715595
2006-12-22,0.677572,0.143526,8.3e-05,0.010086,0.0,0.677623,0.670003,0.677674
2006-12-25,0.64122,0.022144,1.2e-05,0.030257,1.2e-05,0.641277,0.634032,0.641334
2006-12-26,0.674411,0.300204,0.000153,0.020171,1.2e-05,0.608091,0.666875,0.608153


In [6]:
data_FPT.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4343 entries, 2006-12-13 to 2024-05-21
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Đóng cửa                 4343 non-null   float64
 1   Khối lượng (Khớp lệnh)   4343 non-null   float64
 2   Giá trị (Khớp lệnh)      4343 non-null   float64
 3   Khối lượng (Thỏa thuận)  4343 non-null   float64
 4   Giá trị (Thỏa thuận)     4343 non-null   float64
 5   Mở cửa                   4343 non-null   float64
 6   Cao nhất                 4343 non-null   float64
 7   Thấp nhất                4343 non-null   float64
dtypes: float64(8)
memory usage: 305.4 KB


In [7]:
data_FPT.describe()

Unnamed: 0,Đóng cửa,Khối lượng (Khớp lệnh),Giá trị (Khớp lệnh),Khối lượng (Thỏa thuận),Giá trị (Thỏa thuận),Mở cửa,Cao nhất,Thấp nhất
count,4343.0,4343.0,4343.0,4343.0,4343.0,4343.0,4343.0,4343.0
mean,0.077245,0.386191,0.087992,0.099514,0.022386,0.077426,0.077282,0.076303
std,0.133764,0.214971,0.101205,0.188193,0.064048,0.133862,0.133581,0.132555
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.023392,0.239403,0.022747,0.0,0.0,0.023388,0.023303,0.023187
50%,0.035088,0.386191,0.060407,0.0,0.0,0.03524,0.035815,0.034603
75%,0.078552,0.421741,0.104396,0.099514,0.022386,0.078382,0.078433,0.077263
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Data Preprocessing

In [8]:
y = data_FPT["Đóng cửa"]
X = data_FPT.drop(columns=["Đóng cửa"])

In [9]:
y.head(10)

Ngày
2006-12-13    0.581160
2006-12-14    0.612771
2006-12-15    0.645962
2006-12-18    0.680733
2006-12-19    0.717086
2006-12-20    0.755018
2006-12-21    0.715505
2006-12-22    0.677572
2006-12-25    0.641220
2006-12-26    0.674411
Name: Đóng cửa, dtype: float64

In [10]:
X.head(10)

Unnamed: 0_level_0,Khối lượng (Khớp lệnh),Giá trị (Khớp lệnh),Khối lượng (Thỏa thuận),Giá trị (Thỏa thuận),Mở cửa,Cao nhất,Thấp nhất
Ngày,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-13,0.083545,3.5e-05,0.0,0.0,0.581226,0.574601,0.581292
2006-12-14,0.280761,0.000142,0.096823,4.9e-05,0.612832,0.605881,0.612893
2006-12-15,0.265348,0.000142,0.030257,1.2e-05,0.646018,0.638724,0.646074
2006-12-18,0.215829,0.000118,0.010086,0.0,0.680784,0.673131,0.680834
2006-12-19,0.137545,8.3e-05,0.0,0.0,0.71713,0.709102,0.717175
2006-12-20,0.22016,0.00013,0.0,0.0,0.755057,0.746637,0.755096
2006-12-21,0.156428,9.4e-05,0.0,0.0,0.740834,0.732562,0.715595
2006-12-22,0.143526,8.3e-05,0.010086,0.0,0.677623,0.670003,0.677674
2006-12-25,0.022144,1.2e-05,0.030257,1.2e-05,0.641277,0.634032,0.641334
2006-12-26,0.300204,0.000153,0.020171,1.2e-05,0.608091,0.666875,0.608153


### Splitting the Data

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=152)

In [12]:
X_train.head(10)

Unnamed: 0_level_0,Khối lượng (Khớp lệnh),Giá trị (Khớp lệnh),Khối lượng (Thỏa thuận),Giá trị (Thỏa thuận),Mở cửa,Cao nhất,Thấp nhất
Ngày,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2007-07-17,0.464404,0.158988,0.0,0.0,0.407396,0.402565,0.407489
2007-09-21,0.285671,0.076666,0.0,0.0,0.307838,0.305599,0.306368
2022-01-19,0.649417,0.06794,0.050429,0.005758,0.088338,0.08852,0.088324
2013-08-29,0.139315,0.007049,0.023954,0.022386,0.016435,0.017047,0.01659
2021-04-01,0.386191,0.232584,0.655572,0.066043,0.072219,0.073663,0.072365
2009-03-03,0.236953,0.01163,0.0,0.0,0.018015,0.015014,0.014852
2016-06-17,0.206157,0.009836,0.032274,0.001687,0.012958,0.012355,0.012956
2019-10-24,0.386191,0.151915,0.028401,0.022386,0.038875,0.040038,0.038395
2013-07-16,0.107899,0.005538,0.030277,0.001687,0.017541,0.017673,0.01738
2010-12-29,0.326999,0.087992,0.0,0.0,0.051833,0.050673,0.049613


In [13]:
y_train.head(10)

Ngày
2007-07-17    0.407302
2007-09-21    0.309309
2022-01-19    0.088668
2013-08-29    0.016279
2021-04-01    0.074917
2009-03-03    0.014541
2016-06-17    0.012802
2019-10-24    0.040936
2013-07-16    0.017860
2010-12-29    0.050893
Name: Đóng cửa, dtype: float64

### Creating training set

In [14]:
# Creating a Training set with 60 time-steps and 1 output
X_train_new, y_train_new = create_sliding_windows(X_train, y_train)

  ys.append(y[i + time_steps])


In [15]:
X_train_new.shape

(3414, 60, 7)

In [16]:
y_train_new.shape

(3414,)

In [17]:
#Convert to PyTorch tensor;
X = torch.tensor(X_train_new, dtype=torch.float32)
y = torch.tensor(y_train_new, dtype=torch.float32)

train_dataset = TensorDataset(X, y)

#data loader;
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Model and Training

In [18]:
class StockLSTM(nn.Module):
    def __init__(self, input_size=7, hidden_size=50, num_layers=10, output_size=1):
        super(StockLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = StockLSTM().to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 0.0334
Epoch [20/100], Loss: 0.0015
Epoch [30/100], Loss: 0.0367
Epoch [40/100], Loss: 0.0042
Epoch [50/100], Loss: 0.0029
Epoch [60/100], Loss: 0.0308
Epoch [70/100], Loss: 0.0544
Epoch [80/100], Loss: 0.0352
Epoch [90/100], Loss: 0.0205
Epoch [100/100], Loss: 0.0125


# Evaluation

In [19]:
X_test_new, y_test_new = create_sliding_windows(X_test, y_test)

  ys.append(y[i + time_steps])


In [20]:
X_test_new.shape

(809, 60, 7)

In [21]:
y_test_new.shape

(809,)

In [22]:
#Convert to PyTorch tensor;
X_test_end = torch.tensor(X_test_new, dtype=torch.float32)
y_test_end = torch.tensor(y_test_new, dtype=torch.float32)

test_dataset = TensorDataset(X, y)

#data loader;
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

In [23]:
model.eval()
with torch.no_grad():
    predictions = []
    actuals = []
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        predictions.extend(outputs.cpu().numpy())
        actuals.extend(targets.cpu().numpy())


predictions = np.array(predictions)
predictions = np.squeeze(predictions)

actuals = np.array(actuals)


#Evaluation;
mse = mean_squared_error(actuals, predictions)
mae = mean_absolute_error(actuals, predictions)
huber_loss = nn.SmoothL1Loss()(torch.tensor(predictions), torch.tensor(actuals)).item()

print(f'MSE: {mse:.4f}')
print(f'MAE: {mae:.4f}')
print(f'Huber Loss: {huber_loss:.4f}')

MSE: 0.0173
MAE: 0.0693
Huber Loss: 0.0086
