# Build and train the model in a specific sector: Banks

You can try any sector you want by replacing the `sector` in the code

In [11]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append('..')

In [12]:
# Load data from csv file
data_companies = pd.read_csv('ticker-overview.csv')
data_companies = data_companies.drop('Unnamed: 0', axis = 1)

In [13]:
data_companies['industryEn'].value_counts()

Construction & Materials       335
Industrial Goods & Services    265
Food & Beverage                150
Utilities                      145
Real Estate                    125
Basic Resources                110
Personal & Household Goods      77
Chemicals                       73
Health Care                     59
Financial Services              55
Travel & Leisure                50
Media                           41
Retail                          32
Technology                      30
Banks                           27
Automobiles & Parts             15
Insurance                       12
Oil & Gas                       12
Telecommunications               8
Name: industryEn, dtype: int64

In [14]:
# Only pick stickers from HOSE and HNX 
df = pd.read_csv('list-company-by-sector/Banks.csv')
bank_df = df.drop(df[(df['exchange'] == 'UPCOM') | (df['noShareholders'] == 0.0)].index)
bank_df.head()

Unnamed: 0,exchange,shortName,industryID,industryIDv2,industry,industryEn,establishedYear,noEmployees,noShareholders,foreignPercent,...,deltaInMonth,deltaInYear,outstandingShare,issueShare,companyType,ticker,status,code,message,traceId
1,HOSE,ACB,289.0,8355.0,Ngân hàng,Banks,1993.0,11008.0,44011.0,0.3,...,-0.012,0.244,3377.4,3377.4,NH,ACB,,,,
2,HOSE,BIDV,289.0,8355.0,Ngân hàng,Banks,1993.0,25416.0,36773.0,0.173,...,0.002,0.347,5058.5,5058.5,NH,BID,,,,
6,HOSE,HDBank,289.0,8355.0,Ngân hàng,Banks,1992.0,13992.0,6015.0,0.19,...,0.097,0.136,2515.3,2530.3,NH,HDB,,,,
8,HOSE,LienVietPostBank,289.0,8355.0,Ngân hàng,Banks,2008.0,8105.0,59906.0,0.049,...,0.073,0.18,1729.1,1729.1,NH,LPB,,,,
9,HOSE,MBBank,289.0,8355.0,Ngân hàng,Banks,1994.0,15910.0,126621.0,0.232,...,0.015,-0.043,4534.0,4534.0,NH,MBB,,,,


In [15]:
import os
import pandas as pd
import numpy as np

# Define the sector of interest
sector = 'Banks'

# Load the ticket overview dataset
ticker_overview = pd.read_csv('ticker-overview.csv')

# Select the tickers for companies in the sector of interest
tickers = ticker_overview.loc[ticker_overview['industryEn'] == sector, 'ticker'].tolist()

In [16]:
# Load the historical price data for the selected tickers
paths = []  # List to store individual DataFrames

for ticker in tickers:
    exchange = ticker_overview.loc[ticker_overview['ticker'] == ticker, 'exchange'].iloc[0]
    
    if exchange == 'HOSE':
        path = os.path.join('stock-historical-data', ticker + '-VNINDEX-History.csv')
        paths.append(path)
            
    elif exchange == 'HNX':
        path = os.path.join('stock-historical-data', ticker + '-' + exchange + 'Index' + '-History.csv')
        paths.append(path)
            
    elif exchange == 'UPCOM':
        continue

In [17]:
paths

['stock-historical-data/ACB-VNINDEX-History.csv',
 'stock-historical-data/BID-VNINDEX-History.csv',
 'stock-historical-data/CTG-VNINDEX-History.csv',
 'stock-historical-data/EIB-VNINDEX-History.csv',
 'stock-historical-data/HDB-VNINDEX-History.csv',
 'stock-historical-data/LPB-VNINDEX-History.csv',
 'stock-historical-data/MBB-VNINDEX-History.csv',
 'stock-historical-data/MSB-VNINDEX-History.csv',
 'stock-historical-data/BAB-HNXIndex-History.csv',
 'stock-historical-data/NVB-HNXIndex-History.csv',
 'stock-historical-data/OCB-VNINDEX-History.csv',
 'stock-historical-data/SSB-VNINDEX-History.csv',
 'stock-historical-data/SHB-VNINDEX-History.csv',
 'stock-historical-data/STB-VNINDEX-History.csv',
 'stock-historical-data/TCB-VNINDEX-History.csv',
 'stock-historical-data/TPB-VNINDEX-History.csv',
 'stock-historical-data/VCB-VNINDEX-History.csv',
 'stock-historical-data/VIB-VNINDEX-History.csv',
 'stock-historical-data/VPB-VNINDEX-History.csv']

In [18]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
window_size = 30
X_data, y_data = [], []
for path in paths:
    df = pd.read_csv(path)
    df1 = df.reset_index()['Close']
    df1 = scaler.fit_transform(np.array(df1).reshape(-1,1))
    for i in range(len(df1)-window_size-7):
        a = df1[i:(i + window_size)]
        X_data.append(a)
        b = df1[(i + window_size):(i + window_size+7)] 
        y_data.append(b) 
        

In [19]:
X_data[0]

array([[0.15595229],
       [0.16493346],
       [0.17231474],
       [0.18907716],
       [0.18907716],
       [0.18191404],
       [0.17133299],
       [0.15853392],
       [0.15755218],
       [0.15755218],
       [0.13998982],
       [0.14017162],
       [0.16453349],
       [0.17751436],
       [0.18387754],
       [0.17333285],
       [0.17471457],
       [0.17689623],
       [0.18151407],
       [0.18151407],
       [0.18151407],
       [0.18449567],
       [0.18347757],
       [0.17191477],
       [0.17551451],
       [0.17871428],
       [0.19009527],
       [0.19009527],
       [0.19547669],
       [0.19547669]])

In [20]:
y_data[0]

array([[0.19525853],
       [0.2054396 ],
       [0.2110392 ],
       [0.21005745],
       [0.20743946],
       [0.21383899],
       [0.21583885]])

In [21]:
### Split the data into training, validation and test set
from helper_functions import data_preprocessing as pp
X_train, y_train, X_val, y_val, X_test, y_test = pp.split_dataset(X_data, y_data, 0.1)

print("Shape of training set: ", X_train.shape)
print("Shape of validation set: ", X_val.shape)
print("Shape of test set: ", X_test.shape)

Shape of training set:  (32450, 30, 1)
Shape of validation set:  (3606, 30, 1)
Shape of test set:  (4007, 30, 1)


In [22]:
### Create and train the model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

# Define the LSTM model
model = Sequential()
model.add(LSTM(256, return_sequences=True, input_shape=(30, 1), activation='relu'))  # LSTM layer with 128 units
model.add(Dropout(0.5))  # Dropout layer for regularization
model.add(LSTM(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))  
model.add(Dense(7))  # Dense output layer with 7 units

early_stopping = EarlyStopping(monitor='val_mse', patience=10, restore_best_weights=True)

# Compile and train the model with Mean Squared Error loss function
model.compile(optimizer= tf.keras.optimizers.Adam(learning_rate=0.01), loss='mse', metrics=['mse'])
model.fit(X_train,y_train,validation_data=(X_val,y_val),epochs=20,batch_size=1028,verbose=1,callbacks = [early_stopping])



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20


<keras.callbacks.History at 0x7f8eac8cb8b0>

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 30, 256)           264192    
                                                                 
 dropout (Dropout)           (None, 30, 256)           0         
                                                                 
 lstm_1 (LSTM)               (None, 128)               197120    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 7)                 455       
                                                                 
Total params: 470,023
Trainable params: 470,023
Non-trai

In [25]:
### Get prediction on the test data and convert the result back to stock price (i.e., de-normalization)
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
y_pred = np.reshape(y_pred, (-1, 1))
y_test = np.reshape(y_test, (-1, 1))
# Get prediction on the test data
print("MSE on the test set: ", mean_squared_error(y_pred, y_test))

MSE on the test set:  0.0035046333075473986


In [27]:
# Define the LSTM model2
model2 = Sequential()
model2.add(LSTM(256, return_sequences=True, input_shape=(30, 1), activation='relu'))  # LSTM layer with 128 units
model2.add(LSTM(128, activation='relu'))
model2.add(Dense(7))  # Dense output layer with 7 units

early_stopping = EarlyStopping(monitor='val_mse', patience=10, restore_best_weights=True)
model2.summary()



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 30, 256)           264192    
                                                                 
 lstm_3 (LSTM)               (None, 128)               197120    
                                                                 
 dense_2 (Dense)             (None, 7)                 903       
                                                                 
Total params: 462,215
Trainable params: 462,215
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Compile and train the model with Mean Squared Error loss function
model2.compile(optimizer= tf.keras.optimizers.Adam(learning_rate=0.01), loss='mse', metrics=['mse'])
model2.fit(X_train,y_train,validation_data=(X_val,y_val),epochs=50,batch_size=1028,verbose=1,callbacks = [early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f8e20273bb0>

In [29]:
### Get prediction on the test data and convert the result back to stock price (i.e., de-normalization)
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
y_pred = np.reshape(y_pred, (-1, 1))
y_test = np.reshape(y_test, (-1, 1))
# Get prediction on the test data
print("MSE on the test set: ", mean_squared_error(y_pred, y_test))

MSE on the test set:  0.0035046333075473986
