In [5]:
'''
0 --- Dovish
1 --- Hawkish
2 --- Neutral
'''

'\n0 --- Dovish\n1 --- Hawkish\n2 --- Neutral\n'

In [6]:
import re
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import sklearn.model_selection as sk
import sklearn.metrics as skm

# Text pre-processing
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

# Modeling
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Embedding, Dropout, GlobalAveragePooling1D, Flatten, \
    SpatialDropout1D, Bidirectional
import string
from string import digits
import os

In [7]:
#import lab-manual-combine-training data

# Read an Excel file using pandas
mm_5768 = '/Users/simonli/Desktop/data297/fomc-hawkish-dovish-main/training_data/test-and-training/training_data/lab-manual-mm-split-train-5768.xlsx'
df_5768 = pd.read_excel(mm_5768)

mm_78516 = '/Users/simonli/Desktop/data297/fomc-hawkish-dovish-main/training_data/test-and-training/training_data/lab-manual-mm-split-train-78516.xlsx'
df_78516 = pd.read_excel(mm_78516)

mm_944601 = '/Users/simonli/Desktop/data297/fomc-hawkish-dovish-main/training_data/test-and-training/training_data/lab-manual-mm-split-train-944601.xlsx'
df_944601 = pd.read_excel(mm_944601)


In [8]:
# Read an Excel file using pandas
mm_5768_test = '/Users/simonli/Desktop/data297/fomc-hawkish-dovish-main/training_data/test-and-training/test_data/lab-manual-mm-split-test-5768.xlsx'
df_5768_test = pd.read_excel(mm_5768_test)


mm_78516_test = '/Users/simonli/Desktop/data297/fomc-hawkish-dovish-main/training_data/test-and-training/test_data/lab-manual-mm-split-test-78516.xlsx'
df_78516_test = pd.read_excel(mm_78516_test)

mm_944601_test = '/Users/simonli/Desktop/data297/fomc-hawkish-dovish-main/training_data/test-and-training/test_data/lab-manual-mm-split-test-944601.xlsx'
df_944601_test = pd.read_excel(mm_944601_test)


In [9]:
def get_max_length(df):
    max = 0
    for index, row in df.iterrows():  # format sentence for tokenization
        sentence = row['sentence'].replace(",", "").replace(".", " ") \
            .replace("—", " ").replace("â€", "").replace("  ", " ") \
            .replace(";", "").replace("\n", " ").translate(str.maketrans('', '', string.punctuation))
        words = word_tokenize(sentence)
        if len(words) > max:
            max = len(words)
    return max

In [10]:
def run_lstm(train, test, max_len, seed, epoch_val, b_size):
    train, valid = sk.train_test_split(train, train_size=0.8, random_state=seed)

    X_train = train['sentence'].tolist()
    Y_train = train['label']

    X_test = test['sentence'].tolist()
    Y_test = test['label']

    X_valid = valid['sentence'].tolist()
    Y_valid = valid['label']

    trunc_type = 'post'
    padding_type = 'post'
    oov_tok = '<OOV>'  # out of vocabulary token
    vocab_size = 2000
    tokenizer = Tokenizer(num_words=vocab_size, char_level=False, oov_token=oov_tok)
    tokenizer.fit_on_texts(X_train)
    word_index = tokenizer.word_index
    total_words = len(word_index)

    # Padding
    train_sequences = tokenizer.texts_to_sequences(X_train)
    train_padded = pad_sequences(train_sequences,
                                 maxlen=max_len,
                                 padding=padding_type,
                                 truncating=trunc_type)
    test_sequences = tokenizer.texts_to_sequences(X_test)
    test_padded = pad_sequences(test_sequences,
                                maxlen=max_len,
                                padding=padding_type,
                                truncating=trunc_type)
    valid_sequences = tokenizer.texts_to_sequences(X_valid)
    valid_padded = pad_sequences(valid_sequences,
                                 maxlen=max_len,
                                 padding=padding_type,
                                 truncating=trunc_type)
    print('Shape of train tensor: ', train_padded.shape)
    print('Shape of test tensor: ', test_padded.shape)
    print('Shape of valid tensor: ', valid_padded.shape)

    # Define parameter
    embedding_dim = 16
    batch_size = b_size
    epochs = epoch_val

    # Define Dense Model Architecture
    model = Sequential()
    model.add(Embedding(vocab_size,
                        embedding_dim,
                        input_length=max_len,
                        mask_zero=True))
    model.add(Bidirectional(LSTM(4, return_sequences=False))) #Bi-LSTM
    model.add(Dense(5, activation='relu'))
    model.add(Dropout(0.7))
    model.add(Dense(3, activation='sigmoid'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    history = model.fit(train_padded, Y_train, validation_data=(valid_padded, Y_valid), epochs=epochs, shuffle=True,
                        verbose=1, batch_size=batch_size)
    res = model.predict(test_padded)
    res = res.argmax(axis=-1)
    print(res)
    cp = skm.classification_report(Y_test.tolist(), res, output_dict=True)

    val_acc = history.history['val_accuracy'][-1]
    test_acc = cp['weighted avg']['f1-score']

    return val_acc, test_acc

In [11]:
#Hyperparameters
epochs = [10, 20, 30]
batch_sizes = [4, 8, 16, 32]

res_df = {"Dataset": [],
          "Seed": [],
          "Epoch": [],
          "Batch-Size": [],
          "Val-Acc": [],
          "Test-Acc": []}

In [12]:
for e in epochs:
    for b in batch_sizes:
        max_len = get_max_length(df_5768)
        val_acc, test_acc = run_lstm(train=df_5768, test=df_5768_test, max_len=max_len,
                                                seed=5768, epoch_val=e, b_size=b)
        res_df["Dataset"].append("5768")
        res_df["Seed"].append(5768)
        res_df["Epoch"].append(e)
        res_df["Batch-Size"].append(b)
        res_df["Val-Acc"].append(val_acc)
        res_df["Test-Acc"].append(test_acc)





Shape of train tensor:  (724, 86)
Shape of test tensor:  (227, 86)
Shape of valid tensor:  (181, 86)


2023-12-09 14:47:11.419148: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 86, 16)            32000     
                                                                 
 bidirectional (Bidirectiona  (None, 8)                672       
 l)                                                              
                                                                 
 dense (Dense)               (None, 5)                 45        
                                                                 
 dropout (Dropout)           (None, 5)                 0         
                                                                 
 dense_1 (Dense)             (None, 3)                 18        
                                                                 
Total params: 32,735
Trainable params: 32,735
Non-trainable params: 0
____________________________________________________

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 86, 16)            32000     
                                                                 
 bidirectional_3 (Bidirectio  (None, 8)                672       
 nal)                                                            
                                                                 
 dense_6 (Dense)             (None, 5)                 45        
                                                                 
 dropout_3 (Dropout)         (None, 5)                 0         
                                                                 
 dense_7 (Dense)             (None, 3)                 18        
                                                                 
Total params: 32,735
Trainable params: 32,735
Non-trainable params: 0
__________________________________________________

In [13]:
for e in epochs:
    for b in batch_sizes:
        max_len = get_max_length(df_78516)
        val_acc, test_acc = run_lstm(train=df_78516, test=df_78516_test, max_len=max_len,
                                                seed=78516, epoch_val=e, b_size=b)
        res_df["Dataset"].append("78516")
        res_df["Seed"].append(78516)
        res_df["Epoch"].append(e)
        res_df["Batch-Size"].append(b)
        res_df["Val-Acc"].append(val_acc)
        res_df["Test-Acc"].append(test_acc)





Shape of train tensor:  (724, 86)
Shape of test tensor:  (227, 86)
Shape of valid tensor:  (181, 86)
Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 86, 16)            32000     
                                                                 
 bidirectional_12 (Bidirecti  (None, 8)                672       
 onal)                                                           
                                                                 
 dense_24 (Dense)            (None, 5)                 45        
                                                                 
 dropout_12 (Dropout)        (None, 5)                 0         
                                                                 
 dense_25 (Dense)            (None, 3)                 18        
                                                                 
Total params: 32,7

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, 86, 16)            32000     
                                                                 
 bidirectional_13 (Bidirecti  (None, 8)                672       
 onal)                                                           
                                                                 
 dense_26 (Dense)            (None, 5)                 45        
                                                                 
 dropout_13 (Dropout)        (None, 5)                 0         
                                                                 
 dense_27 (Dense)            (None, 3)                 18        
                                                                 
Total params: 32,735
Trainable params: 32,735
Non-trainable params: 0
_________________________________________________

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 86, 16)            32000     
                                                                 
 bidirectional_14 (Bidirecti  (None, 8)                672       
 onal)                                                           
                                                                 
 dense_28 (Dense)            (None, 5)                 45        
                                                                 
 dropout_14 (Dropout)        (None, 5)                 0         
                                                                 
 dense_29 (Dense)            (None, 3)                 18        
                                                                 
Total params: 32,735
Trainable params: 32,735
Non-trainable params: 0
_________________________________________________

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_15 (Embedding)    (None, 86, 16)            32000     
                                                                 
 bidirectional_15 (Bidirecti  (None, 8)                672       
 onal)                                                           
                                                                 
 dense_30 (Dense)            (None, 5)                 45        
                                                                 
 dropout_15 (Dropout)        (None, 5)                 0         
                                                                 
 dense_31 (Dense)            (None, 3)                 18        
                                                                 
Total params: 32,735
Trainable params: 32,735
Non-trainable params: 0
_________________________________________________

In [14]:
for e in epochs:
    for b in batch_sizes:
        max_len = get_max_length(df_944601)
        val_acc, test_acc = run_lstm(train=df_944601, test=df_944601_test, max_len=max_len,
                                                seed=78516, epoch_val=e, b_size=b)
        res_df["Dataset"].append("78516")
        res_df["Seed"].append(78516)
        res_df["Epoch"].append(e)
        res_df["Batch-Size"].append(b)
        res_df["Val-Acc"].append(val_acc)
        res_df["Test-Acc"].append(test_acc)





Shape of train tensor:  (724, 86)
Shape of test tensor:  (227, 86)
Shape of valid tensor:  (181, 86)
Model: "sequential_24"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_24 (Embedding)    (None, 86, 16)            32000     
                                                                 
 bidirectional_24 (Bidirecti  (None, 8)                672       
 onal)                                                           
                                                                 
 dense_48 (Dense)            (None, 5)                 45        
                                                                 
 dropout_24 (Dropout)        (None, 5)                 0         
                                                                 
 dense_49 (Dense)            (None, 3)                 18        
                                                                 
Total params: 32,7

In [15]:
#check result
t = pd.DataFrame(res_df)
t

Unnamed: 0,Dataset,Seed,Epoch,Batch-Size,Val-Acc,Test-Acc
0,5768,5768,10,4,0.480663,0.476957
1,5768,5768,10,8,0.453039,0.540512
2,5768,5768,10,16,0.519337,0.393817
3,5768,5768,10,32,0.403315,0.405616
4,5768,5768,20,4,0.535912,0.517394
5,5768,5768,20,8,0.541436,0.537396
6,5768,5768,20,16,0.502762,0.520286
7,5768,5768,20,32,0.469613,0.524729
8,5768,5768,30,4,0.480663,0.417254
9,5768,5768,30,8,0.541436,0.52591


In [16]:
#find the best result
t[t["Test-Acc"] == t["Test-Acc"].max()]

Unnamed: 0,Dataset,Seed,Epoch,Batch-Size,Val-Acc,Test-Acc
23,78516,78516,30,32,0.508287,0.588969
