In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split, GridSearchCV
from pandas.api.types import is_string_dtype, CategoricalDtype
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
#from keras.preprocessing.sequence import pad_sequences
#authors note, different version of Keras put pad_sequences in different places, if one doesn't work, please try the other
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.utils.data_utils import pad_sequences
from scikeras.wrappers import KerasRegressor
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from sklearn.metrics import confusion_matrix


First step is to load economic regimes found through unsupervised training.

In [2]:
regimes = pd.read_parquet('quarterly_data/df_2020Q1.parquet')
regimes['labels']

usa            0
canada         2
europe         0
uk             0
australia      0
brazil         1
india          1
switzerland    0
japan          0
south korea    0
Name: labels, dtype: int32

In [3]:
countries = regimes[regimes['labels'] == 0].index.tolist()

In [4]:
import re

dfs = []

# Iterate over the countries
for country in countries:
    # Read the Parquet file into a DataFrame
    if country == 'uk':
        country = 'england'
    if country == 'south korea':
        country = 'korea'
    df = pd.read_parquet(f"fed_statements2/{country}.parquet")
    df = df.map(lambda x: re.sub(r'[^a-zA-Z\s]', '', str(x)) if isinstance(x, str) else x)
    
    # Add the DataFrame to the list
    dfs.append(df)

# Concatenate all the DataFrames along the row axis
data = pd.concat(dfs, axis=0)

In [5]:
data.index.tolist()

[Timestamp('2019-11-13 00:01:00'),
 Timestamp('2020-02-11 00:01:00'),
 Timestamp('2020-05-13 00:01:00'),
 Timestamp('2020-06-16 00:01:00'),
 Timestamp('2020-08-27 00:11:00'),
 Timestamp('2020-10-07 00:02:00'),
 Timestamp('2021-02-23 00:01:00'),
 Timestamp('2021-06-22 00:04:00'),
 Timestamp('2021-07-14 00:01:00'),
 Timestamp('2021-09-02 00:11:00'),
 Timestamp('2021-10-12 00:00:00'),
 Timestamp('2021-11-28 00:01:00'),
 Timestamp('2021-12-02 00:01:00'),
 Timestamp('2022-03-02 00:01:00'),
 Timestamp('2022-03-22 00:09:00'),
 Timestamp('2022-06-01 00:02:00'),
 Timestamp('2022-06-20 00:02:00'),
 Timestamp('2022-06-23 00:02:00'),
 Timestamp('2022-08-26 00:01:00'),
 Timestamp('2022-09-30 00:01:00'),
 Timestamp('2022-12-01 00:08:00'),
 Timestamp('2023-01-11 00:02:00'),
 Timestamp('2023-03-08 00:01:00'),
 Timestamp('2023-06-22 00:01:00'),
 Timestamp('2023-07-03 00:02:00'),
 Timestamp('2023-08-30 00:08:00'),
 Timestamp('2023-10-11 00:06:00'),
 Timestamp('2023-10-20 00:01:00'),
 Timestamp('2023-10-

In [6]:
target = pd.read_parquet("data/bonds.parquet")

In [7]:
target.head()

Unnamed: 0,usa,canada,europe,uk,australia,brazil,india,switzerland,japan,south korea
2019-08-01,1.9,1.205238,0.0992,0.5776,0.955,5.5882,6.62,-0.975,-0.28,1.254
2019-09-01,1.5,1.364,0.0478,0.5981,1.035,6.3432,6.82,-0.7,-0.215,1.42
2019-10-01,1.65,1.449545,0.1325,0.6373,1.037,4.4765,6.54,-0.511,-0.15,1.577
2019-11-01,1.73,1.4995,0.3016,0.765,1.152,4.5261,6.64,-0.58,-0.08,1.75
2019-12-01,1.78,1.6045,0.3662,0.8308,1.202,4.4842,6.85,-0.456,-0.025,1.653


In [8]:
target_resampled = target['usa'].diff().resample('D').interpolate(method='polynomial', order=2)
# Shift the DataFrame so we are predicting out in time
target_shifted = target_resampled.shift(30)


In [9]:
target_shifted_selected = target_shifted[target_shifted.index.isin(data.index)]

In [10]:
target_shifted_selected.tail()

2020-05-28   -0.017059
2020-12-08    0.192532
2021-10-12    0.135649
Name: usa, dtype: float64

In [11]:
data

Unnamed: 0,content
2019-11-13 00:01:00,Jerome H Powell The economic outlook\nTestimon...
2020-02-11 00:01:00,Jerome H Powell Semiannual Monetary Policy Rep...
2020-05-13 00:01:00,Jerome H Powell Current economic issues\nSpeec...
2020-06-16 00:01:00,Jerome H Powell Semiannual Monetary Policy Rep...
2020-08-27 00:11:00,For release on delivery \n am EDT am CDT \n...
...,...
2022-07-13 00:05:00,BOK Internatinoal Conference \nOpening Address...
2022-10-28 00:07:00,October \n \n \n \nKoreas Monetary Policy A...
2022-11-25 00:01:00,BIS Central bankers speechesChang Yong Rhee ...
2023-01-10 00:01:00,BIS Central bankers speechesChang Yong Rhee ...


In [12]:
stop_words = set(stopwords.words('english'))

# Assuming df['content'] is a series of strings
sentences = data['content'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words)).tolist()


train word vectors

In [13]:
# Define some variables
vocab_size = 10000 # max number of words
embedding_dim = 120 # This is the dimension of the Word Vector
max_length = 1000 # maximum length of the sequence
trunc_type = 'post' # where to chop off
padding_type = 'post' # where to put the padding
oov_token = '<OOV>'


In [14]:

import os
# Check if a model is already saved
model_path = "models/fed_parser"

if os.path.exists(model_path):
    word2vec = Word2Vec.load(model_path)
    print("Loaded existing model")
else:
    # Train your model
    tokenizer = Tokenizer(num_words=vocab_size, oov_token = oov_token)
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    
    word_index = tokenizer.word_index
    index_word = {v: k for k, v in word_index.items()}
    sentences_tokenized = [[index_word[i] for i in seq] for seq in sequences]
    word2vec = Word2Vec(sentences=sentences_tokenized, vector_size=120, window=5, min_count=1, workers=4)
    word2vec.train(sentences_tokenized, total_examples=len(sentences_tokenized), epochs=10)
    print("Trained a new model")
    # Save your model
    word2vec.save(model_path)



Loaded existing model


In [15]:
# Save word vectors
word_vectors = word2vec.wv.vectors

# Create a word-to-index dictionary
word_to_index = {word: i for i, word in enumerate(word2vec.wv.index_to_key)}

# Create an embedding matrix
embedding_matrix = np.zeros((len(word_to_index) + 1, word2vec.vector_size))
for word, i in word_to_index.items():
    embedding_matrix[i] = word2vec.wv[word]

Create target column

In [16]:
print(data.shape)
print(target_shifted_selected.shape)

(247, 1)
(3,)


In [17]:
# Get the common indexes
common_indexes = data.index.intersection(target_shifted_selected.index)

# Select the rows from both dataframes where the index is in common_indexes
data = data[data.index.isin(common_indexes)]
target_shifted_selected = target_shifted_selected[target_shifted_selected.index.isin(common_indexes)]

In [18]:
def expand_features(full_array, window=20):

    n_features = full_array.shape[1]
    
    expanded_features = np.zeros((full_array.shape[0], n_features * window))
    for feature_idx in range(n_features):
        # For each time step in the window
        for lag in range(window):
            # Shift the data
            shifted_data = np.roll(full_array[:, feature_idx], lag)
            # Set the first 'lag' values to zero
            shifted_data[:lag] = 0
            # Store it in the expanded features
            expanded_features[:, feature_idx*window + lag] = shifted_data
            
    return expanded_features

In [19]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['content'])

# Convert texts to sequences
#data['sequences'] = tokenizer.texts_to_sequences(data['content'])
sequences = tokenizer.texts_to_sequences(data['content'])
padded_sequences = pad_sequences(sequences, maxlen=10000)

In [20]:
expanded_data = expand_features(padded_sequences, 10)

In [21]:
expanded_data.shape 

(3, 100000)

In [22]:
from tensorflow.keras.layers import Bidirectional
from keras.layers import Dropout

In [23]:

reg = Sequential()
reg.add(Embedding(input_dim=len(word_to_index) + 1, output_dim=word2vec.vector_size, weights=[embedding_matrix]))
reg.add(Bidirectional(LSTM(64, return_sequences = True)))
reg.add(Dropout(0.25))
reg.add(Bidirectional(LSTM(64, return_sequences = True)))
reg.add(Dropout(0.25))
reg.add(Bidirectional(LSTM(32)))
reg.add(Dropout(0.25))
reg.add(Dense(1))  # No activation function for regression

# Compile the model with gradient clipping
opt = Adam(clipvalue=0.5)
reg.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy', 'mean_squared_error'])  # Mean squared error for regression


In [None]:
early_stopping = EarlyStopping(monitor='val_loss', patience=2)

In [24]:
history = reg.fit(expanded_data, target_shifted_selected.values, epochs=20, validation_split=0.2, batch_size=25, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:

clf = Sequential()
clf.add(Embedding(input_dim=len(word_to_index) + 1, output_dim=word2vec.vector_size, weights=[embedding_matrix]))
clf.add(Bidirectional(LSTM(64, return_sequences = True)))
clf.add(Bidirectional(LSTM(32)))
clf.add(Dense(1, activation = 'sigmoid'))

# Compile the model with gradient clipping
opt = Adam(clipvalue=0.5)
clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
early_stopping = EarlyStopping(monitor='rmse', patience=2)

#history = clf.fit(expanded_data, target.values, epochs=10, validation_split=0.2, batch_size=128, verbose=1)

In [27]:
y_pred = clf.predict(expanded_data) 
y_pred = np.argmax(y_pred, axis=1)
# Generate confusion matrix
cm = confusion_matrix(target, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('True')



ValueError: Found input variables with inconsistent numbers of samples: [54, 3]

In [None]:
# Visualize the loss
train_loss = history.history['loss']
test_loss = history.history['val_loss']

plt.figure(figsize=(12, 8))
plt.plot(train_loss, label='Training loss', color='navy')
plt.plot(test_loss, label='Testing loss', color='skyblue')
plt.legend();

: 

In [None]:
# Get the embeddings from the embedding layer
embeddings = clf.layers[0].get_weights()[0]

# Create a dictionary to map indices to words
word_to_index = tokenizer.word_index
index_to_word = {v: k for k, v in word_to_index.items()}

# Now you can get the embedding of a word like this:
word = "example"
word_embedding = embeddings[word_to_index[word]]

: 

In [None]:
word_embedding

: 

: 