In [1]:
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec,TaggedDocument 
import nltk
from nltk.corpus import stopwords
import os
import seaborn as sns
from nltk.tokenize import casual_tokenize

In [2]:
import multiprocessing

In [3]:
num_cores=multiprocessing.cpu_count()

In [4]:
path=r"C:\Users\iavta\Natural Language Processing\Economics\FED\\"
df_Fed_merged=pd.read_csv(path+"Data - Clean\\"+"Fed_merged.csv")

In [5]:
df_Fed_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 855 entries, 0 to 854
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date                 855 non-null    object 
 1   link                 855 non-null    object 
 2   title                855 non-null    object 
 3   event                855 non-null    object 
 4   text                 855 non-null    object 
 5   location             855 non-null    object 
 6   DATE                 855 non-null    object 
 7   T5YIFR               855 non-null    float64
 8   Changes of T5YIFR    855 non-null    float64
 9   Distance to 2        855 non-null    float64
 10  Changes in Distance  855 non-null    float64
 11  Classes              855 non-null    float64
 12  T5YIFR Lagged        855 non-null    float64
dtypes: float64(6), object(7)
memory usage: 87.0+ KB


In [6]:
df_Fed_merged.iloc[-6:]

Unnamed: 0,date,link,title,event,text,location,DATE,T5YIFR,Changes of T5YIFR,Distance to 2,Changes in Distance,Classes,T5YIFR Lagged
849,2020-04-09,https://www.federalreserve.gov/newsevents/spee...,COVID-19 and the Economy,At the Hutchins Center on Fiscal and Monetary ...,Good morning. The challenge we face today is ...,D.C. (via webcast),2020-04-09,1.56,0.05,0.44,-0.05,1.0,1.51
850,2020-05-05,https://www.federalreserve.gov/newsevents/spee...,Welcoming Remarks for Investment Connection â...,"At the ""Investment Connection â Response to ...",Good afternoon everyone. I greatly appreciate...,Missouri,2020-05-05,1.49,0.04,0.51,-0.04,1.0,1.45
851,2020-05-13,https://www.federalreserve.gov/newsevents/spee...,Current Economic Issues,At the Peterson Institute for International Ec...,The coronavirus has left a devastating human ...,D.C. (via webcast),2020-05-13,1.43,-0.03,0.57,0.03,2.0,1.46
852,2020-05-21,https://www.federalreserve.gov/newsevents/spee...,Opening Remarks Introductory remarks for the F...,"At ""A Fed Listens Event: How Is COVID-19 Affec...",Good afternoon. I just want to say a few word...,D.C. (via webcast) (via webcast) New York (vi...,2020-05-21,1.47,-0.02,0.53,0.02,2.0,1.49
853,2020-06-16,https://www.federalreserve.gov/newsevents/spee...,U.S. Economic Outlook and Monetary Policy (via...,"At the Foreign Policy Association, New York, N...",It is my pleasure to meet virtually this even...,New York,2020-06-16,1.52,0.04,0.48,-0.04,1.0,1.48
854,2020-06-19,https://www.federalreserve.gov/newsevents/spee...,Introductory Comments The Adaptability of Stre...,"At ""Building a Resilient Workforce,"" a video c...","Thank you, President Mester and Treye Johnson...",Ohio (via webcast) D.C.,2020-06-19,1.54,0.1,0.46,-0.1,1.0,1.44


In [7]:
data_X=df_Fed_merged[["text","T5YIFR Lagged"]].iloc[:-5]
# we're keeping the last 5 data points out, as our final test set.
data_X.shape

(850, 2)

In [8]:
data_X_testfinal=df_Fed_merged[["text","T5YIFR Lagged"]].iloc[-5:]
data_X_testfinal.shape

(5, 2)

In [9]:
def Time_Validation(datafr,labels,test_size=0.2):
    train_index = list(range(int(len(datafr)-np.floor(test_size*len(datafr))+1)))
    test_index =list(range(int(len(datafr)-np.floor(test_size*len(datafr))+1),len(datafr)))
    return datafr.loc[train_index], datafr.loc[test_index],labels[train_index], labels[test_index]

In [10]:
def prepare_corpus(array, tokens_only=False):
    for i, text in enumerate(array):
        tokens = casual_tokenize(text)
        if tokens_only:
            yield tokens
        else:
            # For training data (only), add tags
            yield TaggedDocument(tokens, [i])

In [11]:
def prepare_corpus_comp(array, tokens_only=False):
    [casual_tokenize(text) if tokens_only 
     else TaggedDocument(casual_tokenize(text), [i]) for i, text in enumerate(array)]

# Fed - LGBM - Classification - Naive Optuna

In [12]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
import optuna
from sklearn.metrics import roc_auc_score,classification_report
from sklearn.metrics import confusion_matrix, balanced_accuracy_score

In [13]:
labels=np.asarray(df_Fed_merged["Classes"].astype("int"))[:-5]
labels.shape

(850,)

In [14]:
labels_testfinal=np.asarray(df_Fed_merged["Classes"].astype("int"))[-5:]
labels_testfinal.shape

(5,)

## Preparing data for LGBM dataset

Usually, the spliting data stage should be inside the Optuna function that we will optimise. However, since we're in a time series setting, with a given test_size, we're going to have the same validation set. Hence, it seems that we can accelerate the optuna process, by leaving the data out.

Cross-Validation is not possible in a Time-Series setting.
The best we can do is do a stepwise n-lag prediction, and compute a (cumulative) metric for our process of prediction, e.g. RMSE. This will imply to reestimate the word embeddings at every period, which in my PC is very costly...  

In [15]:
# Here we have validation(test) sets
data_X_train, data_X_test, labels_train, labels_test=Time_Validation(data_X,labels,
                                                                         test_size=0.05)
train_size , _= data_X_train.shape
test_size , _= data_X_test.shape


corpus_train=list(prepare_corpus(data_X_train["text"]))

#instantiating Doc2Vec class
vector_size = 20
epochs = 25
embedding_doc2vec=Doc2Vec(vector_size=vector_size, min_count=2, epochs=epochs, 
                          workers=num_cores)

#building and training doc2vec model
embedding_doc2vec.build_vocab(corpus_train)

embedding_doc2vec.train(corpus_train,total_examples=embedding_doc2vec.corpus_count,
                       epochs=embedding_doc2vec.epochs)

# preparing X_train data
df_train=pd.DataFrame(np.array([embedding_doc2vec.dv[i] for i in range(train_size)]),
                  columns=["doc"+str(i) for i in range(vector_size)])

df_train["T5YIFR Lagged"]=data_X_train["T5YIFR Lagged"]

# preparing X_test data
corpus_test=list(prepare_corpus(data_X_test["text"],tokens_only=True))

df_test=pd.DataFrame(np.array([embedding_doc2vec.infer_vector(corpus_test[i])
                            for i in range(test_size)]),
                  columns=["doc_test"+str(i) for i in range(vector_size)])
df_test["T5YIFR Lagged"]=data_X_test["T5YIFR Lagged"]


In [19]:
def objective(trial):
   
    # uncomment the line below, if you want to work directly with the best model
    # global lgbmc 
    
    dtrain = lgb.Dataset(df_train, label=labels_train)
 
    param = {
        'objective': 'multiclass', # we're in a multiclass classification problem
        'num_class':3, 
        'metric': 'multi_logloss',
        "verbosity": -1,
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 502,10),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10,3),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100,5)
    }
    # Add a callback for pruning = early stopping.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "multi_logloss")
    
    lgbmc = lgb.train(param, dtrain) #verbose_eval=False)
    
    predictions=lgbmc.predict(df_test) # returns the class probabilities
    
    crit = roc_auc_score(labels_test,predictions, multi_class="ovr") 
    
    return crit

In [20]:
# this callback will check if the model is the best,
# and return a global variable with the best model

def best_callback(study, trial):
    pass
# Uncomment bellow is you want the best model to be returned
#    global best_booster # best model to be returned
    
    # study is the optuna study to be created below
#    if study.best_trial.number == trial.number:
#        best_booster = lgbmc

In [21]:
study = optuna.create_study(pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
                            direction='maximize')
optuna.logging.set_verbosity(optuna.logging.WARNING)

study.optimize(objective, n_trials=250, callbacks=[best_callback])

In [22]:
# lgbm_trial will have the parameters of the best_booster
lgbm_trial= study.best_trial
print("  Params: ")
for key, value in lgbm_trial.params.items():
       print("    {}: {}".format(key, value))

  Params: 
    lambda_l1: 5.853623660010822e-05
    lambda_l2: 4.343021754023105
    num_leaves: 252
    max_depth: 20
    feature_fraction: 0.5440941567177511
    bagging_fraction: 0.6249489616647574
    bagging_freq: 7
    min_child_samples: 35


In [46]:
#for key, value in best_booster.params.items():
#       print("    {}: {}".format(key, value))

In [23]:
# I could have used the best_booster model, and then I wouldn't need to run LGBMClassifier
# However, then I would have to change the code below, 
# since it would only output class probabilities.
best_lgbm=LGBMClassifier(**lgbm_trial.params)
best_lgbm.fit(df_train,labels_train)


# We're using the same test 
predictions=best_lgbm.predict(df_test)
probabilities=best_lgbm.predict_proba(df_test)



In [24]:
confusion_matrix(labels_test, predictions)
#  i-th row and j-th column entry indicates the number of samples 
#  with true label being i-th class and predicted label being j-th class.

array([[ 1,  1,  6],
       [ 2, 10,  6],
       [ 1,  3, 11]], dtype=int64)

If we compare to the confusion_matrix without optuna, we can see that lgbm no longer tends to over predict class 2. However, due to class imbalance, we still perform poorly (but less) on class 0.

In [25]:
roc_auc_score(labels_test,probabilities, multi_class="ovr")

0.7030806954719999

Also, roc_auc_score is now better than what we got from the default LGBM + WE  in the previous notebook. 

Still, we're analysing the performance of the model on the dataset on which it was trained. We need to analyse its performance on the unseen data, like below.

In [26]:
testfinal = list(prepare_corpus(data_X_testfinal["text"],tokens_only=True))
testfinal=pd.DataFrame(np.array([embedding_doc2vec.infer_vector(i)
                                for i in testfinal]))
testfinal["T5YIFR Lagged"]=data_X_testfinal["T5YIFR Lagged"]

predictions_final=best_lgbm.predict(testfinal)
probabilities_final=best_lgbm.predict_proba(testfinal)

In [27]:
confusion_matrix(labels_testfinal, predictions_final)

array([[3, 0],
       [0, 2]], dtype=int64)

Not bad (and a bit lucky), specially since the last 5 data points are 1 to 2 months apart approx. from the last observation.

In [28]:
balanced_accuracy_score(labels_testfinal, predictions_final,adjusted=True)
# accuracy score for imbalanced classes
# 0 for a random classifier
# 1 for a perfect classifier

1.0

In a world where I had a better laptop, I would have saved a bigger test set, and then analyse iteratively, the performance for 1-lag, 2-lag and 5-lag predictions. Also, Since classes are imbalanced, and the finaltest set is small, there's no point in running:

In [53]:
# roc_auc_score(labels_testfinal,probabilities, multi_class="ovr")

In [29]:
examples=[
    'Trust us, inflation will be 2%',
    'We are unsure whether it will be 2%',
    'Inflation will be higher than 2%',
    'Inflation will be lower than 2%',
]

examples_trans = list(prepare_corpus(examples,tokens_only=True))
examples_trans=pd.DataFrame(np.array([embedding_doc2vec.infer_vector(i)
                                for i in examples_trans]))

examples_trans["T5YIFR Lagged"]=[1,2,1,3] 
# I've tried several different vectors

In [30]:
predictions = best_lgbm.predict(examples_trans)
# 0 ->  no change
# 2 -> increased the distance relative to objective of 2
# 1 -> decreased the distance relative to objective of 2


for sent,pred in zip(examples,predictions):
    print("Sentence: {0} Predicted Class: {1} \n".format(sent,pred))

Sentence: Trust us, inflation will be 2% Predicted Class: 1 

Sentence: We are unsure whether it will be 2% Predicted Class: 1 

Sentence: Inflation will be higher than 2% Predicted Class: 1 

Sentence: Inflation will be lower than 2% Predicted Class: 1 



# Fed - LGBM - Regression

In [31]:
from lightgbm import LGBMRegressor

In [32]:
target=np.asarray(df_Fed_merged["T5YIFR"])[:-5]
target.shape

(850,)

In [33]:
target_testfinal=np.asarray(df_Fed_merged["T5YIFR"])[-5:]
target_testfinal.shape

(5,)

In [37]:
data_X_train, data_X_test, target_train, target_test=Time_Validation(data_X,target,test_size=0.05)
train_size , _= data_X_train.shape
test_size , _= data_X_test.shape

corpus_train=list(prepare_corpus(data_X_train["text"]))

#instantiating Doc2Vec class
vector_size = 20
epochs = 25
embedding_doc2vec=Doc2Vec(vector_size=vector_size, min_count=2, epochs=epochs, 
                          workers=num_cores)

#building and training model
embedding_doc2vec.build_vocab(corpus_train)

embedding_doc2vec.train(corpus_train,total_examples=embedding_doc2vec.corpus_count,
                       epochs=embedding_doc2vec.epochs)

# embedding_doc2vec.dv is gensim keyedvector data type,
# with each component beint the document vector (dv) obtained from training.
df_train=pd.DataFrame(np.array([embedding_doc2vec.dv[i] for i in range(train_size)]),
                      columns=["doc"+str(i) for i in range(vector_size)])

df_train["T5YIFR Lagged"]=data_X_train["T5YIFR Lagged"]

# creating validation test set.
corpus_test=list(prepare_corpus(data_X_test["text"],tokens_only=True))

df_test=pd.DataFrame(np.array([embedding_doc2vec.infer_vector(corpus_test[i])
                                for i in range(test_size)]),
                      columns=["doc_test"+str(i) for i in range(vector_size)])
df_test["T5YIFR Lagged"]=data_X_test["T5YIFR Lagged"]

In [39]:
# path_saved_LGBM=path+"Saved Models\\"+"Embedding-Doc2Vec\\"
# we need to create the directory before running the command below
# embedding_doc2vec.save(path_saved_LGBM+"embedding_doc2vec_regressor")
# to load, uncomment the line below
# embedding_doc2vec = Doc2Vec.load(path_saved_LGBM+"embedding_doc2vec")

In [42]:
def objective_reg(trial):
   
    # uncomment the line below, if you want to work directly with the best model
    # global lgbmc 
    
    dtrain = lgb.Dataset(df_train, label=labels_train)
 
    param = {
        'objective': 'regression', # we're in a regression problem
        'metric': 'l2',
        "verbosity": -1,
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 502,10),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10,3),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100,5)
    }
    
    # Add a callback for pruning = early stopping.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "l2")
    
    lgbmc = lgb.train(param, dtrain)
    
    predictions=lgbmc.predict(df_test) # returns predictions for IE 
    
    crit = lgbmc.score(df_test,target_test) # Regression score
    
    return crit

In [43]:
study = optuna.create_study(pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
                            direction='maximize')
optuna.logging.set_verbosity(optuna.logging.WARNING)

study.optimize(objective, n_trials=250)

In [44]:
# lgbm_trial will have the parameters of the best_booster
lgbm_trial= study.best_trial
print("  Params: ")
for key, value in lgbm_trial.params.items():
       print("    {}: {}".format(key, value))

  Params: 
    lambda_l1: 0.0004907685962644185
    lambda_l2: 0.04017199105504725
    num_leaves: 242
    max_depth: 10
    feature_fraction: 0.6074944699115936
    bagging_fraction: 0.6492544671270084
    bagging_freq: 10
    min_child_samples: 15


In [49]:
lgbm_reg=LGBMRegressor(**lgbm_trial.params)
lgbm_reg.fit(df_train,target_train);

In [50]:
predictions_reg=lgbm_reg.predict(df_test)

In [51]:
accuracy_reg = lgbm_reg.score(df_test,target_test)
print("Score: ", accuracy_reg) #

rmse = np.sqrt(np.mean((predictions_reg-target_test)**2))
print("\n RMSE: ", rmse)

Score:  -6.020079508828599

 RMSE:  0.22733524991908935


In [52]:
examples_trans = list(prepare_corpus(examples,tokens_only=True))
examples_trans=pd.DataFrame(np.array([embedding_doc2vec.infer_vector(i)
                                for i in examples_trans]))

examples_trans["T5YIFR Lagged"]=[3,3,3,3] 
# I've tried several different vectors

predictions = lgbm_reg.predict(examples_trans)
# 0 ->  no change
# 2 -> increased the distance relative to objective of 2
# 1 -> decreased the distance relative to objective of 2
for sent,pred in zip(examples,predictions):
    print("Sentence: {0} Predicted Class: {1} \n".format(sent,pred))

Sentence: Trust us, inflation will be 2% Predicted Class: 2.5733475052838215 

Sentence: We are unsure whether it will be 2% Predicted Class: 2.7140534025185885 

Sentence: Inflation will be higher than 2% Predicted Class: 2.6818045411951497 

Sentence: Inflation will be lower than 2% Predicted Class: 2.5620249697326085 



Let's assume the baseline is the "Trust us" sentence. 

With uncertainty, and when the CB states that inflation will be bigger, the model predicts a higher inflation than the baseline.

When the CB states inflation will be lower, the model predicts that inflation will be lower than the baseline model.