In [3]:
import pandas as pd
import lightgbm as lgb
import numpy as np

In [97]:
path=r"C:\Users\iavta\Natural Language Processing\Economics\FED\\"
df_Fed_merged=pd.read_csv(path+"Data - Clean\\"+"Fed_merged.csv")

In [98]:
df_Fed_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 855 entries, 0 to 854
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date                 855 non-null    object 
 1   link                 855 non-null    object 
 2   title                855 non-null    object 
 3   event                855 non-null    object 
 4   text                 855 non-null    object 
 5   location             855 non-null    object 
 6   DATE                 855 non-null    object 
 7   T5YIFR               855 non-null    float64
 8   Changes of T5YIFR    855 non-null    float64
 9   Distance to 2        855 non-null    float64
 10  Changes in Distance  855 non-null    float64
 11  Classes              855 non-null    float64
 12  T5YIFR Lagged        855 non-null    float64
dtypes: float64(6), object(7)
memory usage: 87.0+ KB


In [100]:
data_X=df_Fed_merged[["text","T5YIFR Lagged"]]

In [101]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import casual_tokenize

In [102]:
vectorizer=TfidfVectorizer(tokenizer=casual_tokenize)

In [103]:
def Time_Validation(datafr,labels,test_size=0.2):
    train_index = list(range(int(len(datafr)-np.floor(test_size*len(datafr))+1)))
    test_index =list(range(int(len(datafr)-np.floor(test_size*len(datafr))+1),len(datafr)))
    return datafr.loc[train_index], datafr.loc[test_index],labels[train_index], labels[test_index]

# Fed - LGBM - Classification

In [104]:
from lightgbm import LGBMClassifier

In [105]:
labels=np.asarray(df_Fed_merged["Classes"].astype("int"))
labels.shape

(855,)

In [106]:
data_X_train, data_X_test, labels_train, labels_test=Time_Validation(data_X,labels,test_size=0.05)

In [108]:
data_X_train.shape

(814, 2)

In [109]:
vectorizer_fitted=vectorizer.fit(data_X_train["text"])

corpus_train=pd.DataFrame(vectorizer_fitted.transform(data_X_train["text"]).toarray())

# We've fitted the vectorizer to train data only, and use it to also transform corpus_test
corpus_test=pd.DataFrame(vectorizer_fitted.transform(data_X_test["text"]).toarray())



In [110]:
len(vectorizer_fitted.vocabulary_)

45979

In [111]:
corpus_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814 entries, 0 to 813
Columns: 45979 entries, 0 to 45978
dtypes: float64(45979)
memory usage: 285.5 MB


In [112]:
corpus_train["T5YIFR Lagged"]=data_X_train["T5YIFR Lagged"]
corpus_test["T5YIFR Lagged"]=data_X_test["T5YIFR Lagged"]

In [113]:
lgbmc=LGBMClassifier();
lgbmc.fit(corpus_train,labels_train)

LGBMClassifier()

In [114]:
predictions=lgbmc.predict(corpus_test)
probabilities=lgbmc.predict_proba(corpus_test)

In [115]:
from sklearn.metrics import confusion_matrix, balanced_accuracy_score,roc_auc_score

In [116]:
confusion_matrix(labels_test, predictions)
#  i-th row and j-th column entry indicates the number of samples 
#  with true label being i-th class and predicted label being j-th class.

array([[ 0,  0,  7],
       [ 1,  8,  9],
       [ 0,  4, 12]], dtype=int64)

We can see that lgbm tends to predict class 2 (from 0,1 and 2) too many times...
This will also be seen with the example sentences below.

In [117]:
balanced_accuracy_score(labels_test, predictions,adjusted=True) 
# accuracy score for imbalanced classes
# 0 for a random classifier
# 1 for a perfect classifier

0.09722222222222222

In [118]:
roc_auc_score(labels_test,probabilities, multi_class="ovr")

0.46433994370695136

This value for roc seems to indicate performance is worse than a random classifier which would get a roc of 0.5...

I think I need to do a random grid search to the parameters of the LGBM classifier. 

In [119]:
examples = [
    'Trust us, inflation will be 2%',
    'We are unsure whether it will be 2%',
    'Inflation will be higher than 2%',
    'Inflation will be lower than 2%',
]
examples_trans=pd.DataFrame(vectorizer_fitted.transform(examples).toarray())

In [127]:
examples_trans["T5YIFR Lagged"]=[3,3,3,3] 
# I've tried several different vectors, no change in results below

In [128]:
predictions = lgbmc.predict(examples_trans)
# 0 ->  no change
# 2 -> increased the distance relative to objective of 2
# 1 -> decreased the distance relative to objective of 2
for sent,pred in zip(examples,predictions):
    print("Sentence: {0} Predicted Class: {1} \n".format(sent,pred))

Sentence: Trust us, inflation will be 2% Predicted Class: 2 

Sentence: We are unsure whether it will be 2% Predicted Class: 2 

Sentence: Inflation will be higher than 2% Predicted Class: 2 

Sentence: Inflation will be lower than 2% Predicted Class: 2 



# Fed - LGBM - Regression

In [129]:
from lightgbm import LGBMRegressor

In [130]:
target=np.asarray(df_Fed_merged["T5YIFR"])
target.shape

(855,)

In [131]:
data_X_train, data_X_test, target_train, target_test=Time_Validation(data_X,target,test_size=0.05)

In [132]:
vectorizer_fitted=vectorizer.fit(data_X_train["text"])

df_trans_corpus_train=pd.DataFrame(vectorizer_fitted.transform(data_X_train["text"]).toarray())

# We've fitted the vectorizer to train data only, and use it to also transform corpus_test
df_trans_corpus_test=pd.DataFrame(vectorizer_fitted.transform(data_X_test["text"]).toarray())




In [133]:
df_trans_corpus_train["T5YIFR Lagged"]=data_X_train["T5YIFR Lagged"]
df_trans_corpus_test["T5YIFR Lagged"]=data_X_test["T5YIFR Lagged"]

In [134]:
lgbm_reg=LGBMRegressor();
lgbm_reg.fit(df_trans_corpus_train,target_train)

LGBMRegressor()

In [135]:
predictions_reg=lgbm_reg.predict(df_trans_corpus_test)

In [136]:
accuracy_reg = lgbm_reg.score(df_trans_corpus_test,target_test)
print(accuracy_reg) #

rmse = np.sqrt(np.mean((predictions_reg-target_test)**2))
print(rmse)

-3.6195525721794786
0.2170498613750913


In [138]:
examples_trans=pd.DataFrame(vectorizer_fitted.transform(examples).toarray()) # model was refitted
examples_trans["T5YIFR Lagged"]=[3,3,3,3] 

predictions = lgbm_reg.predict(examples_trans)
# 0 ->  no change
# 2 -> increased the distance relative to objective of 2
# 1 -> decreased the distance relative to objective of 2
for sent,pred in zip(examples,predictions):
    print("Sentence: {0} Predicted Class: {1} \n".format(sent,pred))

Sentence: Trust us, inflation will be 2% Predicted Class: 2.82658007546689 

Sentence: We are unsure whether it will be 2% Predicted Class: 2.850074496579208 

Sentence: Inflation will be higher than 2% Predicted Class: 2.83186006580961 

Sentence: Inflation will be lower than 2% Predicted Class: 2.83186006580961 

