In [1]:
import pandas as pd
import torch
import os
import datetime as dt

In [2]:

# Define the directory path where the .txt files are located.
directory_path = '../data/FOMC/statements.clean'

# Initialize an empty list to store the data.
data = []

# Loop through each file in the directory.
for file_name in os.listdir(directory_path):
    # Check if the file is a .txt file.
    if file_name.endswith('.txt'):
        # Extract the date from the file name.
        date_str = file_name.replace('FOMC_statement_', '').replace('.txt', '')
        date = dt.datetime.strptime(date_str, '%Y-%m-%d')
        
        # Read the content of the file.
        file_path = os.path.join(directory_path, file_name)
        with open(file_path, 'r') as file:
            content = file.read()

        word_count = len(content.split())
        
        # Append the date and content to the data list.
        data.append({'date': date, 'content': content, 'word_count': word_count})

# Create a DataFrame from the data list.
df = pd.DataFrame(data)

# Sort the DataFrame by date.
df = df.sort_values(by='date').reset_index(drop=True)

# Output the DataFrame to check.
df.head()


Unnamed: 0,date,content,word_count
0,1994-02-04,chairman alan greenspan announced today fomc d...,61
1,1994-03-22,chairman alan greenspan announced today fomc d...,31
2,1994-04-18,chairman alan greenspan announced today federa...,30
3,1994-05-17,the federalreserve today announced actions des...,83
4,1994-08-16,the federalreserve announced today following m...,65


In [3]:
df['year_month'] = [str(x.year) + '-' + str(x.month) + '-01' for x in df['date']]
df = df.groupby('year_month').agg({
    'content': lambda x: ' '.join(x),
    'word_count': 'sum',
}).reset_index()
df

Unnamed: 0,year_month,content,word_count
0,1994-11-01,the federalreserve board today approved increa...,46
1,1994-2-01,chairman alan greenspan announced today fomc d...,61
2,1994-3-01,chairman alan greenspan announced today fomc d...,31
3,1994-4-01,chairman alan greenspan announced today federa...,30
4,1994-5-01,the federalreserve today announced actions des...,83
...,...,...,...
210,2023-3-01,share recent indicators point modest growth sp...,189
211,2023-5-01,share economic activity expanded modest pace j...,169
212,2023-6-01,share recent indicators suggest economic activ...,177
213,2023-7-01,share recent indicators suggest economic activ...,171


In [4]:
df.to_csv('../data/preprocessed/statement.csv')

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import pandas as pd

# Initialize the model and tokenizer
model_name = "ahmedrachid/FinancialBERT-Sentiment-Analysis"
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)
model.save_pretrained('../data/models/pretrained_sentiment_model')

tokenizer = BertTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained('../data/models/tokenizer')

# Initialize the pipeline
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, truncation=True)

# Function to apply the sentiment analysis model to a sentence
def get_sentiment(sentence):
    result = nlp(sentence)
    return result[0]['label'],result[0]['score']


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the model and tokenizer from local paths
model_path = '../data/models/pretrained_sentiment_model/'
tokenizer_path = '../data/models/tokenizer/'
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=3)
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

In [7]:
res = df['content'].apply(lambda x: pd.Series(get_sentiment(x)))


In [9]:
res.columns = ['Sentiment','Sentiment_score']
df = pd.concat([df,res], axis = 1)
df

Unnamed: 0,year_month,content,word_count,Sentiment,Sentiment_score
0,1994-11-01,the federalreserve board today approved increa...,46,positive,0.974765
1,1994-2-01,chairman alan greenspan announced today fomc d...,61,neutral,0.684734
2,1994-3-01,chairman alan greenspan announced today fomc d...,31,positive,0.478265
3,1994-4-01,chairman alan greenspan announced today federa...,30,positive,0.589428
4,1994-5-01,the federalreserve today announced actions des...,83,positive,0.882231
...,...,...,...,...,...
210,2023-3-01,share recent indicators point modest growth sp...,189,neutral,0.633636
211,2023-5-01,share economic activity expanded modest pace j...,169,positive,0.877291
212,2023-6-01,share recent indicators suggest economic activ...,177,positive,0.680297
213,2023-7-01,share recent indicators suggest economic activ...,171,positive,0.658053


In [10]:
df['Sentiment'].value_counts()

Sentiment
positive    114
neutral      87
negative     14
Name: count, dtype: int64

# merge sentiment with train data

In [15]:
Data = pd.read_csv('../data/preprocessed/statement_LM_sentiment.csv')
Data.head()

Unnamed: 0,date,content,wordcount,NPositiveWords,NNegativeWords,sentiment,Poswords,Negwords
0,1994-02-04,chairman alan greenspan announced today fomc d...,61,1,1,0.0,['enhance'],['misunderstanding']
1,1994-03-22,chairman alan greenspan announced today fomc d...,31,0,0,0.0,[],[]
2,1994-04-18,chairman alan greenspan announced today federa...,30,0,0,0.0,[],[]
3,1994-05-17,the federalreserve today announced actions des...,83,2,0,2.409639,"['favorable', 'effective']",[]
4,1994-08-16,the federalreserve announced today following m...,65,2,0,3.076923,"['effective', 'strength']",[]


In [16]:
Data['date'] = pd.to_datetime(Data['date'])
Data['year_month'] = [str(x.year) + '-' + str(x.month) + '-01' for x in Data['date']]


Data = Data.groupby('year_month').agg({
    'sentiment': 'mean',
    'NPositiveWords': 'sum',
    'NNegativeWords': 'sum',
    'Poswords': lambda x: ' '.join(x),
    'Negwords': lambda x: ' '.join(x)
}).reset_index()
Data

Unnamed: 0,year_month,sentiment,NPositiveWords,NNegativeWords,Poswords,Negwords
0,1994-11-01,2.173913,2,1,"['effective', 'strength']",['persistent']
1,1994-2-01,0.000000,1,1,['enhance'],['misunderstanding']
2,1994-3-01,0.000000,0,0,[],[]
3,1994-4-01,0.000000,0,0,[],[]
4,1994-5-01,2.409639,2,0,"['favorable', 'effective']",[]
...,...,...,...,...,...,...
210,2023-3-01,0.000000,4,4,"['gains', 'achieve', 'attain', 'attainment']","['unemployment', 'tightening', 'lags', 'impede']"
211,2023-5-01,-0.591716,3,4,"['gains', 'achieve', 'attainment']","['unemployment', 'tightening', 'lags', 'impede']"
212,2023-6-01,-0.564972,3,4,"['gains', 'achieve', 'attainment']","['unemployment', 'tightening', 'lags', 'impede']"
213,2023-7-01,-0.584795,3,4,"['gains', 'achieve', 'attainment']","['unemployment', 'tightening', 'lags', 'impede']"


In [17]:
Data.rename(columns={'sentiment':'sentiment_score_LM'},inplace=True)
Data

Unnamed: 0,year_month,sentiment_score_LM,NPositiveWords,NNegativeWords,Poswords,Negwords
0,1994-11-01,2.173913,2,1,"['effective', 'strength']",['persistent']
1,1994-2-01,0.000000,1,1,['enhance'],['misunderstanding']
2,1994-3-01,0.000000,0,0,[],[]
3,1994-4-01,0.000000,0,0,[],[]
4,1994-5-01,2.409639,2,0,"['favorable', 'effective']",[]
...,...,...,...,...,...,...
210,2023-3-01,0.000000,4,4,"['gains', 'achieve', 'attain', 'attainment']","['unemployment', 'tightening', 'lags', 'impede']"
211,2023-5-01,-0.591716,3,4,"['gains', 'achieve', 'attainment']","['unemployment', 'tightening', 'lags', 'impede']"
212,2023-6-01,-0.564972,3,4,"['gains', 'achieve', 'attainment']","['unemployment', 'tightening', 'lags', 'impede']"
213,2023-7-01,-0.584795,3,4,"['gains', 'achieve', 'attainment']","['unemployment', 'tightening', 'lags', 'impede']"


In [18]:
merge = pd.merge(df,Data,on='year_month',how = 'left')
merge

Unnamed: 0,year_month,content,word_count,Sentiment,Sentiment_score,sentiment_score_LM,NPositiveWords,NNegativeWords,Poswords,Negwords
0,1994-11-01,the federalreserve board today approved increa...,46,positive,0.974765,2.173913,2,1,"['effective', 'strength']",['persistent']
1,1994-2-01,chairman alan greenspan announced today fomc d...,61,neutral,0.684734,0.000000,1,1,['enhance'],['misunderstanding']
2,1994-3-01,chairman alan greenspan announced today fomc d...,31,positive,0.478265,0.000000,0,0,[],[]
3,1994-4-01,chairman alan greenspan announced today federa...,30,positive,0.589428,0.000000,0,0,[],[]
4,1994-5-01,the federalreserve today announced actions des...,83,positive,0.882231,2.409639,2,0,"['favorable', 'effective']",[]
...,...,...,...,...,...,...,...,...,...,...
210,2023-3-01,share recent indicators point modest growth sp...,189,neutral,0.633636,0.000000,4,4,"['gains', 'achieve', 'attain', 'attainment']","['unemployment', 'tightening', 'lags', 'impede']"
211,2023-5-01,share economic activity expanded modest pace j...,169,positive,0.877291,-0.591716,3,4,"['gains', 'achieve', 'attainment']","['unemployment', 'tightening', 'lags', 'impede']"
212,2023-6-01,share recent indicators suggest economic activ...,177,positive,0.680297,-0.564972,3,4,"['gains', 'achieve', 'attainment']","['unemployment', 'tightening', 'lags', 'impede']"
213,2023-7-01,share recent indicators suggest economic activ...,171,positive,0.658053,-0.584795,3,4,"['gains', 'achieve', 'attainment']","['unemployment', 'tightening', 'lags', 'impede']"


In [19]:
merge['year_month'] = pd.to_datetime(merge['year_month'])

In [20]:
# merge with train data
df = pd.read_csv('../data/MarketData/Predicting Fed Rate.csv')

df['observation_date'] = pd.to_datetime(df['observation_date'])
df.tail()
Data = Data.reset_index()
merged_data = pd.merge(df, merge, left_on='observation_date',right_on = 'year_month' , how='left')
merged_data.shape

(831, 88)

In [22]:
merged_data.to_csv('../data/preprocessed/dataset_merged_with_sentiment.csv', index=False)