In [None]:
import pandas as pd

### Load and preprocess the data

In [22]:
reports = pd.read_csv("/Users/julialorenc/Desktop/BAN443_LLMs/FINAL_PROJECT/BIG_Beige_Book_RE.csv")

In [25]:
reports["report"] = reports["report"].str.replace("Back to Archive", "")
reports["report"] = reports["report"].str.replace("Search", "")
reports["report"] = reports["report"].str.replace("‹ ", "")

In [26]:
reports.drop(columns = "Unnamed: 0", inplace=True)
reports.head()

Unnamed: 0,year,month,district,report
0,1997,1,Atlanta,"‹ January 22, 1997\n\n\nConsumer Spending\nCo..."
1,1997,1,Boston,"‹ January 22, 1997\n\n\nRetail\nThe First Dis..."
2,1997,1,Chicago,"‹ January 22, 1997\n\n\nConsumer Spending\nOv..."
3,1997,1,Cleveland,"‹ January 22, 1997\n\n\nManufacturing\nThe Di..."
4,1997,1,Dallas,"‹ January 22, 1997\n\n\nPrices\nIn December a..."


### Initialize a text-classification pipeline using the FinBERT

In [1]:
from transformers import pipeline

pipe = pipeline("text-classification", model="ProsusAI/finbert")

2024-12-13 08:57:11.824135: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
from nltk.tokenize.punkt import PunktSentenceTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification 
import torch
import nltk

tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone") # Load the tokenizer
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone") # Load the model

sentence_tokenizer = PunktSentenceTokenizer() # Load the sentence tokenizer

def classify_beigebook_finBert(report_text):
    sentences = sentence_tokenizer.tokenize(report_text) # Split the report text into sentences
    
    sentence_scores = []

    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512) # Tokenize and truncate each sentence
        
        with torch.no_grad():
            outputs = model(**inputs) # Get the model outputs
        
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1) # Get probabilities and determine the sentiment
        sentiment = torch.argmax(probabilities, dim=-1).item()
        
        sentiment_map = {0: -1, 1: 0, 2: 1} # Convert the sentiment label - FinBERT's labels: 0 = negative, 1 = neutral, 2 = positive
        sentence_scores.append(sentiment_map[sentiment])

    tone_score = sum(sentence_scores) / len(sentence_scores) if sentence_scores else 0 # Compute the average tone score
    
    return tone_score


### Apply the `classify_beigebook_finBert` function through each row of the reports data frame

In [11]:
reports["tone"] = reports.apply(lambda row: classify_beigebook_finBert(row["report"]), axis=1)

In [16]:
reports.head()

Unnamed: 0,year,month,district,report,tone
0,1997,1,Atlanta,"‹ Search\nJanuary 22, 1997\n\n\nConsumer Spen...",0.120567
1,1997,1,Boston,"‹ Search\nJanuary 22, 1997\n\n\nRetail\nThe F...",0.131673
2,1997,1,Chicago,"‹ Search\nJanuary 22, 1997\n\n\nConsumer Spen...",0.245192
3,1997,1,Cleveland,"‹ Search\nJanuary 22, 1997\n\n\nManufacturing...",0.267081
4,1997,1,Dallas,"‹ Search\nJanuary 22, 1997\n\n\nPrices\nIn De...",0.071429


### Pivot the reports data frame to convert districts to columns & save final data frame

In [17]:
reports_tones = reports[["year", "month", "district", "tone"]]
reports_tones = reports_tones.pivot_table(index=["year", "month"], columns="district", values="tone", aggfunc='mean').reset_index()

In [19]:
reports_tones.head(10)

district,year,month,Atlanta,Boston,Chicago,Cleveland,Dallas,Kansas City,Minneapolis,New York,Philadelphia,Richmond,San Francisco,St Louis
0,1997,1,0.120567,0.131673,0.245192,0.267081,0.071429,-0.010989,0.120879,0.240838,-0.180556,0.518519,0.107143,-0.233945
1,1997,3,0.205128,0.215328,0.230769,0.247863,0.301676,-0.280899,-0.126154,-0.057971,-0.020833,0.315789,0.041379,0.19697
2,1997,5,0.142857,-0.036885,0.273128,0.309278,0.187817,-0.222727,-0.011236,0.496894,-0.137755,0.215223,0.182796,0.183908
3,1997,6,0.173077,-0.058632,0.257261,0.302703,0.365385,-0.323651,-0.201238,0.0,0.18,0.20462,0.215686,0.139738
4,1997,8,0.114583,0.064885,0.205645,0.324324,0.009615,-0.154589,-0.016667,0.149068,-0.036585,0.269663,0.31677,0.055118
5,1997,9,0.052402,-0.333333,0.19685,0.016194,0.090535,-0.126126,-0.316498,0.093525,-0.328671,0.011765,0.271523,0.280612
6,1997,10,0.078049,-0.143382,0.348416,-0.171548,-0.010601,-0.098446,-0.346354,0.11976,0.243056,0.281046,0.205556,0.071795
7,1997,12,0.139896,-0.05814,-0.004785,0.274194,0.35124,-0.234973,0.225806,0.136364,-0.028571,0.073171,0.452381,0.156398
8,1998,1,0.138889,0.022727,0.397321,0.17094,0.304348,0.09009,0.215686,0.52356,0.227848,-0.035948,0.333333,0.040936
9,1998,3,0.116402,0.051587,0.2875,0.181818,0.552083,-0.036697,0.108168,0.308756,0.082418,0.475336,0.543353,0.262857


In [21]:
reports_tones.to_csv("FINAL_Beige_Book_sentiment_scores.csv")