In [14]:
# from google.colab import drive
# drive.mount('/content/drive')

In [15]:
# import os
# os.chdir("/content/drive/MyDrive/BAN443_project")

### Download Beige Books 1993-1996

In [16]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [17]:
base_link = "https://www.minneapolisfed.org/beige-book-reports"
districts = ["at", "bo", "ch", "cl", "da", "kc", "mi", "ny", "ph", "ri", "sf", "sl"]
districts_full = {"at": "Atlanta", "bo": "Boston", "ch": "Chicago", "cl": "Cleveland", "da": "Dallas", "kc": "Kansas City",
                  "mi": "Minneapolis", "ny": "New York", "ph": "Philadelphia", "ri": "Richmond", "sf": "San Francisco", "sl": "St Louis"}

### Scrape the reports with BeautifulSoup

In [18]:
years = []
months = []
district_texts = []
district_names = []

for year in range(1993, 1997):
    for month in range(1, 13):
        link = f"{base_link}/{year}/{year}-{month:02d}-{districts[0]}"
        response = requests.get(link)

        if response.status_code != 200:
            continue

        for district in districts:
            link = f"{base_link}/{year}/{year}-{month:02d}-{district}"
            response = requests.get(link)
            soup = BeautifulSoup(response.content, "html.parser")
            report_area = soup.find("div", class_="col-sm-12 col-lg-8 offset-lg-1")

            if report_area is None:
                continue

            report_text = []

            for element in report_area.find_all("p"):
                if element.find("strong"):  # Check for <strong> within <p>
                    strong_text = element.find('strong').get_text(strip=True) + "\n"
                    remaining_text = element.get_text(strip=True).replace(strong_text.strip(), "").strip() + "\n"
                    report_text.append(strong_text + remaining_text)
                elif element.find("b"):  # Check for <b> within <p>
                    b_text = element.find("b").get_text(strip=True) + "\n"
                    remaining_text = element.get_text(strip=True).replace(b_text.strip(), "").strip() + "\n"
                    report_text.append(b_text + remaining_text)
                else:
                    report_text.append(element.get_text(strip=True))

            district_report = "\n".join(report_text)

            years.append(year)
            months.append(month)
            district_names.append(districts_full[district])
            district_texts.append(district_report)
            

### Combine all reports to data frame

In [19]:
reports = pd.DataFrame({"year": years,
              "month": months,
              "district": district_names,
              "report": district_texts})

In [20]:
reports.head()

Unnamed: 0,year,month,district,report
0,1993,1,Atlanta,"‹ Back to Archive Search\nJanuary 21, 1993\nOv..."
1,1993,1,Boston,"‹ Back to Archive Search\nJanuary 21, 1993\n\n..."
2,1993,1,Chicago,"‹ Back to Archive Search\nJanuary 21, 1993\n\n..."
3,1993,1,Cleveland,"‹ Back to Archive Search\nJanuary 21, 1993\n\n..."
4,1993,1,Dallas,"‹ Back to Archive Search\nJanuary 21, 1993\n\n..."


### Clean data frame and remove formatting marks

In [None]:
reports['report'] = reports['report'].str.replace("Back to Archive Search", "")
reports['report'] = reports['report'].str.replace("‹ ", "")
reports.head()

Unnamed: 0,year,month,district,report
0,1993,1,Atlanta,"‹ \nJanuary 21, 1993\nOverviewAccording to Six..."
1,1993,1,Boston,"‹ \nJanuary 21, 1993\n\n\nEconomic activity in..."
2,1993,1,Chicago,"‹ \nJanuary 21, 1993\n\n\nSummary\nThe Seventh..."
3,1993,1,Cleveland,"‹ \nJanuary 21, 1993\n\n\nSummary\nBusiness ac..."
4,1993,1,Dallas,"‹ \nJanuary 21, 1993\n\n\nWhile District econo..."


In [None]:
reports['report'] = reports['report'].str.replace('\r', '', regex=False).str.strip()
reports['report'] = reports['report'].str.replace('\n', ' ', regex=False).str.strip()

In [39]:
reports.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   year      384 non-null    int64 
 1   month     384 non-null    int64 
 2   district  384 non-null    object
 3   report    384 non-null    object
dtypes: int64(2), object(2)
memory usage: 12.1+ KB


### Split the texts into sentences

In [22]:
from nltk.tokenize.punkt import PunktSentenceTokenizer

sentence_tokenizer = PunktSentenceTokenizer()
beige_book_sentences = []

for report in reports['report']:
  sentences = sentence_tokenizer.tokenize(report)
  for sentence in sentences:
    beige_book_sentences.append(sentence)

In [23]:
len(beige_book_sentences)

25519

### Sample 10% of all sentences 

In [None]:
import random
random.seed(42)

split_index = int(0.1 * len(beige_book_sentences))
random.shuffle(beige_book_sentences)

sampled_sentences = beige_book_sentences[:split_index]

### Label the sentences with GPT-4 API: classify the sentiment (positive, neutral, negative)

In [25]:
import os
from openai import AzureOpenAI
#from google.colab import userdata

gpt_model = "gpt-4"

client = AzureOpenAI(
    api_key="4454e1ff9dc84d3e90ea5a61ce447d8f",
    api_version="2023-03-15-preview",
    azure_endpoint="https://gpt-ban443-2.openai.azure.com/openai/deployments/Group04/chat/completions?api-version=2023-03-15-preview",
)

In [26]:
# Test that everything is working
messages = [{"role": "user", "content": "Hello, ready for some AI analysis?"}]
response = client.chat.completions.create(model=gpt_model, messages=messages, max_tokens=50)

print("Response: \n", response.choices[0].message.content)

Response: 
 Absolutely! I'm ready to assist you with any AI analysis you need. What specific topic or area would you like to focus on?


In [28]:
from openai.types.chat import ChatCompletionUserMessageParam, ChatCompletionSystemMessageParam

sentiment_scores = []

for sentence in sampled_sentences:
  messages = [
      ChatCompletionSystemMessageParam(
          role="system", content="""
          You are a helpful assistant, fluent in analysing economic and financial reports.
          Your main task is to analyse FED's Beige Book reports split into sentences, and estimate the sentiment of each sentence.
          The estimated sentiment scores should be: positive, neutral, negative - nothing else.
          When responding, give me only the sentiment score, without any parenthesis and other characters."""
      ),
      ChatCompletionUserMessageParam(
          role="user", content=f"""
          What is the sentiment score of a given sentence?
          <data>{sentence}</data>"""
      ),
  ]

  completion = client.chat.completions.create(model=gpt_model, messages=messages, temperature=0.2)
  sentiment_scores.append(completion.choices[0].message.content)


### Combine the results into a data frame and save them

In [29]:
training_data = pd.DataFrame({"sentence": sampled_sentences, "sentiment": sentiment_scores})
training_data.head()

Unnamed: 0,sentence,sentiment
0,"In Washington, cutbacks at Boeing are \r\nproc...",negative
1,"Delinquency rates \r\nare still rising, as the...",negative
2,Manufacturing\n: Factory operations have slowe...,negative
3,One of the largest auto dealers in the Distric...,positive
4,"The suburban Boston office market, by contrast...",positive


In [30]:
training_data.loc[3, 'sentence']

'One of the largest auto dealers in the District noted that \r\nprofitability has been improving to a somewhat greater extent than \r\nvolume.'

In [32]:
training_data.to_csv("finBERT_training_data.csv", index=False, sep='\t')