# How to do basic sentiment analysis using BERT

### Step 1: Import necessary packages

In [None]:
# for webscraping
import requests
from bs4 import BeautifulSoup 

# for text processing
import nltk
from nltk import sent_tokenize, word_tokenize
nltk.download('punkt')

# for the language model
# you might need to install transformers first using either !pip install transformers or Qt Console in Anaconda Navigator
from transformers import BertTokenizer, BertForSequenceClassification

# for formatting
import numpy as np

### Step 2a: Get your text data (usually via an API or web scraping)

Here, we are just looking at Federal Reserve press releases from Q1 2023. We use BeautifulSoup to scrape the text from each press release's web page URL. To collect other kinds of text you can use a dedicated API instead (e.g. Reddit API, Twitter API, Dow Jones API, etc.), in which case your scraping code will look different.

### Step 2b: Clean and pre-process the text for putting sentences through model
In general, you'll want to make everything lowercase and get rid of excess whitespace (tabs, new lines, etc.). Other than that, the way you clean text should be specific to the text itself. Your text may repeatedly include phrases or characters that you want to exclude; you can use regular expressions to delete or replace them.

In [None]:
#urls of the press releases to analyze, listed by hand

#2023 q1 (the file names carry the release dates: 2023-02-01 and 2023-03-22)
q1urls2023 = [
    'https://www.federalreserve.gov/newsevents/pressreleases/monetary20230201a.htm',
    'https://www.federalreserve.gov/newsevents/pressreleases/monetary20230322a.htm',
]

In [None]:
#scrape text from urls

#phrases that bracket the relevant text on each page
START_MARKER = 'Recent indicators' #relevant text starts here
END_MARKER = 'Voting for the monetary policy action' #relevant text ends just before this


def _clean_text(text):
    """Return the relevant part of a scraped page as lowercase, ASCII-only text.

    Keeps the span from START_MARKER up to (not including) END_MARKER,
    collapses every run of whitespace into a single space, lowercases,
    expands 'u.s.', drops non-ASCII characters and rewrites fractions
    such as '4-1/2' as decimals ('4.5').

    If a marker is missing, a warning is printed and that side of the
    text is left untrimmed.
    """
    #delete stuff before relevant text
    #(str.find returns -1 when the phrase is missing; slicing with -1 would keep only the last character)
    index = text.find(START_MARKER)
    if index != -1:
        text = text[index:]
    else:
        print('warning: start marker not found, keeping text from the beginning')
    #delete stuff after relevant text
    #(same guard: text[:-1] would silently drop the final character)
    index = text.find(END_MARKER)
    if index != -1:
        text = text[:index]
    else:
        print('warning: end marker not found, keeping text to the end')
    #collapse white space (new lines, tabs, non-breaking spaces) into single spaces
    #deleting '\n' outright would glue the last word of one line to the first word of the next
    text = " ".join(text.split())
    #lowercase
    text = text.lower()
    #cleaning u.s.
    text = text.replace('u.s.', "united states")
    #remove non-ascii characters, then tidy up any double spaces this leaves behind
    text = text.encode(encoding="ascii", errors="ignore").decode("ascii")
    text = " ".join(text.split())
    #cleaning fractions: mixed numbers first ('4-1/2' -> '4.5'), then bare fractions ('1/2' -> '0.5')
    text = text.replace('-1/4', ".25")
    text = text.replace('-1/2', ".5")
    text = text.replace('-3/4', ".75")
    text = text.replace('1/4', "0.25")
    text = text.replace('1/2', "0.5")
    text = text.replace('3/4', "0.75")
    return text


textlist = [] #to store clean text later

for url in q1urls2023: #iterate through urls in list
    #timeout so a stalled connection cannot hang the notebook forever
    reqs = requests.get(url, timeout=30)
    #stop on http errors instead of cleaning and analyzing an error page
    reqs.raise_for_status()
    soup = BeautifulSoup(reqs.text, 'html.parser')
    text = _clean_text(soup.get_text())
    textlist.append(text)

#tokenize every scraped document into sentences (one list of sentences per url)
sentences = [sent_tokenize(document) for document in textlist]

#using list comprehension to store each sentence as 1 item in a list
finalsentences = [item for sublist in sentences for item in sublist]

#then each sentence should be 1 string item in a list
print('cleaned tokenized sentences:\n',finalsentences)

In [None]:
#save the cleaned sentences to a text file, one sentence per line
#if 'w' it'll write to file / if 'a', it'll append to the file
# 'w' is fine for single batch processing, if using multiple batches due to data size then 'a' is more useful
with open("QSSsample.txt","w",encoding="utf-8") as txt:
    #writelines does not add separators itself, so add the newline to each sentence;
    #without it all the sentences run together on a single line
    txt.writelines(sentence + "\n" for sentence in finalsentences)

### Step 3: Run the model and record results
There are different types of sentiment analysis models. BERT (which stands for "Bidirectional Encoder Representations from Transformers") is one of them. There are all kinds of BERTs that are fine-tuned (i.e. specially trained) for specific tasks, like reading Tweets, recognizing hate speech, or detecting specific emotions. Here, I use FinBERT, a BERT model fine-tuned to understand sentiment from financial news.

The BERT model reads each sentence then determines if it is positive, neutral, or negative. We can record these results, along with the counts of positive, neutral, and negative sentences. We might want to convert these counts into percentages to make them more universally comparable.

In [None]:
#torch is already required by transformers' BertForSequenceClassification; imported here for no_grad
import torch

#importing the model we want to use
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

#feed in the sentences from earlier
sentences = finalsentences

#BERT has its own way of encoding and processing sentences
#padding makes every sentence in the batch the same length
#truncation caps very long sentences at 512 tokens so they cannot overflow the model's input size
inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)

#we are only predicting, not training, so skip gradient tracking to save memory
with torch.no_grad():
    outputs = finbert(**inputs)[0] #first element of the model output: one row of 3 scores per sentence

#this BERT tells us if the sentiment of each sentence is positive, neutral, or negative
#(the index order is specific to this model; check the model card if you swap models)
labels = {0:'neutral', 1:'positive',2:'negative'}

#pick the highest-scoring label for every sentence in one go
predictions = np.argmax(outputs.numpy(), axis=1)

#get ready to count the number of pos, neu, and neg sentences
counts = {'positive':0, 'neutral':0, 'negative':0}

#open a text file to record our results if we want to manually view each sentence and its sentiment result
#note: 'a' appends, so re-running this cell adds the same sentences to the file again
with open("QSSsampleresults.txt", "a", encoding="utf-8") as f:
    #iterate through all the sentences together with their predicted label index
    for sent, pred in zip(sentences, predictions):
        sentiment = labels[int(pred)]
        #add 1 to the count for this sentiment
        counts[sentiment] += 1
        #also record the sentence and its sentiment result into the text file
        f.write(sent +"\t*" + sentiment + "*\n")

#keep the individual counters for the summary step
poscount = counts['positive']
neucount = counts['neutral']
negcount = counts['negative']

In [None]:
#get a total count so that we can convert our counts into percentages later
totalcount = poscount + neucount + negcount

#printing count results
print('# positive sentences: ',poscount)
print('# neutral sentences: ',neucount)
print('# negative sentences: ',negcount)

#finding percentages too
#(skip them when nothing was classified, otherwise the division below raises ZeroDivisionError)
if totalcount == 0:
    print('no sentences were classified, so percentages cannot be computed')
else:
    print('% positive sentences: {:.0%}'.format(poscount/totalcount))
    print('% neutral sentences: {:.0%}'.format(neucount/totalcount))
    print('% negative sentences: {:.0%}'.format(negcount/totalcount))