Sentiment analysis using FinBERT

Importing relevant packages

In [43]:
import os
import pandas as pd
import torch
import torch.nn.functional as F
import spacy
from transformers import AutoModelForSequenceClassification, AutoTokenizer


Initializing classes, directories, and variables

In [44]:
# Load the Spacy English language model
nlp = spacy.load('en_core_web_lg')

In [45]:
# Define the sentiment classes and the corresponding sentiment scores
sentiment_classes = {0: 'Positive', 1: 'Neutral', 2: 'Negative'}
sentiment_scores = {0: 1, 1: 0, 2: -1}

In [46]:
# Set the directory containing the CSV files
dir_path = '/Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/data/17a_exports'

Tokenization

In [47]:
# Load the FinBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')
model = AutoModelForSequenceClassification.from_pretrained('ProsusAI/finbert')

Running the input into the FinBERT Model

Document level

In [48]:
# Initialize an empty list to store the results
results_doc = []

# Iterate through all the text files in the directory
for filename in os.listdir(dir_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(dir_path, filename)
        
        # Read the contents of the text file
        with open(file_path, 'r') as file:
            text = file.read().replace('\n', '')
        
        # Tokenize the text using the BERT tokenizer
        tokens = tokenizer.encode_plus(text, max_length=512, truncation=True, padding='max_length',
                                       add_special_tokens=True, return_tensors='pt')
        
        # Get the model's output for the tokenized text
        output = model(**tokens)
        
        # Apply softmax to the logits output tensor of our model (in index 0) across dimension -1
        probs = F.softmax(output[0], dim=-1)
        
        # Get the index of the predicted sentiment class
        pred_class_idx = torch.argmax(probs, dim=1)
        
        # Map the predicted sentiment class to a sentiment score and interpretation
        sentiment_score = sentiment_scores[pred_class_idx.item()]
        interpretation = sentiment_classes[pred_class_idx.item()]
        
        # Add the sentiment score and interpretation to the list of results
        results_doc.append({'file_name': filename, 'sentiment_score': sentiment_score, 'interpretation': interpretation})

In [49]:
# Create a dataframe with the sentiment score and interpretation for each file
df_results_doc = pd.DataFrame(results_doc)

# Print the results dataframe
print(df_results_doc)


      file_name  sentiment_score interpretation
0  MEG_2017.txt                1       Positive
1  MEG_2020.txt                0        Neutral
2  MEG_2021.txt                0        Neutral
3  MEG_2019.txt                1       Positive
4  MEG_2018.txt                1       Positive


Sentence level

In [54]:
# Define the sentiment scores and classes
sentiment_scores_sent = [1, 0, -1]
sentiment_classes_sent = ['positive', 'neutral', 'negative']

# Initialize the results list and counters
results_sent = []
pos_count = 0
neu_count = 0
neg_count = 0

# Loop over each text file in the directory
for filename in os.listdir(dir_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(dir_path, filename)

        # Read the contents of the text file
        with open(file_path, 'r') as file:
            text = file.read()

        # Use Spacy to split the text into sentences
        doc = nlp(text)
        sentences = [sent.text for sent in doc.sents]

        # Initialize the sentence-level results list and counters
        sentence_results = []
        pos_sent_count = 0
        neu_sent_count = 0
        neg_sent_count = 0

        # Loop over each sentence in the text file
        for sentence in sentences:
            # Tokenize the sentence using the FinBERT tokenizer
            tokens = tokenizer.encode_plus(sentence, max_length=512, truncation=True, padding='max_length',
                                           add_special_tokens=True, return_tensors='pt')

            # Get the model's output for the tokenized sentence
            output = model(**tokens)

            # Apply softmax to the logits output tensor of our model (in index 0) across dimension -1
            probs = F.softmax(output[0], dim=-1)

            # Get the index of the predicted sentiment class
            pred_class_idx = torch.argmax(probs, dim=1)

            # Map the predicted sentiment class to a sentiment score and interpretation
            sentiment_score = sentiment_scores_sent[pred_class_idx.item()]
            interpretation = sentiment_classes_sent[pred_class_idx.item()]

            # Add the sentiment score and interpretation to the sentence-level results list
            sentence_results.append({'sentence': sentence.strip(), 'sentiment_score': sentiment_score, 'interpretation': interpretation})

            # Update the sentence-level sentiment counters
            if interpretation == 'positive':
                pos_sent_count += 1
            elif interpretation == 'neutral':
                neu_sent_count += 1
            elif interpretation == 'negative':
                neg_sent_count += 1

        # Calculate the overall sentiment score for the corpus
        sentiment_score_corpus = (pos_sent_count - neg_sent_count) / len(sentences)

        # Add the sentence-level and file-level results to the results list
        results_sent.append({'file_name': filename, 'sentences': sentence_results, 
                              'positive_sent_count': pos_sent_count,
                              'neutral_sent_count': neu_sent_count,
                              'negative_sent_count': neg_sent_count,
                              'total_sent_count': len(sentences),
                              'sentiment_score_corpus': sentiment_score_corpus})
        
        # Update the file-level sentiment counters
        pos_count += pos_sent_count
        neu_count += neu_sent_count
        neg_count += neg_sent_count


Creating a dataframe to show results

In [55]:
# Create a pandas DataFrame from the results list
df = pd.DataFrame(results_sent)

# Add a column for the total sentence count
df['sentence_count'] = df['sentences'].apply(lambda x: len(x))

# Add a column for the total token count
df['token_count'] = df['sentences'].apply(lambda x: sum(len(sentence['sentence'].split()) for sentence in x))

output_dir_path = '/Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/data/17a_exports'
output_file_name = 'sent_lvl.csv'
output_file_path = os.path.join(output_dir_path, output_file_name)

# save to csv
df.to_csv(output_file_path, index=False)

df

Unnamed: 0,file_name,sentences,positive_sent_count,neutral_sent_count,negative_sent_count,total_sent_count,sentiment_score_corpus,sentence_count,token_count
0,MEG_2017.txt,"[{'sentence': 'Megaworld, the country’s larges...",30,4,19,53,0.207547,53,1800
1,MEG_2020.txt,"[{'sentence': 'Megaworld, the country’s larges...",17,14,21,52,-0.076923,52,1603
2,MEG_2021.txt,"[{'sentence': 'Megaworld, the country’s larges...",17,14,21,52,-0.076923,52,1603
3,MEG_2019.txt,"[{'sentence': 'Megaworld, the country’s larges...",28,5,22,55,0.109091,55,1687
4,MEG_2018.txt,"[{'sentence': 'Megaworld, the country’s larges...",27,4,22,53,0.09434,53,1868


Checker

In [None]:
dir_path = '/Users/jomarjordas/Documents/MSFIN299/MSFIN299-Research/data/17a_scrubbed'

for filename in os.listdir(dir_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(dir_path, filename)
        df = pd.read_csv(file_path)
        print(f"The data type of {filename} is {type(df)}")