In [None]:
import os

import nltk
import numpy as np
import pandas as pd
import pysentiment2 as ps
import spacy
from dotenv import find_dotenv, load_dotenv
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from rich import print
from textblob import TextBlob
from transformers import BertForSequenceClassification, BertTokenizer

%load_ext rich

load_dotenv(find_dotenv())

In [None]:
# NLTK data used further down in this notebook:
#   - 'stopwords'       -> stopwords.words('english') in remove_stopwords
#   - 'vader_lexicon'   -> SentimentIntensityAnalyzer
#   - 'punkt'/'punkt_tab' -> nltk.word_tokenize in text_tokenize (which of the
#     two is required depends on the installed NLTK release, so fetch both)
for resource in ('stopwords', 'vader_lexicon', 'punkt', 'punkt_tab'):
    nltk.download(resource)

### Load documents

In [None]:
# Read every ``*.txt`` file in ./extracted/ into `docs`, keyed by the part of
# the file name before the first underscore.
EXTRACTED_DIR = "./extracted/"

# Sorted so that, if two files share the same key prefix, the one that ends up
# in `docs` does not depend on the arbitrary order returned by os.listdir.
documents_list = sorted(os.listdir(EXTRACTED_DIR))

docs = {}
for doc in documents_list:
    if not doc.endswith(".txt"):
        continue
    # `with` closes each file handle as soon as it has been read.
    # NOTE(review): no encoding is passed, so the platform default is used --
    # confirm the encoding of the extracted files.
    with open(os.path.join(EXTRACTED_DIR, doc), "r") as handle:
        docs[doc.split("_")[0]] = handle.read()


### Inspect a sample document

In [None]:
# Show only the beginning of one document: printing the full text floods the
# notebook output and bloats the saved file.
SAMPLE_PREVIEW_CHARS = 1000
print(docs['AAPL'][:SAMPLE_PREVIEW_CHARS])

## Preprocessing

In [None]:
def clean_text(text):
    """Normalise raw document text.

    Newlines, carriage returns and the ``&#`` sequence are replaced by a
    single space, every backslash-followed-by-space pair is removed, and the
    result is lower-cased.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    # Applied strictly in this order; a later pattern can match text produced
    # by an earlier substitution (e.g. "\\\n" becomes "\\ " and is then removed).
    substitutions = (
        ('\n', ' '),
        ('\r', ' '),
        ('&#', ' '),
        ("\\ ", ''),
    )
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    return text.lower()

def text_tokenize(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return the tokens.

    No filtering is applied here; stop words are still present in the result.
    """
    return nltk.word_tokenize(text)

def remove_stopwords(text):
    """Drop English stop words from a token sequence and re-join the rest.

    Args:
        text: An iterable of word tokens (despite the name, not a single
            string -- it is the output of ``text_tokenize``).

    Returns:
        The remaining tokens joined by single spaces, as one string.
    """
    # Fetch the stop-word list once per call and keep it in a set; the list
    # was previously re-evaluated (and scanned linearly) for every token.
    english_stopwords = frozenset(stopwords.words('english'))
    return ' '.join(word for word in text if word not in english_stopwords)

# spaCy pipelines already loaded in this kernel, keyed by model name.
_SPACY_PIPELINES = {}


def _load_spacy_pipeline(model_name):
    """Return the spaCy pipeline for ``model_name``, loading it at most once."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def lemmatize_text(text):
    """Replace every token of ``text`` by its lemma.

    Args:
        text: The string to lemmatize.

    Returns:
        The lemmas of all tokens produced by the ``en_core_web_sm`` pipeline,
        joined by single spaces.
    """
    # Reuse the cached pipeline: calling spacy.load for every document (and
    # again in every sentiment section) repeats the same model load each time.
    nlp = _load_spacy_pipeline('en_core_web_sm')
    return ' '.join(token.lemma_ for token in nlp(text))

def preprocessing_text(text):
    """Run the full preprocessing pipeline on one document.

    The stages are, in order: ``clean_text``, ``text_tokenize``,
    ``remove_stopwords`` and ``lemmatize_text``.

    Args:
        text: The raw document string.

    Returns:
        The value returned by ``lemmatize_text`` for the processed document.
    """
    cleaned = clean_text(text)
    tokens = text_tokenize(cleaned)
    without_stopwords = remove_stopwords(tokens)
    return lemmatize_text(without_stopwords)

## Sentiment analysis

### Loughran and McDonald Financial Sentiment Dictionaries

In [None]:
# Instantiate the `LM` analyzer from pysentiment2; the next cell uses it to
# tokenize and score every document.
lm = ps.LM()

In [None]:
# Score each preprocessed document with the `lm` analyzer and print the result
# under a header naming the document key.
for doc_key, raw_text in docs.items():
    lm_tokens = lm.tokenize(preprocessing_text(raw_text))
    lm_score = lm.get_score(lm_tokens)
    print(f"[bold green]Sentiment for {doc_key}[/bold green]\n", lm_score)

### Using `TextBlob`

In [None]:
# Print the TextBlob `.sentiment` of each preprocessed document under a header
# naming the document key.
for doc_key, raw_text in docs.items():
    blob = TextBlob(preprocessing_text(raw_text))
    print(f"[bold green]Sentiment for {doc_key}[/bold green]\n", blob.sentiment)

### Using VADER

In [None]:
# One analyzer instance is shared by all documents.
vader = SentimentIntensityAnalyzer()

# Print the VADER polarity scores of each preprocessed document under a header
# naming the document key.
for doc_key, raw_text in docs.items():
    vader_scores = vader.polarity_scores(preprocessing_text(raw_text))
    print(f"[bold green]Sentiment for {doc_key}[/bold green]\n", vader_scores)

### Using pre-trained model

```bibtex
@misc{yang2020finbert,
    title={FinBERT: A Pretrained Language Model for Financial Communications},
    author={Yi Yang and Mark Christopher Siy UY and Allen Huang},
    year={2020},
    eprint={2006.08097},
    archivePrefix={arXiv},
}
```

FinBERT accepts at most 512 tokens per input, so longer documents must be truncated before classification.

In [None]:
# Both the tokenizer and the 3-label classification model come from the same
# pretrained checkpoint.
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'

tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
finbert = BertForSequenceClassification.from_pretrained(
    FINBERT_CHECKPOINT,
    num_labels=3,
)

In [None]:
def get_predictions(text, max_length=512):
    """Classify the tone of a single text with the `finbert` model.

    Args:
        text: The string to classify.
        max_length: Maximum number of tokens handed to the model. Longer
            inputs are truncated by the tokenizer instead of being passed
            through at full length.

    Returns:
        A tuple ``(text, "----", label)`` where ``label`` is one of
        ``"neutral"``, ``"positive"`` or ``"negative"``.
    """
    # Truncate at token level: the length limit is counted in tokens, so
    # slicing the string by characters neither guarantees the limit nor uses
    # the available context.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    logits = finbert(**inputs)[0]

    labels = {0: "neutral", 1: "positive", 2: "negative"}

    # A single string is scored, so only the first row of logits is relevant.
    scores = logits.detach().numpy()
    return (text, "----", labels[int(np.argmax(scores[0]))])


for k, v in docs.items():
    # Pass the whole preprocessed document; get_predictions truncates it to the
    # token limit. Only the first 512 characters are echoed to keep the printed
    # output as short as before.
    text, separator, label = get_predictions(preprocessing_text(v))
    print(
        f"[bold green]Sentiment for {k}[/bold green]\n",
        (text[:512], separator, label),
    )