In [1]:
import os
import pandas as pd

import pysentiment2 as ps

from dotenv import load_dotenv, find_dotenv
from rich import print

import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, pipeline
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader


import nltk
from nltk.corpus import stopwords
import spacy
from transformers import BertTokenizer, BertForSequenceClassification
from nltk.corpus import stopwords
from textblob import TextBlob
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
%load_ext rich

load_dotenv(find_dotenv())

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


[3;91mFalse[0m

In [None]:
# Fetch every NLTK resource this notebook relies on, not only the stop-word list:
#   - 'stopwords'           -> stopwords.words('english') in remove_stopwords
#   - 'punkt' / 'punkt_tab' -> nltk.word_tokenize in text_tokenize (which of the
#                              two is looked up depends on the installed NLTK version)
#   - 'vader_lexicon'       -> SentimentIntensityAnalyzer in the VADER section
# nltk.download reports and skips packages that are already up to date.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'vader_lexicon'):
    nltk.download(nltk_resource)

### Load documents

In [4]:
# Names of every entry found in the extraction folder.
documents_list = os.listdir("./extracted/")

# Map the part of each file name before the first "_" to that file's full text.
# Only ".txt" entries are read. Each file is opened in a `with` block so its
# handle is closed immediately instead of lingering until garbage collection.
# If two files share the same prefix, the one listed later overwrites the earlier.
docs = {}
for filename in documents_list:
    if not filename.endswith(".txt"):
        continue
    with open(f"./extracted/{filename}", "r") as handle:
        docs[filename.split("_")[0]] = handle.read()


### Inspect a raw document

In [5]:
# Sanity check: show the raw text stored under the 'AAPL' key before any preprocessing.
# NOTE(review): `print` here is rich's print (imported in the first cell), so
# square-bracketed text inside the document may be interpreted as console markup — confirm
# that is acceptable, and consider printing only a slice if the document is long.
print(docs['AAPL'])

## Preprocessing

In [77]:
from nltk.stem import WordNetLemmatizer
def clean_text(text):
    """Normalise raw document text ahead of tokenisation.

    The substitutions run in this order: newlines and carriage returns each
    become a single space, every ``&#`` sequence becomes a space, every
    backslash-followed-by-space pair is removed, and finally the whole
    string is lower-cased.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    substitutions = (
        ('\n', ' '),
        ('\r', ' '),
        ('&#', ' '),
        ("\\ ", ''),
    )
    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned.lower()

def text_tokenize(text):
    """Split ``text`` into tokens with ``nltk.word_tokenize`` and return the result.

    No filtering happens here; stop words are removed later by remove_stopwords.
    """
    return nltk.word_tokenize(text)

def remove_stopwords(text):
    """Drop English stop words from a token sequence and join what remains.

    Args:
        text: An iterable of tokens. Despite the parameter name this is not a
            raw string: preprocessing_text passes in the output of
            text_tokenize. Matching is case-sensitive.

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    # Fetch the stop-word list once per call rather than once per token, and
    # keep it in a set so each membership check is a hash lookup.
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = [word for word in text if word not in english_stopwords]
    return ' '.join(kept_tokens)

# Loaded spaCy pipelines keyed by model name, so each model is loaded only once
# per kernel session instead of on every lemmatize_text call.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name='en_core_web_sm'):
    """Return the spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def lemmatize_text(text):
    """Replace every token of ``text`` with its spaCy lemma.

    Args:
        text: The string to lemmatise.

    Returns:
        str: The lemmas of all tokens produced by the 'en_core_web_sm'
        pipeline, joined by single spaces.
    """
    # Reuse the cached pipeline; loading the model is the expensive step.
    nlp = _get_spacy_pipeline()
    doc = nlp(text)
    return ' '.join(token.lemma_ for token in doc)

def preprocessing_text(text):
    """Run the full preprocessing chain on one raw document.

    Stages, in order: clean_text, text_tokenize, remove_stopwords,
    lemmatize_text. The value returned by lemmatize_text is the result.
    """
    cleaned = clean_text(text)
    tokens = text_tokenize(cleaned)
    without_stopwords = remove_stopwords(tokens)
    return lemmatize_text(without_stopwords)

## Sentiment analysis

### Loughran and McDonald Financial Sentiment Dictionaries

In [44]:
# Loughran-McDonald dictionary scorer from pysentiment2; the scoring cells below
# call its tokenize() and get_score() methods.
lm = ps.LM()

In [5]:
# Baseline: score every raw (not preprocessed) document with the LM dictionary.
for doc_key, doc_text in docs.items():
    lm_tokens = lm.tokenize(doc_text)
    lm_score = lm.get_score(lm_tokens)
    print(f"[bold green]Sentiment for {doc_key}[/bold green]\n", lm_score)


In [80]:

# Same LM scoring, but on the output of the preprocessing chain.
for doc_key, doc_text in docs.items():
    prepared_text = preprocessing_text(doc_text)
    lm_score = lm.get_score(lm.tokenize(prepared_text))
    print(f"[bold green]Sentiment for {doc_key}[/bold green]\n", lm_score)
 



### Using `TextBlob`

In [81]:
# TextBlob sentiment of each preprocessed document.
for doc_key, doc_text in docs.items():
    blob = TextBlob(preprocessing_text(doc_text))
    print(f"[bold green]Sentiment for {doc_key}[/bold green]\n", blob.sentiment)
 


### Using VADER

In [82]:
# One analyzer instance is shared across all documents.
vader = SentimentIntensityAnalyzer()

# VADER polarity scores of each preprocessed document.
for doc_key, doc_text in docs.items():
    prepared_text = preprocessing_text(doc_text)
    polarity = vader.polarity_scores(prepared_text)
    print(f"[bold green]Sentiment for {doc_key}[/bold green]\n", polarity)




### Using pre-trained model

@misc{yang2020finbert,
    title={FinBERT: A Pretrained Language Model for Financial Communications},
    author={Yi Yang and Mark Christopher Siy UY and Allen Huang},
    year={2020},
    eprint={2006.08097},
    archivePrefix={arXiv},
    }

**Note:** the model accepts at most 512 tokens per input, so longer text must be truncated before it is classified.

In [67]:
# Hub identifier shared by the classifier weights and the matching tokenizer.
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'

# Three-class sequence classifier and the tokenizer for the same checkpoint.
finbert = BertForSequenceClassification.from_pretrained(
    FINBERT_CHECKPOINT,
    num_labels=3,
)
tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)

config.json: 100%|██████████| 533/533 [00:00<00:00, 480kB/s]
pytorch_model.bin: 100%|██████████| 439M/439M [00:43<00:00, 10.1MB/s] 
vocab.txt: 100%|██████████| 226k/226k [00:00<00:00, 8.97MB/s]


In [84]:


def get_predictions(text):
    """Classify the tone of ``text`` with the module-level ``finbert`` model.

    Args:
        text: A single string to classify. Input longer than 512 tokens is
            truncated by the tokenizer rather than failing inside the model.

    Returns:
        tuple: ``(text, '----', label)`` where ``label`` is one of
        'neutral', 'positive' or 'negative'.
    """
    # Truncate at the token level: the model cannot take more than 512 tokens.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        logits = finbert(**inputs)[0]

    # Class-index -> label mapping used by this notebook for finbert-tone.
    labels = {0: 'neutral', 1: 'positive', 2: 'negative'}
    predicted_index = int(np.argmax(logits.detach().numpy()))

    return (text, '----', labels[predicted_index])

# FinBERT tone per document. The [:512] slice keeps the first 512 *characters*
# of the preprocessed text (not tokens) before handing it to get_predictions.
for doc_key, doc_text in docs.items():
    snippet = preprocessing_text(doc_text)[:512]
    prediction = get_predictions(snippet)
    print(f"[bold green]Sentiment for {doc_key}[/bold green]\n", prediction)