In [1]:
import json

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm.auto import tqdm

from prep import prep, item_parser


In [2]:
# Load the pretrained FinBERT tokenizer and sequence-classification model.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Filing index read from disk; later cells iterate it as
# acc_nr_store[cik] -> accession numbers. The context manager closes the
# file handle deterministically instead of leaving it to garbage collection.
with open("metastore/acc_nr_store.json", "r") as acc_nr_file:
    acc_nr_store = json.load(acc_nr_file)

In [3]:
class TenKDocument:
    """A document that is iterated in fixed-size slices and collects one
    sentiment row per slice.

    Parameters
    ----------
    document : sliceable sequence supporting ``len()``
        The content to split (presumably the cleaned filing text -- confirm
        against the caller).
    max_len : int, default 512
        Slice width, counted in elements of ``document`` (characters when
        ``document`` is a string), not in model tokens.
    """

    def __init__(self, document, max_len=512):
        self.document = document
        self.max_len = max_len
        # One entry per call to add_segment_sentiment, in call order.
        self.sentiment_store = []

    def __iter__(self):
        """Yield consecutive slices of ``document``, each at most ``max_len`` long."""
        chunk_starts = range(0, len(self.document), self.max_len)
        yield from map(self.build_partial, chunk_starts)

    def __len__(self):
        """Return the length of the underlying document (not the slice count)."""
        return len(self.document)

    def build_partial(self, start):
        """Return the slice ``document[start:start + max_len]``."""
        stop = start + self.max_len
        return self.document[start:stop]

    def add_segment_sentiment(self, sentiment):
        """Append the first row of ``sentiment`` to ``sentiment_store``.

        ``sentiment`` must support ``.detach().numpy()`` (e.g. a torch
        tensor); only row 0 is kept, stored as a list of its elements.
        """
        first_row = sentiment.detach().numpy()[0]
        self.sentiment_store.append(list(first_row))

    def get_document_sentiment(self):
        """Return the column-wise mean over all stored sentiment rows as a list."""
        segment_frame = pd.DataFrame(self.sentiment_store)
        column_means = segment_frame.mean()
        return column_means.to_list()

In [4]:
def predict_sentiment(text, tokenizer, model):
    """Return the model's class probabilities for ``text``.

    Parameters
    ----------
    text : str
        The text segment to score.
    tokenizer : callable
        Called as ``tokenizer(text, return_tensors="pt", truncation=True)``
        and expected to return a mapping of keyword arguments for ``model``.
    model : callable
        Called as ``model(**inputs)``; the result must expose ``.logits``.

    Returns
    -------
    torch.Tensor
        Softmax of the logits along dimension 1 (one probability row per
        input row).
    """
    # truncation=True asks the tokenizer to cut over-long input to its
    # configured maximum length (if it has one) rather than handing the
    # model a sequence it cannot process.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)

    # Pure inference: no labels are passed (they would only make the model
    # compute a loss against made-up targets), and autograd is disabled so
    # no gradient graph is built for the forward pass.
    with torch.no_grad():
        logits = model(**inputs).logits

    return torch.nn.functional.softmax(logits, dim=1)

In [None]:
# Score every filing listed in acc_nr_store segment by segment and persist
# the results as sentiment_store[cik][acc_nr] -> list of per-segment rows.
# The store starts empty on every run of this cell, so the membership checks
# below only skip entries that repeat within this run.
sentiment_store = {}
for cik in tqdm(acc_nr_store):
    if cik not in sentiment_store:
        sentiment_store[cik] = {}
    for acc_nr in acc_nr_store[cik]:
        if acc_nr in sentiment_store[cik]:
            continue

        # Load the raw filing and concatenate the item 7 and item 7a parts
        # returned by the parser before cleaning them.
        contents = prep.load_sec_filing(acc_nr, "filings", bs=False)
        item_1a_raw, item_7_raw, item_7a_raw = item_parser.get_items(contents)
        item_7 = item_7_raw + item_7a_raw
        item_7_soup = prep.load_bs(item_7)

        item_7_doc = prep.clean_filing(item_7_soup)

        ten_k = TenKDocument(item_7_doc)
        for segment in ten_k:
            try:
                sentiment_prediction = predict_sentiment(segment, tokenizer, model)
            except RuntimeError:
                # Best effort: report which filing failed and record an empty
                # row for this segment so the remaining segments still run.
                print(cik, acc_nr)
                sentiment_prediction = torch.tensor([[]], dtype=torch.float)
            ten_k.add_segment_sentiment(sentiment_prediction)

        # Round-trip through a DataFrame to get plain nested lists for JSON.
        sentiment_store[cik][acc_nr] = pd.DataFrame(ten_k.sentiment_store).values.tolist()

        # Checkpoint after every filing; the context manager guarantees the
        # file is flushed and closed before the next filing is processed.
        with open("metastore/sentiment_store_item7.json", "w") as sentiment_file:
            json.dump(sentiment_store, sentiment_file)

  0%|          | 0/100 [00:00<?, ?it/s]