In [1]:
import pymongo
import pandas as pd
import requests as req
import logging
import json
import torch
from tqdm import tqdm #For Progress Bars
from bs4 import BeautifulSoup as bs
import re
import xml.etree.ElementTree as et
import numpy as np
import torch

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

In [3]:
sentences = [
    "The Company’s financial instruments that are exposed to concentrations of credit risk consist primarily of cash, cash equivalents, restricted cash, available-for-sale securities, and accounts receivable."
    "Although the Company maintains cash deposits, cash equivalent balances, and available-for-sale securities with multiple financial institutions, the deposits, at times, may exceed federally insured limits.",
    "Cash and cash equivalents may be withdrawn or redeemed on demand.",
    "The Company believes that the financial institutions that hold its cash and cash equivalents and restricted cash are financially sound and, accordingly, minimal credit risk exists with respect to these balances.",
    "The Company also maintains investments in U.S. treasury securities, U.S. government agency securities, commercial paper, and corporate bonds that carry high credit ratings and accordingly, minimal credit risk exists with respect to these balances.",
    "Cash equivalents consist of money market funds, commercial paper, and corporate bonds which are invested through financial institutions in the United States."
]

In [4]:
inputs = tokenizer(sentences, return_tensors="pt", padding=True)
outputs = finbert(**inputs)[0]
labels = {0:'neutral', 1:'positive',2:'negative'}
for idx, sent in enumerate(sentences):
    print(sent, '----', labels[np.argmax(outputs.detach().numpy()[idx])])

The Company’s financial instruments that are exposed to concentrations of credit risk consist primarily of cash, cash equivalents, restricted cash, available-for-sale securities, and accounts receivable.Although the Company maintains cash deposits, cash equivalent balances, and available-for-sale securities with multiple financial institutions, the deposits, at times, may exceed federally insured limits. ---- neutral
Cash and cash equivalents may be withdrawn or redeemed on demand. ---- neutral
The Company believes that the financial institutions that hold its cash and cash equivalents and restricted cash are financially sound and, accordingly, minimal credit risk exists with respect to these balances. ---- positive
The Company also maintains investments in U.S. treasury securities, U.S. government agency securities, commercial paper, and corporate bonds that carry high credit ratings and accordingly, minimal credit risk exists with respect to these balances. ---- neutral
Cash equivale

In [5]:
head = {
    "User-Agent": "Digital-Alpha SEC Explorer/1.0",
    "Connection": "keep-alive"
}

company_data = pd.read_csv('company_summary.csv')

logging.basicConfig(filename='error.log', encoding='utf-8', level=logging.DEBUG)

In [6]:
def has_keypoints(str, list):
    for key in list:
        if key in str.lower():
            return True
    return False

In [7]:
mongo_url = "mongodb+srv://huntrag:killsasuke@cluster0.staij.mongodb.net/myFirstDatabase?authSource=admin&replicaSet=atlas-a852oq-shard-0&w=majority&readPreference=primary&appname=MongoDB%20Compass&retryWrites=true&ssl=true"
try:
    client = pymongo.MongoClient(mongo_url)
    print("Connected to MongoDb Successfully")
except:
    logging.error("DB CONN ERROR: Couldn't connect to DB successfully")

mydb = client['tech-meet']

form_sentiment = mydb['form-sentiment']

Connected to MongoDb Successfully


In [12]:
def getSentiment(cik,asc):
    asc = asc.replace("-", "")
    cik = str(cik)
    base_url = "https://www.sec.gov/Archives/edgar/data/" + cik + "/" + asc
    xml_tree = req.get(base_url + "/FilingSummary.xml", headers=head)
    trigger_one = 'DISCLOSURE' # We will primarily look for these documents
    trigger_list = ['ACCOUNTING','TAXES','RISK','LEASES', 'DEBT','COMMITMENTS','STOCK',] # List of triggers to look for in each document name
    resdata = {} #Dict to hold end results
    root = et.fromstring(xml_tree.text) # Fetch XML ROOT
    for report in root.iter('Report'):
            for trigger_word in trigger_list:
                #Check if the trigger words exists in the document name
                partname = report.find('LongName').text
                if (trigger_word.lower() in partname.lower() and trigger_one.lower() in partname.lower()):
                    if ("table" in partname.lower() or "details" in partname.lower()):
                        continue
                    soup = bs(req.get(base_url + '/' + report.find('HtmlFileName').text,headers=head).text, 'lxml')
                    # Fragment the stripped text into list of sentences
                    txt_list_unrefined = re.split( r' *[\n\.\?!][\'"\)\]]* *' , soup.text.strip())
                    # Remove short sequences
                    txt_list = [ txt for txt in txt_list_unrefined if len(txt) > 100 ]
                    # Encode The Sentence
                    encoded_text = tokenizer(txt_list, return_tensors="pt", padding=True)
                    # Predict the sentiment tensor
                    sentiment_tensor = finbert(**encoded_text)[0].detach().numpy()
                    # Get the non-zero sentiments
                    refined_sentiment = [i for i in sentiment_tensor if np.argmax(i) != 0]
                    #Run their average, if all sentiments are neutral, average those instead
                    if refined_sentiment:
                        average_sentiment = np.average( refined_sentiment, axis=0)
                    else:
                        average_sentiment = np.average( sentiment_tensor, axis=0)
                    # do some modifications to the average sentiment

                    minval = np.min(average_sentiment)
                    
                    for i in range(3):
                        average_sentiment[i] = (average_sentiment[i] - minval)

                    sumval = np.sum(average_sentiment)
                    for i in range(3):
                        average_sentiment[i] = average_sentiment[i]/sumval
                    if average_sentiment[1] > 0.5:
                        resdata[partname] = 'good'
                    elif average_sentiment[2] > 0.5:
                        resdata[partname] = 'bad'
                    else:
                        resdata[partname] = 'neutral'
                    break
    return resdata

In [13]:
print(getSentiment(1342936,"0001342936-12-000012"))


{'000070 - Disclosure - Note 2. Significant Accounting Policies': 'bad', '000100 - Disclosure - Note 5. Income Taxes': 'neutral', '000110 - Disclosure - Note 6 . Concentration of Risk': 'neutral'}


In [None]:
company_summary = pd.read_csv('company_summary.csv')
