In [1]:
import torch
print("CUDA Availability: ", torch.cuda.is_available())
gpu_index = torch.cuda.current_device()
print("Current Device : ", torch.cuda.get_device_name(gpu_index))
print("Available Devices")
for i in range(torch.cuda.device_count()):
    print(i, ") Device Name: ", torch.cuda.get_device_name(i))

CUDA Availability:  True
Current Device :  NVIDIA GeForce RTX 3060 Laptop GPU
Available Devices
0 ) Device Name:  NVIDIA GeForce RTX 3060 Laptop GPU


## Now Testing Tensorflow-gpu

- Language: markdown
- Path: finbert-analysis.ipynb

In [2]:
from tensorflow.python.client import device_lib
def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']
print(get_available_gpus())

['/device:GPU:0']


## Getting Started

Import the transformers

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

In [2]:
sentences = [
    "The Company’s financial instruments that are exposed to concentrations of credit risk consist primarily of cash, cash equivalents, restricted cash, available-for-sale securities, and accounts receivable."
    "Although the Company maintains cash deposits, cash equivalent balances, and available-for-sale securities with multiple financial institutions, the deposits, at times, may exceed federally insured limits.",
    "Cash and cash equivalents may be withdrawn or redeemed on demand.",
    "The Company believes that the financial institutions that hold its cash and cash equivalents and restricted cash are financially sound and, accordingly, minimal credit risk exists with respect to these balances.",
    "The Company also maintains investments in U.S. treasury securities, U.S. government agency securities, commercial paper, and corporate bonds that carry high credit ratings and accordingly, minimal credit risk exists with respect to these balances.",
    "Cash equivalents consist of money market funds, commercial paper, and corporate bonds which are invested through financial institutions in the United States."
]

In [3]:
inputs = tokenizer(sentences, return_tensors="pt", padding=True)
outputs = finbert(**inputs)[0]
labels = {0:'neutral', 1:'positive',2:'negative'}
for idx, sent in enumerate(sentences):
    print(sent, '----', labels[np.argmax(outputs.detach().numpy()[idx])])

The Company’s financial instruments that are exposed to concentrations of credit risk consist primarily of cash, cash equivalents, restricted cash, available-for-sale securities, and accounts receivable.Although the Company maintains cash deposits, cash equivalent balances, and available-for-sale securities with multiple financial institutions, the deposits, at times, may exceed federally insured limits. ---- neutral
Cash and cash equivalents may be withdrawn or redeemed on demand. ---- neutral
The Company believes that the financial institutions that hold its cash and cash equivalents and restricted cash are financially sound and, accordingly, minimal credit risk exists with respect to these balances. ---- positive
The Company also maintains investments in U.S. treasury securities, U.S. government agency securities, commercial paper, and corporate bonds that carry high credit ratings and accordingly, minimal credit risk exists with respect to these balances. ---- neutral
Cash equivale

## Now analysing the code bit by bit

In [4]:
inputs = tokenizer(sentences, return_tensors="pt", padding=True)
inputs

{'input_ids': tensor([[    3,     6,    37, 30748,    58,    39,   786,    15,    21,  2856,
             9,  7632,     7,    97,   177,  1768,   106,     7,    50,   585,
            50,  1499,   585,  1349,    50,   585,   213,    85,    14,    85,
           282,    86,   585,     8,   491,  1179,    48,   488,     6,    37,
          2705,    50,   976,   585,    50,  1764,  1344,   585,     8,   213,
            85,    14,    85,   282,    86,    20,   533,    39,  1616,   585,
             6,   976,   585,    28,  1101,   585,    32,  1441, 11574,  2256,
          2574,    48,     4],
        [    3,    50,     8,    50,  1499,    32,    25, 11461,    16,  6039,
            19,   302,    48,     4,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
       

In [5]:
outputs = finbert(**inputs)
y = outputs[0]
y_sum = np.average(y.detach().numpy(),axis=0)
y_sum

array([ 3.9878306, -3.2382145, -3.6300595], dtype=float32)

In [6]:
labels = {0:'neutral', 1:'positive',2:'negative'}
for i, sentence in enumerate(sentences):
    #Iterate through every sentence
    print("\n",sentence)
    j = np.argmax(y.detach().numpy()[i])
    print("\nSentiment: " , labels[j])
    print("Sentiment Probability: " , y.detach().numpy()[i][j])
j_sum = np.argmax(y_sum)
print("\nOverall Sentiment: " , labels[j_sum])


 The Company’s financial instruments that are exposed to concentrations of credit risk consist primarily of cash, cash equivalents, restricted cash, available-for-sale securities, and accounts receivable.Although the Company maintains cash deposits, cash equivalent balances, and available-for-sale securities with multiple financial institutions, the deposits, at times, may exceed federally insured limits.

Sentiment:  neutral
Sentiment Probability:  4.79147

 Cash and cash equivalents may be withdrawn or redeemed on demand.

Sentiment:  neutral
Sentiment Probability:  6.8653197

 The Company believes that the financial institutions that hold its cash and cash equivalents and restricted cash are financially sound and, accordingly, minimal credit risk exists with respect to these balances.

Sentiment:  positive
Sentiment Probability:  2.8210857

 The Company also maintains investments in U.S. treasury securities, U.S. government agency securities, commercial paper, and corporate bonds t

In [7]:
file_base = "https://www.sec.gov/Archives/edgar/data/1477333/000147733322000008/" # Base URL for filing
filing_summary = file_base + "FilingSummary.xml"

In [8]:
import requests as req
import xml.etree.ElementTree as et

head = {
    "User-Agent": "Alpha-Explorer/1.0",
    "Connection": "keep-alive"
}

res = req.get(filing_summary, headers=head)

root = et.fromstring(res.text)
root.tag

'FilingSummary'

In [9]:
component_dict = [] # Array to store our dictionary
trigger_one = ['DISCLOSURE']
trigger_list = ['ACCOUNTING', 'LEASES', 'DEBT','COMMITMENTS','STOCK',] # List of triggers to look for
for report in root.iter('Report'):
    dict = {}
    for trigger_word in trigger_list:
        name = report.find('LongName').text
        if (trigger_word.lower() in name.lower() and 'disclosure' in name.lower()):
            if ("table" in name.lower() or "details" in name.lower()):
                continue
            try:
                dict["name"] = report.find('LongName').text
                dict["url"] = file_base+report.find('HtmlFileName').text
                component_dict.append(dict)
            except:
                print("No report found at some point")

component_dict

[{'name': '2103102 - Disclosure - Summary of Significant Accounting Policies',
  'url': 'https://www.sec.gov/Archives/edgar/data/1477333/000147733322000008/R10.htm'},
 {'name': '2124106 - Disclosure - Leases',
  'url': 'https://www.sec.gov/Archives/edgar/data/1477333/000147733322000008/R14.htm'},
 {'name': '2129107 - Disclosure - Debt',
  'url': 'https://www.sec.gov/Archives/edgar/data/1477333/000147733322000008/R15.htm'},
 {'name': '2135108 - Disclosure - Commitments and Contingencies',
  'url': 'https://www.sec.gov/Archives/edgar/data/1477333/000147733322000008/R16.htm'},
 {'name': '2139109 - Disclosure - Common Stock',
  'url': 'https://www.sec.gov/Archives/edgar/data/1477333/000147733322000008/R17.htm'},
 {'name': '2143110 - Disclosure - Stock-based Compensation',
  'url': 'https://www.sec.gov/Archives/edgar/data/1477333/000147733322000008/R18.htm'},
 {'name': '2204201 - Disclosure - Summary of Significant Accounting Policies (Policies)',
  'url': 'https://www.sec.gov/Archives/edga

In [16]:
from bs4 import BeautifulSoup as bs
import re
text_dat = {}
for component in component_dict:
    soup = bs(
        req.get(
            component["url"],
            headers=head
        ).text,
        "html"
    )
    txtlist = re.split(r' *[\n\.\?!][\'"\)\]]* *',soup.text.strip())
    text_dat[component['name']] = [ txt for txt in txtlist if len(txt) > 100 ]



In [26]:
txt = "The Company leases facilities under non-cancellable operating leases primarily in the United States, South Africa, the United Kingdom and Canada. The Company’s operating leases have remaining lease terms of between less than one to 12 years, some of which include options to extend the leases for up to five years, and some of which include options to terminate the leases within one year. These options to extend the terms of the Company’s operating leases were not deemed to be reasonably certain of exercise as of lease commencement and are therefore not included in the determination of their respective non-cancellable lease terms. The future lease payments due under non-cancellable operating lease arrangements contain fixed rent increases over the term of the lease. The Company also leases office equipment under non-cancellable leases.The following table presents the components of lease expense on the Company’s condensed consolidated statements of operations and comprehensive loss for each of the periods indicated.The Company leases facilities under non-cancellable operating leases primarily in the United States, South Africa, the United Kingdom and Canada. The Company’s operating leases have remaining lease terms of between less than one to 12 years, some of which include options to extend the leases for up to five years, and some of which include options to terminate the leases within one year. These options to extend the terms of the Company’s operating leases were not deemed to be reasonably certain of exercise as of lease commencement and are therefore not included in the determination of their respective non-cancellable lease terms. The future lease payments due under non-cancellable operating lease arrangements contain fixed rent increases over the term of the lease. The Company also leases office equipment under non-cancellable leases.The following \table presents the components of lease expense on the Company’s condensed consolidated statements of operations and comprehensive loss for each of the periods indicated.The Company leases facilities under non-cancellable operating leases primarily in the United States, South Africa, the United Kingdom and Canada. The Company’s operating leases have remaining lease terms of between less than one to 12 years, some of which include options to extend the leases for up to five years, and some of which include options to terminate the leases within one year.lable operating leases primarily in the United States, South Africa, the United Kingdom and Canada. The Company’s operating leases have remaining lease terms"
print(len(txt))
x = tokenizer(txt, return_tensors="pt", padding=True)
encoded_sentiment = finbert(**x)[0].detach().numpy()
print(encoded_sentiment)

2603
[[ 7.096506  -4.374116  -4.7448015]]


In [11]:
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

In [None]:
labels = {0:'neutral', 1:'positive',2:'negative'}

outputs = []

for key,txt_list in text_dat.items():
    print(key)
    print("Example Sentence: ", txt_list[0])

    output = {}
    output['name'] = key

    encoded_text = tokenizer(txt_list, return_tensors="pt", padding=True)
    encoded_sentiment = finbert(**encoded_text)[0].detach().numpy()
    refined_sentiment = [i for i in encoded_sentiment if np.argmax(i) != 0]
    average_sentiment = np.average(refined_sentiment, axis=0)

    output['component-sentiment'] = labels[np.argmax(average_sentiment)]

    print("Overall", output['component-sentiment'])
    output['sentiment-list'] = []
    output['values-list'] = []
    for i,sentence in enumerate(txt_list):
        output['sentiment-list'].append( labels[ np.argmax(  encoded_sentiment[i]  ) ] )
        output['values-list'].append(encoded_sentiment[i])


    outputs.append(output)
    print("----------------------------------------------------------------------------------------------------------------------------------------------")

## Same Calc But in Pytorch

In [5]:
import torch
print("CUDA Availability: ", torch.cuda.is_available())
gpu_index = torch.cuda.current_device()
print("Current Device : ", torch.cuda.get_device_name(gpu_index))
print("Available Devices")
for i in range(torch.cuda.device_count()):
    print(i, ") Device Name: ", torch.cuda.get_device_name(i))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

CUDA Availability:  True
Current Device :  NVIDIA GeForce RTX 3060 Laptop GPU
Available Devices
0 ) Device Name:  NVIDIA GeForce RTX 3060 Laptop GPU


In [20]:
#Random testing

X_train = torch.FloatTensor([0., 1., 2., 3., 4.])
X_train.is_cuda

False

In [28]:
# Common practice to hold the GPU in device
X_train = X_train.to(device)
X_train.is_cuda

True

In [10]:
import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()    

ImportError: Numba needs NumPy 1.21 or less

In [14]:
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
import torch

torch.cuda.empty_cache()

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
finbert.eval()
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
cpu = "cpu"
if torch.cuda.is_available():
    finbert = finbert.cuda()

In [17]:
labels = {0:'neutral', 1:'positive',2:'negative'}

outputs = []

for key,txt_list in text_dat.items():
    print(key)
    print("Example Sentence: ", txt_list[0])
    print("Total Number of Sentences: ", len(txt_list))
    output = {}
    output['name'] = key
    encoded_text = tokenizer(txt_list, return_tensors="pt", padding=True).cuda()
    sentiment_tensor = finbert(**encoded_text)
    encoded_sentiment = sentiment_tensor[0].detach().numpy()
    refined_sentiment = [i for i in encoded_sentiment if np.argmax(i) != 0]
    average_sentiment = np.average(refined_sentiment, axis=0)

    output['component-sentiment'] = labels[np.argmax(average_sentiment)]

    print("Overall", output['component-sentiment'])
    # output['sentiment-list'] = []
    # output['values-list'] = []
    # for i,sentence in enumerate(txt_list):
    #     output['sentiment-list'].append( labels[ np.argmax(  encoded_sentiment[i]  ) ] )
    #     output['values-list'].append(encoded_sentiment[i])


    outputs.append(output)
    print("----------------------------------------------------------------------------------------------------------------------------------------------")

2103102 - Disclosure - Summary of Significant Accounting Policies
Example Sentence:  Summary of Significant Accounting PoliciesConcentrations of RisksThe Company’s revenue is reliant on its customers utilizing Internet-based services
Total Number of Sentences:  168


AttributeError: 