## Youtube
https://www.youtube.com/watch?v=DkzbCJtFvqM

### DistilBert Example

In [4]:
from transformers import pipeline
classifier = pipeline('sentiment-analysis', model='ProsusAI/finbert')

Downloading:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [5]:
classifier('Institutional owners may take dramatic actions as Hudbay Minerals Inc.s (TSE:HBM) recent 5.1% drop adds to one-year losses')

[{'label': 'negative', 'score': 0.9683200716972351}]

In [6]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, pipeline

In [7]:
model_name = 'ProsusAI/finbert'
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, from_pt=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [13]:
classifier('HudBay Minerals (HBM) delivered earnings and revenue surprises of 16.67% and 19.33%, respectively , for the quarter ended September 2022. Do the numbers hold clues to what lies ahead for the stock?')

[{'label': 'neutral', 'score': 0.9174888730049133}]

In [9]:
inputs = tokenizer('Institutional owners may take dramatic actions as Hudbay Minerals Inc.s (TSE:HBM) recent 5.1% drop adds to one-year losses')

In [10]:
inputs

{'input_ids': [101, 12148, 5608, 2089, 2202, 6918, 4506, 2004, 15876, 18939, 4710, 13246, 4297, 1012, 1055, 1006, 24529, 2063, 1024, 1044, 25526, 1007, 3522, 1019, 1012, 1015, 1003, 4530, 9909, 2000, 2028, 1011, 2095, 6409, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
tf_batch = tokenizer(
    ['Institutional owners may take dramatic actions as Hudbay Minerals Inc.s (TSE:HBM) recent 5.1% drop adds to one-year losses',
    'Earnings Preview: HudBay Minerals (HBM) Q3 Earnings Expected to Decline'],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='tf'
)

In [12]:
for key, value in tf_batch.items():
    print(f'{key}: {value.numpy().tolist()}')

input_ids: [[101, 12148, 5608, 2089, 2202, 6918, 4506, 2004, 15876, 18939, 4710, 13246, 4297, 1012, 1055, 1006, 24529, 2063, 1024, 1044, 25526, 1007, 3522, 1019, 1012, 1015, 1003, 4530, 9909, 2000, 2028, 1011, 2095, 6409, 102], [101, 16565, 19236, 1024, 15876, 18939, 4710, 13246, 1006, 1044, 25526, 1007, 1053, 2509, 16565, 3517, 2000, 6689, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
token_type_ids: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


### Fine tuning a pretrained model on custom dataset

In [1]:
import pandas as pd
df = pd.read_csv('stock-market-news.csv')
X = df.content

In [2]:
X_train = X[:6].tolist()
X_test = X[6:9].tolist()

y_train = [0, 1, 0, 0, 1, 1]
y_test = [1, 0, 1]

In [8]:
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

### Convert these encodings into Dataset objects

In [9]:
import tensorflow as tf

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
))

In [41]:
test_dataset

<TensorSliceDataset element_spec=({'input_ids': TensorSpec(shape=(160,), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(160,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(160,), dtype=tf.int32, name=None)}, TensorSpec(shape=(), dtype=tf.int32, name=None))>

In [10]:
from transformers import BertForSequenceClassification, BertTokenizer, TFTrainer, TFTrainingArguments

training_args = TFTrainingArguments(
    output_dir='./results',         # output directory
    num_train_epochs=2,             # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size per device for evaluation
    warmup_steps=500,               # number of warmup steps for learning rate scheduler
    weight_decay=0.01,              # strength of weight decay
    # logging_dir='./logs',           # directory for storing logs
    logging_steps=10
)
    

In [None]:
with training_args.strategy.scope():
    model = BertForSequenceClassification.from_pretrained(model_name)

trainer = TFTrainer(
    model=model,                    # the instantiated transformers model to be trained
    args=training_args,             # training arguments, defined above
    train_dataset=train_dataset,    # training dataset
    eval_dataset=test_dataset       # evaluation dataset
)

trainer.train()

## Other Finbert Example > 512 tokens, sliding window approach
https://www.youtube.com/watch?v=WEAAs_0etJQ

In [1]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

In [2]:
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')

In [44]:
txt = """
Most readers would already be aware that Hudbay Minerals' (TSE:HBM) stock increased significantly by 31% over the past month. But the company's key financial indicators appear to be differing across the board and that makes us question whether or not the company's current share price momentum can be maintained. In this article, we decided to focus on Hudbay Minerals' ROE.

Return on equity or ROE is a key measure used to assess how efficiently a company's management is utilizing the company's capital. Simply put, it is used to assess the profitability of a company in relation to its equity capital.

See our latest analysis for Hudbay Minerals

How Do You Calculate Return On Equity?
The formula for ROE is:

Return on Equity = Net Profit (from continuing operations) ÷ Shareholders' Equity

So, based on the above formula, the ROE for Hudbay Minerals is:

4.9% = US$77m ÷ US$1.6b (Based on the trailing twelve months to September 2022).

The 'return' refers to a company's earnings over the last year. Another way to think of that is that for every CA$1 worth of equity, the company was able to earn CA$0.05 in profit.
Why Is ROE Important For Earnings Growth?
Thus far, we have learned that ROE measures how efficiently a company is generating its profits. Depending on how much of these profits the company reinvests or "retains", and how effectively it does so, we are then able to assess a company’s earnings growth potential. Assuming all else is equal, companies that have both a higher return on equity and higher profit retention are usually the ones that have a higher growth rate when compared to companies that don't have the same features.

Hudbay Minerals' Earnings Growth And 4.9% ROE
On the face of it, Hudbay Minerals' ROE is not much to talk about. A quick further study shows that the company's ROE doesn't compare favorably to the industry average of 12% either. Therefore, it might not be wrong to say that the five year net income decline of 32% seen by Hudbay Minerals was probably the result of it having a lower ROE. However, there could also be other factors causing the earnings to decline. Such as - low earnings retention or poor allocation of capital.

However, when we compared Hudbay Minerals' growth with the industry we found that while the company's earnings have been shrinking, the industry has seen an earnings growth of 29% in the same period. This is quite worrisome.

past-earnings-growth
past-earnings-growth
The basis for attaching value to a company is, to a great extent, tied to its earnings growth. What investors need to determine next is if the expected earnings growth, or the lack of it, is already built into the share price. By doing so, they will have an idea if the stock is headed into clear blue waters or if swampy waters await. If you're wondering about Hudbay Minerals''s valuation, check out this gauge of its price-to-earnings ratio, as compared to its industry.

Is Hudbay Minerals Efficiently Re-investing Its Profits?
Hudbay Minerals' low LTM (or last twelve month) payout ratio of 5.1% (implying that it retains the remaining 95% of its profits) comes as a surprise when you pair it with the shrinking earnings. This typically shouldn't be the case when a company is retaining most of its earnings. So there could be some other explanations in that regard. For example, the company's business may be deteriorating.

In addition, Hudbay Minerals has been paying dividends over a period of at least ten years suggesting that keeping up dividend payments is way more important to the management even if it comes at the cost of business growth. Based on the latest analysts' estimates, we found that the company's future payout ratio over the next three years is expected to hold steady at 5.1%. However, Hudbay Minerals' future ROE is expected to decline to 0.4% despite there being not much change anticipated in the company's payout ratio.

Summary
Overall, we have mixed feelings about Hudbay Minerals. Even though it appears to be retaining most of its profits, given the low ROE, investors may not be benefitting from all that reinvestment after all. The low earnings growth suggests our theory correct. That being so, the latest industry analyst forecasts show that the analysts are expecting to see a huge improvement in the company's earnings growth rate. Are these analysts expectations based on the broad expectations for the industry, or on the company's fundamentals? Click here to be taken to our analyst's forecasts page for the company.

Have feedback on this article? Concerned about the content? Get in touch with us directly. Alternatively, email editorial-team (at) simplywallst.com.
"""

In [49]:
tokens = tokenizer.encode_plus(txt, add_special_tokens=False)

tokens.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [51]:
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']

In [52]:
input_ids[:10]

[2087, 8141, 2052, 2525, 2022, 5204, 2008, 15876, 18939, 4710]

In [53]:
start = 0
window_length = 512

total_len = len(input_ids)

loop = True

while loop:
    end = start + window_length
    if end >= total_len:
        loop = False
        end = total_len
    print(f'start = {start}')
    print(f'end = {end}')
    start = end

start = 0
end = 512
start = 512
end = 1023


In [113]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

In [114]:
# Code for a single < 512 window

def small_text(input_ids, attention_mask):
    
    proba_list = []

    input_dict = {
        'input_ids' : torch.Tensor([input_ids]).long(),
        'attention_mask' : torch.Tensor([attention_mask]).int()
    }

    outputs = model(**input_dict)

    probabilities = torch.nn.functional.softmax(outputs[0], dim =-1)
    proba_list.append(probabilities)

    return proba_list

In [115]:
def large_text(input_ids, attention_mask, total_len):
        
    proba_list = []
    start = 0
    window_length = 510
    loop = True

    while loop:
        end = start + window_length
        if end >= total_len:
            loop = False
            end = total_len

        # 1 => Define the text chunk
        input_ids_chunk = input_ids[start : end]
        attention_mask_chunk = attention_mask[start : end]
        
        # 2 => Append [CLS] and [SEP]
        input_ids_chunk = [101] + input_ids_chunk + [102]
        attention_mask_chunk = [1] + attention_mask_chunk + [1]
        
        # 3 => Convert regular python list to Pytorch Tensor
        input_dict = {
            'input_ids' : torch.Tensor([input_ids_chunk]).long(),
            'attention_mask' : torch.Tensor([attention_mask_chunk]).int()
        }
        
        outputs = model(**input_dict)
        
        probabilities = torch.nn.functional.softmax(outputs[0], dim = -1)
        proba_list.append(probabilities)

        start = end
    
    return proba_list

In [116]:
def text_tokenizer(txt):
    tokens = tokenizer.encode_plus(txt, add_special_tokens=False)

    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']

    return input_ids, attention_mask


In [117]:
def sentiment_analysis(txt):
    
    input_ids, attention_mask = text_tokenizer(txt)

    total_len = len(input_ids)    
   
    if total_len < window_length:
        proba_list = small_text(input_ids, attention_mask)
        return proba_list
    
    elif total_len >= window_length:
        proba_list = large_text(input_ids, attention_mask, total_len)
        return proba_list

In [118]:
def sentiment_analysis_long(txt):
    tokens = tokenizer.encode_plus(txt, add_special_tokens=False)

    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']

    proba_list = []

    total_len = len(input_ids)    
    start = 0
    window_length = 510
    
    if total_len < window_length:
        proba_list = small_text(input_ids, attention_mask)
        return proba_list
    
    loop = True

    while loop:
        end = start + window_length
        if end >= total_len:
            loop = False
            end = total_len

        # 1 => Define the text chunk
        input_ids_chunk = input_ids[start : end]
        attention_mask_chunk = attention_mask[start : end]
        
        # 2 => Append [CLS] and [SEP]
        input_ids_chunk = [101] + input_ids_chunk + [102]
        attention_mask_chunk = [1] + attention_mask_chunk + [1]
        
        # 3 => Convert regular python list to Pytorch Tensor
        input_dict = {
            'input_ids' : torch.Tensor([input_ids_chunk]).long(),
            'attention_mask' : torch.Tensor([attention_mask_chunk]).int()
        }
        
        outputs = model(**input_dict)
        
        probabilities = torch.nn.functional.softmax(outputs[0], dim = -1)
        proba_list.append(probabilities)

        start = end

    return proba_list

In [119]:
def get_mean_from_proba(proba_list):
    # 0 - positive
    # 1 - negative
    # 2 - neutral

    with torch.no_grad():
        stacks = torch.stack(proba_list)
        stacks = stacks.resize(stacks.shape[0], stacks.shape[2])
        mean = stacks.mean(dim=0)
        sentiment = torch.argmax(mean).item()
    return mean, stacks, sentiment

In [120]:
proba_list_multi = sentiment_analysis(txt)
mean, stk, sentiment = get_mean_from_proba(proba_list_multi)



In [124]:
print(mean)
print(stk)
print(sentiment)
proba_list_multi

tensor([0.0484, 0.4730, 0.4786])
tensor([[0.0937, 0.4260, 0.4804],
        [0.0281, 0.9278, 0.0441],
        [0.0236, 0.0652, 0.9112]])
2


[tensor([[0.0937, 0.4260, 0.4804]], grad_fn=<SoftmaxBackward0>),
 tensor([[0.0281, 0.9278, 0.0441]], grad_fn=<SoftmaxBackward0>),
 tensor([[0.0236, 0.0652, 0.9112]], grad_fn=<SoftmaxBackward0>)]

In [97]:
stacks = torch.stack(proba_list_multi)
stacks
shape = stacks.shape
shape
xx = torch.reshape(stacks, (shape[0], shape[2]))
xx

tensor([[[0.0937, 0.4260, 0.4804]],

        [[0.0281, 0.9278, 0.0441]],

        [[0.0236, 0.0652, 0.9112]]], grad_fn=<StackBackward0>)

## Jay Alammar
https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(
    'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', 
    delimiter='\t', 
    header=None
)

In [3]:
df.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [22]:
df[0]

'a stirring , funny and finally transporting re imagining of beauty and the beast and 1930s horror films'

### Loading the Pre-trained BERT model

In [4]:
model_class, tokenizer_class, pretrained_weights = (
    ppb.DistilBertModel, 
    ppb.DistilBertTokenizer, 
    'distilbert-base-uncased'
)

# Non distilled BERT
# model_class, tokenizer_class, pretrained_weights = (
#     ppb.BertModel, 
#     ppb.BertTokenizer, 
#     'bert-base-uncased'
# )

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Model #1 Preparing the Dataset

#### Tokenization

In [6]:
tokenized = df[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

#### Padding

In [10]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [11]:
np.array(padded).shape

(6920, 67)

#### Masking

In [None]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

### Model #1 Deep Learning

In [14]:
# 6m 43.2s to run

input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [15]:
features = last_hidden_states[0][:,0,:].numpy()

In [16]:
labels = df[1]

### Model #2 Train/Test Split

In [17]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [18]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

In [19]:
lr_clf.score(test_features, test_labels)

0.8445086705202313

In [20]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.519 (+/- 0.00)
