In [147]:
import torch
import numpy as np
from transformers import BertTokenizer, BertForMaskedLM, BertModel

In [224]:
# model            = {}
# model['GooBERT'] = BertForMaskedLM.from_pretrained('./GooBERT')
# model['FinBERT'] = BertForMaskedLM.from_pretrained('FinBERT-Prime_128MSL-250K')
# model['PreBERT'] = BertForMaskedLM.from_pretrained('FinBERT-Pre2K_128MSL-250K')
# model['ComBERT'] = BertForMaskedLM.from_pretrained('FinBERT-Combo_128MSL-250K')
# model['FedBERT-BERT'] = BertForMaskedLM.from_pretrained('FedBERT')
model['FedBERT-Fin'] = BertForMaskedLM.from_pretrained('FedBERT-Fin')
# model['FedBERT-Prime'] = BertForMaskedLM.from_pretrained('FedBERT-prime')

Some weights of the model checkpoint at FedBERT-Fin were not used when initializing BertForMaskedLM: ['bert.embeddings.position_ids']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
fin_target_para = '[CLS] We adjust our earnings for items that we believe do not reflect the underlying operations of the company . [SEP] These are non cash items consisting of primarily the loss on financial instruments at fair value and income taxes . [SEP]'

fed_target_para = '[CLS] The recovery is likely to face headwinds even if the downside risks do not materialize . [SEP] Fiscal support will remain vital . [SEP] It will be appropriate to shift the focus of monetary policy from stabilization to accommodation by supporting a full recovery in employment and a sustained return of inflation to its objective . [SEP]'

In [33]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
fin_tokenized = tokenizer.tokenize(fin_target_para)
fed_tokenized = tokenizer.tokenize(fed_target_para)

In [225]:
fin_mask_idx = [4, 18, 24, 33, 39]
for i in fin_mask_idx:
    fin_tokenized[i] = '[MASK]'

fin_tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(fin_tokenized)])

preds  = {}
for m in model:
    with torch.no_grad():
        preds[m] = model[m](fin_tokens_tensor)[0]
        tokens = []
        for i in fin_mask_idx:
            predicted_index = torch.argmax(preds[m][0, i]).item()
            predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
            tokens.append(predicted_token)
        print(f'{m} : {tokens}')

GooBERT : ['judgment', 'system', 'financial', 'assets', 'level']
FinBERT : ['earnings', 'business', 'cash', 'instruments', 'taxes']
PreBERT : ['operations', 'segments', 'cash', 'instruments', 'taxes']
ComBERT : ['results', 'business', 'recurring', 'instruments', 'taxes']
FedBERT-BERT : ['expectations', 'bank', '##bank', 'assets', 'growth']
FedBERT-Fin : ['estimates', 'firm', '-', 'instruments', 'taxes']
FedBERT-Prime : [',', ',', ',', ',', ',']


In [226]:
fed_mask_idx = [2, 22, 32, 43, 50, 56]
for i in fed_mask_idx:
    fed_tokenized[i] = '[MASK]'

fed_tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(fed_tokenized)])

preds  = {}
for m in model:
    with torch.no_grad():
        preds[m] = model[m](fed_tokens_tensor)[0]
        tokens = []
        for i in fed_mask_idx:
            predicted_index = torch.argmax(preds[m][0, i]).item()
            predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
            tokens.append(predicted_token)
        print(f'{m} : {tokens}')

GooBERT : ['economy', 'financial', 'possible', 'recovery', 'debt', 'gdp']
FinBERT : ['company', 'government', 'important', 'inflation', '2018', 'capital']
PreBERT : ['company', 'this', 'necessary', 'stabilization', 'value', 'capital']
ComBERT : ['fed', 'this', 'necessary', 'recovery', 'inflation', 'capital']
FedBERT-BERT : ['economy', 'fiscal', 'important', 'investment', 'output', 'inflation']
FedBERT-Fin : ['economy', 'policy', 'important', 'stabilization', 'employment', 'inflation']
FedBERT-Prime : [',', ',', ',', ',', ',', ',']


In [177]:
' '.join(fed_tokenized)

'[CLS] the [MASK] is likely to face head ##wind ##s even if the downs ##ide risks do not material ##ize . [SEP] [MASK] support will remain vital . [SEP] it will be [MASK] to shift the focus of monetary policy from stabilization to [MASK] by supporting a full recovery in [MASK] and a sustained return of [MASK] to its objective . [SEP]'

In [None]:
economy', 'fiscal', 'possible', 'employment', '2008', 'inflation

# Eval

In [115]:
with open('./sifted_Statements.txt') as f:
    content = f.readlines()
lines = [x.strip() for x in content]
lines_tokenized = [tokenizer.tokenize(line) for line in lines if len(line) < 128]

lines_tokenized_copy = lines_tokenized.copy()
masked_idx = {}
total = 0
for i in range(len(lines_tokenized_copy)):
    masked_idx[i] = {}
    for j in range(len(lines_tokenized_copy[i])):
        if np.random.rand() < 0.1:
            masked_idx[i][j] = lines_tokenized_copy[i][j]
            lines_tokenized_copy[i][j] = '[MASK]'
            total += 1
lines_tokens_tensor = [torch.tensor([tokenizer.convert_tokens_to_ids(l)]) for l in lines_tokenized_copy]

In [117]:
for m in model:
    with torch.no_grad():
        count = 0
        for i in range(len(lines_tokens_tensor)):
            preds_m = model[m](lines_tokens_tensor[i])[0]
            for j in masked_idx[i]:
                predicted_index = torch.argmax(preds_m[0, j]).item()
                predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
                if predicted_token == masked_idx[i][j]:
                    count += 1
        print(m, count / total)

GooBERT 0.35902948402948404
FinBERT 0.40617321867321865
PreBERT 0.36732186732186733
ComBERT 0.20347051597051596
FedBERT-BERT 0.05835380835380835
FedBERT-Fin 0.04914004914004914


In [227]:
with torch.no_grad():
    count = 0
    for i in range(len(lines_tokens_tensor)):
        preds_m = model['FedBERT-Fin'](lines_tokens_tensor[i])[0]
        for j in masked_idx[i]:
            predicted_index = torch.argmax(preds_m[0, j]).item()
            predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
            if predicted_token == masked_idx[i][j]:
                count += 1
    print(m, count / total)

FedBERT-Prime 0.46867321867321865


# Cosine

In [167]:
from sklearn.metrics.pairwise import cosine_similarity

In [153]:
bm = BertModel.from_pretrained('FedBERT-prime')

Some weights of the model checkpoint at FedBERT-prime were not used when initializing BertModel: ['bert.embeddings.position_ids']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [171]:
max_sim = 0
max_idx = (i, j)
with torch.no_grad():
    for i in range(len(lines_tokens_tensor)):
        for j in range(i+1, len(lines_tokens_tensor)):
            emb_0 = bm(lines_tokens_tensor[i])[0][0][-1]
            emb_1 = bm(lines_tokens_tensor[j])[0][0][-1]
            dst = cosine_similarity(emb_0.reshape(1, -1), emb_1.reshape(1, -1))
            if dst > max_sim:
                max_sim = dst
                max_idx = (i, j)

KeyboardInterrupt: 

In [None]:
max_idx