In [5]:
import torch
from pytorch_transformers import BertTokenizer, BertForMaskedLM

In [10]:
# Checkpoint name or local directory for each BERT variant, keyed by the short
# label that later cells use to look the model up.
checkpoints = {
    'GooBERT': './GooBERT',
    'FinBERT': 'FinBERT-Prime_128MSL-250K',
    'PreBERT': 'FinBERT-Pre2K_128MSL-250K',
    'ComBERT': 'FinBERT-Combo_128MSL-250K',
}

# Load one masked-LM model per label, in the order listed above.
model = {}
for label, checkpoint in checkpoints.items():
    model[label] = BertForMaskedLM.from_pretrained(checkpoint)

In [12]:
# Candidate sentence pairs, keyed by the label each one carries in this notebook.
# Every pair is written with the [CLS]/[SEP] markers already in place.
sentence_pairs = {
    'Fed': (
        '[CLS] the company has a fiduciary duty to its shareholders . [SEP]',
        'one of its many regulatory requirements . [SEP]',
    ),
    'Fin': (
        '[CLS] market conditions have improved since the 2007-2009 recession . [SEP]',
        'conditions remain challenging for financial institutions . [SEP]',
    ),
}

# Only the 'Fin' pair is tokenized here; switch the key to try the other pair.
S1, S2 = sentence_pairs['Fin']

tokenizer      = BertTokenizer.from_pretrained('bert-base-uncased')
text           = ' '.join((S1, S2))
tokenized_text = tokenizer.tokenize(text)

# List every token next to its position so mask indices can be read off the output.
for position, token in enumerate(tokenized_text):
    print(f'{position} {token}')

print(tokenized_text)

0 [CLS]
1 market
2 conditions
3 have
4 improved
5 since
6 the
7 2007
8 -
9 2009
10 recession
11 .
12 [SEP]
13 conditions
14 remain
15 challenging
16 for
17 financial
18 institutions
19 .
20 [SEP]
['[CLS]', 'market', 'conditions', 'have', 'improved', 'since', 'the', '2007', '-', '2009', 'recession', '.', '[SEP]', 'conditions', 'remain', 'challenging', 'for', 'financial', 'institutions', '.', '[SEP]']


In [16]:
import numpy as np

# Positions (indices into tokenized_text) of the tokens to hide from the models.
MI = [10, 14]

# How many ranked candidates to list for the first masked position.
TOP_K = 5

# Mask a copy so tokenized_text keeps the unmasked tokens: editing MI and
# re-running only this cell then never inherits masks from an earlier run.
masked_tokens = list(tokenized_text)
for i in MI:
    masked_tokens[i] = '[MASK]'

print(masked_tokens)

indexed_tokens   = tokenizer.convert_tokens_to_ids(masked_tokens)
# Segment id 0 for every token of S1, segment id 1 for every token of S2.
segments_ids     = [0] * len(tokenizer.tokenize(S1)) + [1] * len(tokenizer.tokenize(S2))

print(segments_ids)

# Wrapping each list in another list gives both tensors a leading batch axis of size 1.
tokens_tensor    = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Per-model scores, then the TOP_K candidates for the first masked position.
preds = {}
for m in model:
    with torch.no_grad():
        # [0] takes the first element of the model output; it is indexed below as
        # [batch, position] and arg-maxed over the remaining axis.
        # NOTE(review): presumably the per-token vocabulary scores -- confirm
        # against the BertForMaskedLM documentation.
        preds[m] = model[m](tokens_tensor, token_type_ids = segments_tensors)[0]

    print(m)
    if MI:
        # The listed position follows MI instead of a separately hard-coded index.
        # topk returns the ids best-first, so no full sort of the scores is needed.
        _, top_ids = torch.topk(preds[m][0, MI[0]], TOP_K)
        top_tokens = tokenizer.convert_ids_to_tokens(top_ids.tolist())
        for rank, candidate in enumerate(top_tokens, start=1):
            print(f'{rank}th most likely {candidate}')
    print("")

# Summary: each model's single best guess at every masked position, in MI order.
for m in preds:
    best_ids = [torch.argmax(preds[m][0, i]).item() for i in MI]
    tokens   = tokenizer.convert_ids_to_tokens(best_ids)

    print(f'{m} : {tokens}')

['[CLS]', 'market', 'conditions', 'have', 'improved', 'since', 'the', '2007', '-', '2009', '[MASK]', '.', '[SEP]', 'conditions', '[MASK]', 'challenging', 'for', 'financial', 'institutions', '.', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
GooBERT
1th most likely recession
2th most likely period
3th most likely season
4th most likely crisis
5th most likely year

FinBERT
1th most likely recession
2th most likely crisis
3th most likely period
4th most likely attacks
5th most likely hurricanes

PreBERT
1th most likely period
2th most likely season
3th most likely fires
4th most likely deadline
5th most likely seasons

ComBERT
1th most likely recession
2th most likely period
3th most likely crisis
4th most likely peak
5th most likely time

GooBERT : ['recession', 'are']
FinBERT : ['recession', 'remain']
PreBERT : ['period', 'are']
ComBERT : ['recession', 'remain']


Positive examples get better results from FinBERT, while GooBERT performs better on negative ones.  Do financial documents accentuate the positive?

FinBERT feels … (TODO: unfinished note — complete this observation.)

