In [None]:
#@title Check availble memory of GPU
# Check that we are using 100% of GPU
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip -q install gputil
!pip -q install psutil
!pip -q install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

In [None]:
# Install the Hugging Face libraries for this notebook (very quiet mode).
# NOTE(review): versions are unpinned, while later cells call
# `DataCollatorForLanguageModeling.mask_tokens` and import
# `DataCollatorForNextSentencePrediction` -- presumably these need an older
# transformers release; confirm which one and pin it.
# NOTE(review): `nlp` is not imported by any visible cell -- confirm it is needed.
!pip install -qq transformers
!pip install -qq nlp

In [None]:
# Mount Google Drive at /content/drive; the dataset and checkpoint paths used
# in the following cells all live under "/content/drive/My Drive/...".
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# imports
from transformers import (
    BertForMaskedLM,
    BertForNextSentencePrediction,
    BertModel,
    BertTokenizer,
    BertConfig,
    Trainer,
    BertForPreTraining,
    DataCollatorForLanguageModeling,
    DataCollatorForNextSentencePrediction,
    TrainingArguments,
    LineByLineTextDataset,
    TextDatasetForNextSentencePrediction,
)
import torch
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data.dataset import Dataset

In [None]:
# Experiment switches: `pretrain` selects the checkpoint family to start from,
# `task` selects the objective ('MLM', 'NSP', anything else -> bare BertModel).
pretrain = 'BERT'
task = 'MLM'
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Text corpora on Drive: speeches are used for training, statements for eval.
speech_file = "/content/drive/My Drive/230T2 MLTS/Colab Notebooks/data/new_sifted_Speech.txt"
statements_file = "/content/drive/My Drive/230T2 MLTS/Colab Notebooks/data/new_sifted_Statements.txt"

if task == 'MLM':
    # Masked-language-model setup: one example per line, 15% of tokens masked.
    dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=speech_file, block_size=128)
    eval_dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=statements_file, block_size=128)
    bert_type = BertForMaskedLM
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.15,
    )
elif task == 'NSP':
    # Next-sentence-prediction setup (no eval dataset is built in this branch).
    dataset = TextDatasetForNextSentencePrediction(tokenizer=tokenizer, file_path=speech_file, block_size=32)
    bert_type = BertForPreTraining
    data_collator = DataCollatorForNextSentencePrediction(
        tokenizer=tokenizer,
        mlm=True,
        block_size=32,
    )
else:
    # Any other task value: only the model class is chosen; `dataset` and
    # `data_collator` are left untouched.
    bert_type = BertModel

# Starting checkpoints keyed by `pretrain`; unknown keys fall back to PRIME.
checkpoint_dirs = {
    'BERT': '/content/drive/My Drive/230T2 MLTS/Colab Notebooks/params/BERT/checkpoint-100000',
    'FIN': '/content/drive/My Drive/230T2 MLTS/Colab Notebooks/params/FIN/checkpoint-100000',
}
fallback_checkpoint_dir = '/content/drive/My Drive/230T2 MLTS/Colab Notebooks/params/PRIME/checkpoint-100000'
model = bert_type.from_pretrained(checkpoint_dirs.get(pretrain, fallback_checkpoint_dir))

## Training

In [None]:
# Trainer hyper-parameters. Checkpoints are written to a per-`pretrain`
# folder on Drive; at most two checkpoints are kept (save_total_limit).
checkpoint_root = "/content/drive/My Drive/230T2 MLTS/Colab Notebooks/params/"+pretrain

training_args = TrainingArguments(
    output_dir=checkpoint_root,
    overwrite_output_dir=True,
    logging_dir="/content/drive/My Drive/230T2 MLTS/Colab Notebooks/params/logs",
    # optimisation schedule
    learning_rate=1e-4,
    weight_decay=0.001,
    warmup_steps=100,
    max_steps=50000,
    # what to run
    do_train=True,
    do_eval=True,
    # batching / precision
    per_device_train_batch_size=32,
    per_device_eval_batch_size=4,
    fp16=True,
    # logging and checkpoint cadence (in optimisation steps)
    logging_steps=2500,
    save_steps=5000,
    save_total_limit=2,
)

In [None]:
# Put the network in training mode; nn.Module.train() returns the module
# itself, so `model` keeps referring to the same object.
model.train()

# Build the Hugging Face Trainer around the model, arguments, dataset and
# collator prepared in the cells above.
# NOTE(review): `eval_dataset` is created in the MLM branch but is not passed
# here even though `do_eval` is set -- confirm whether evaluation was intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Run the optimisation loop.
trainer.train()

## Eval

In [None]:
import random

# (display name, checkpoint location) for every model compared below; the
# order here defines the order of `model_names` and `models`.
model_sources = [
    ('FedBERT-BERT', '/content/drive/My Drive/230T2 MLTS/Colab Notebooks/params/BERT/checkpoint-100000'),
    ('FedBERT-Fin', '/content/drive/My Drive/230T2 MLTS/Colab Notebooks/params/FIN/checkpoint-100000'),
    ('FedBERT-Prm', '/content/drive/My Drive/230T2 MLTS/Colab Notebooks/params/PRIME/checkpoint-100000'),
    ('FinBERT', '/content/drive/My Drive/230T2 MLTS/Colab Notebooks/params/FinBERT-Prime_128MSL-250K'),
    ('BERT', 'bert-base-uncased'),
]
models = [BertForMaskedLM.from_pretrained(source) for _, source in model_sources]


def load_line_dataset(path):
    """Build a LineByLineTextDataset over `path` with the shared tokenizer (block size 128)."""
    return LineByLineTextDataset(tokenizer=tokenizer, file_path=path, block_size=128)


# Speeches serve as the "train" corpus, statements as the "eval" corpus.
dataset = load_line_dataset("/content/drive/My Drive/230T2 MLTS/Colab Notebooks/data/new_sifted_Speech.txt")
eval_dataset = load_line_dataset("/content/drive/My Drive/230T2 MLTS/Colab Notebooks/data/new_sifted_Statements.txt")

model_names = [name for name, _ in model_sources]

In [None]:
# Compare every model's mean LM loss on a random sample of train and eval
# lines while sweeping the masking probability.
probs = [0.05, 0.15, 0.25, 0.35, 0.45]
train_loss = [[] for _ in models]
eval_loss = [[] for _ in models]

n = 250
EVAL_SEED = 0  # fixes both the sampled lines and the random masking


def mean_masked_loss(lm, collator, data, idxes):
    """Return the mean loss of `lm` over `data[i]` for `i` in `idxes`.

    Each example is masked with `collator.mask_tokens` and scored against the
    original (unmasked) token ids as labels.  Returns NaN for an empty sample.
    """
    if not idxes:
        return float('nan')
    total_loss = 0.0
    # Pure inference: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for i in idxes:
            original = data[i].unsqueeze(0)
            # Mask a copy so the labels stay intact even if mask_tokens
            # modifies its argument in place.
            masked = collator.mask_tokens(original.clone())[0]
            # NOTE(review): labels are the full unmasked sequence, so the loss
            # covers every position, not only the masked ones -- confirm that
            # this is the intended metric.
            total_loss += lm(masked, labels=original)[0].item()
    return total_loss / len(idxes)


sampler = random.Random(EVAL_SEED)
torch.manual_seed(EVAL_SEED)
# Never ask for more lines than a dataset holds (random.sample would raise).
train_idxes = sampler.sample(range(len(dataset)), min(n, len(dataset)))
eval_idxes = sampler.sample(range(len(eval_dataset)), min(n, len(eval_dataset)))

for prob in probs:
    # Separate names for the sweep's collator and model so the training
    # cell's `data_collator` / `model` globals are not overwritten.
    sweep_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=prob)
    for m, eval_model in enumerate(models):
        eval_model.eval()  # make sure dropout is disabled while scoring
        train_loss[m].append(mean_masked_loss(eval_model, sweep_collator, dataset, train_idxes))
        eval_loss[m].append(mean_masked_loss(eval_model, sweep_collator, eval_dataset, eval_idxes))

In [None]:
# Mean loss per model as a function of masking probability: left panel for
# the sampled training lines, right panel for the sampled eval lines.
fig, ax = plt.subplots(ncols=2, figsize=(12,6), constrained_layout=True, sharey=True)
panels = [
    (ax[0], train_loss, 'Train sample (Speech)'),
    (ax[1], eval_loss, 'Eval sample (Statements)'),
]
for axis, losses, title in panels:
    for name, series in zip(model_names, losses):
        axis.plot(probs, series, label=name)
    axis.set_title(title)
    axis.set_xlabel('MLM masking probability')
    # One legend per panel; a bare plt.legend() only labels the last axes.
    axis.legend()
ax[0].set_ylabel('Mean loss')
plt.show()

In [None]:
# Gap between train-sample and eval-sample loss for each model; the dashed
# zero line marks where both losses are equal.
plt.figure(figsize=(12,4))
for name, train_series, eval_series in zip(model_names, train_loss, eval_loss):
    plt.plot(probs, np.array(train_series) - np.array(eval_series), label=name)
plt.axhline(y=0, color='black', linestyle='-.')
plt.title('Train loss - eval loss')
plt.xlabel('MLM masking probability')
plt.ylabel('Mean loss difference')
plt.legend()
plt.show()

## Using the models

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Compare the contextual representation of the same final word used in two
# different senses; alternative sentence pairs are kept below for reference.

# s1 = 'the existence of substantial economic shock'
# s2 = 'the man being hit by the car is in shock'

# s1 = 'The man was accused of robbing a bank'
# s2 = 'The man went fishing by the river bank'

s1 = 'the fed is promoting its long term price stability target'
s2 = 'it is the most purchased item at his local target'

token_1 = torch.tensor(tokenizer.encode(s1)).unsqueeze(0)
token_2 = torch.tensor(tokenizer.encode(s2)).unsqueeze(0)

for m, lm in enumerate(models):
    # BertForMaskedLM's first output is vocabulary logits, not hidden states,
    # so query the underlying encoder (`.bert`) for the last hidden layer.
    with torch.no_grad():
        last_hidden_states_1 = lm.bert(token_1)[0][0].numpy()
        last_hidden_states_2 = lm.bert(token_2)[0][0].numpy()
    # Row -2 is the last token before the final one; presumably encode()
    # appended [SEP], making this the sentence's last word -- confirm.
    print(model_names[m], cosine_similarity(last_hidden_states_1[[-2]], last_hidden_states_2[[-2]]).round(2))

In [None]:
# Mask a handful of tokens in a Fed sentence and show each model's top
# prediction for every masked position.
sentence = 'the federal reserve is committed to using its full range of tools to support the u.s. economy in this challenging time, thereby promoting its maximum employment and price stability goals'
fed_tokenized = tokenizer.tokenize(sentence)
fed_mask_idx = [11, 19, 26, 29, 32]
print([fed_tokenized[i] for i in fed_mask_idx])

# sentence = 'And, while many of these changes have improved the efficiency of our financial system and lowered costs for consumers, it is only realistic to1 acknowledge that they also present new and sometimes daunting tests for community banks'
# fed_tokenized = tokenizer.tokenize(sentence)
# fed_mask_idx = [13, 17, 19, 38, 41]
# print([fed_tokenized[i] for i in fed_mask_idx])

for i in fed_mask_idx:
    fed_tokenized[i] = '[MASK]'

# tokenize() does not add special tokens, but BERT expects its input framed
# as [CLS] ... [SEP]; add them here and shift the read positions by one.
fed_input_tokens = [tokenizer.cls_token] + fed_tokenized + [tokenizer.sep_token]
cls_offset = 1
fed_tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(fed_input_tokens)])

preds  = {}
for m, lm in enumerate(models):
    with torch.no_grad():
        # First output of BertForMaskedLM is the per-position vocabulary logits.
        preds[model_names[m]] = lm(fed_tokens_tensor)[0]
        tokens = []
        for i in fed_mask_idx:
            predicted_index = torch.argmax(preds[model_names[m]][0, i + cls_offset]).item()
            predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
            tokens.append(predicted_token)
        print(f'{model_names[m]} : {tokens}')

In [None]:
# Tokenize the Fed statement sentence again, this time leaving it unmasked.
sentence = (
    'the federal reserve is committed to using its full range of tools '
    'to support the u.s. economy in this challenging time, '
    'thereby promoting its maximum employment and price stability goals'
)
fed_tokenized = tokenizer.tokenize(sentence)


In [None]:
# Tokenize a second example sentence and show the tokens sitting at the
# positions chosen for masking.
sentence = (
    ' And, while many of these changes have improved the efficiency of our '
    'financial system and lowered costs for consumers, it is only realistic '
    'to1 acknowledge that they also present new and sometimes daunting tests '
    'for community banks'
)
fed_tokenized = tokenizer.tokenize(sentence)
fed_mask_idx = [13, 17, 19, 38, 41]
selected_tokens = []
for position in fed_mask_idx:
    selected_tokens.append(fed_tokenized[position])
print(selected_tokens)