In [2]:
# Standard Python Imports
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import json
import re
import string
import random
import time
import datetime

# Extra non-standard utilities
from argparse import Namespace
from collections import Counter
from tqdm import tqdm_notebook
import datasets

# Data management and Math imports
import numpy as np
import pandas as pd

# Torch Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


# Hugging Face Imports
# from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertForMaskedLM, BertConfig
# from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import pipeline
from transformers import EvalPrediction

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.empty_cache()

In [3]:
args = Namespace(
    data_filepath = './processed_data/sentences.csv',
    tokenizer_save_path = './tokenizer/mlm_tokenizer',
    model_save_path = './models/mlm_model',
    train_split = 0.7,
    num_samples = 10000
)

## Data preparation

In [4]:
# Load in data

raw_data_df = pd.read_csv(args.data_filepath)

raw_data_df = raw_data_df[:args.num_samples]

raw_data_df['split'] = 'train'
num_train_rows = int(len(raw_data_df) * (1 - args.train_split)//2) - 1
raw_data_df.loc[:num_train_rows, 'split'] = 'val'
raw_data_df.loc[num_train_rows:num_train_rows + num_train_rows, 'split'] = 'test'
raw_data_df.head()


dataset = datasets.Dataset.from_pandas(raw_data_df.rename(columns={'sentence' : 'text'}))

In [4]:
# tokenizer= AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer= AutoTokenizer.from_pretrained("casehold/legalbert")
# tokenizer = tokenizer_base.train_new_from_iterator(training_corpus, 20000)

In [32]:
def tokenize_and_mask(examples):
    tokens = tokenizer(examples["text"], truncation=True, padding='max_length', max_length=512, return_tensors='pt')
    return tokens

In [37]:
train_dataset = dataset.filter(lambda example: example['split'] == 'train')
val_dataset = dataset.filter(lambda example: example['split'] == 'val')
test_dataset = dataset.filter(lambda example: example['split'] == 'test')

tokenized_train_dataset = train_dataset.map(tokenize_and_mask, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_and_mask, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_and_mask, batched=True)

Filter: 100%|██████████| 9002/9002 [00:00<00:00, 252324.79 examples/s]
Filter: 100%|██████████| 9002/9002 [00:00<00:00, 287143.89 examples/s]
Filter: 100%|██████████| 9002/9002 [00:00<00:00, 276653.56 examples/s]
Map: 100%|██████████| 6303/6303 [00:01<00:00, 3566.96 examples/s]
Map: 100%|██████████| 1349/1349 [00:00<00:00, 3735.38 examples/s]
Map: 100%|██████████| 1350/1350 [00:00<00:00, 3770.43 examples/s]


### Setup tokenizer

In [None]:
# Check tokenization
print(dataset['text'][123])
input_ids = tokenizer(dataset['text'][123]).input_ids
subword_view = [tokenizer.convert_ids_to_tokens(id) for id in input_ids]
np.array(subword_view)

### Setup Data Collator (for making sentences)

In [38]:
'''
    data collator will replace 15% of tokens with [MASK] token uniformly
'''
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [39]:
def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [40]:
def set_seed(seed_value=42):
    """Set seed for reproducibility."""
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)  # if using multi-GPU.
    # Additional settings for more reproducibility
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

## Initialize Model and Training Routine

In [None]:
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
model.to(device)

In [11]:
torch.cuda.empty_cache()
set_seed(42)

In [None]:
print(torch.cuda.mem_get_info()[0] // 1024 ** 2)
print(torch.cuda.mem_get_info()[1] // 1024 ** 2)

In [None]:
training_args = TrainingArguments(
    output_dir=args.model_save_path,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    # per_device_eval_batch_size = 16,
    # save_steps=5000,
    # save_total_limit=2,
    evaluation_strategy='epoch',
    learning_rate = 3e-5,
    save_safetensors= True,
)


In [None]:






trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics
)

In [None]:
trainer.save_model(args.model_save_path)

In [5]:
text1 = 'Yet they lie wholly outside the scope of Parliamentary [MASK].' #law

text2 = 'The opening words of section 3(1) of the 1934 Act provided that the 1923 Act was to apply for the purpose of enabling a person holding a [MASK] under this Act to acquire such ancillary rights as may be required for the exercise of the rights granted by the licence, and shall have effect accordingly.' # license

mlm_model = AutoModelForMaskedLM.from_pretrained("./models/mlm_model")

pipeline_model = pipeline('fill-mask', model=mlm_model, tokenizer=tokenizer, top_k=10)

result = pipeline_model(text2)

for pred in result:
    print(f">>> {pred['sequence']}")

>>> the opening words of section 3 ( 1 ) of the 1934 act provided that the 1923 act was to apply for the purpose of enabling a person holding a licence under this act to acquire such ancillary rights as may be required for the exercise of the rights granted by the licence, and shall have effect accordingly.
>>> the opening words of section 3 ( 1 ) of the 1934 act provided that the 1923 act was to apply for the purpose of enabling a person holding a license under this act to acquire such ancillary rights as may be required for the exercise of the rights granted by the licence, and shall have effect accordingly.
>>> the opening words of section 3 ( 1 ) of the 1934 act provided that the 1923 act was to apply for the purpose of enabling a person holding a permit under this act to acquire such ancillary rights as may be required for the exercise of the rights granted by the licence, and shall have effect accordingly.
>>> the opening words of section 3 ( 1 ) of the 1934 act provided that the

In [None]:
text = 'Yet they lie wholly outside the scope of Parliamentary [MASK].'

pipeline_model = pipeline('fill-mask', model="bert-base-uncased", top_k = 10)

result = pipeline_model(text)

for pred in result:
    print(f">>> {pred['sequence']}")

In [None]:
def predict_top_10(text, tokenizer_path):
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

    # Check if cuda available
    if torch.cuda.is_available():
        model.to('cuda')
    else:
        model.to('cpu')

    inputs = tokenizer(text, return_tensors="pt").to('cuda')

    token_logits = model(**inputs).logits

    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

    mask_token_logits = token_logits[0, mask_token_index, :]

    top_5_tokens = torch.topk(mask_token_logits, 10, dim=1).indices[0].tolist()

    for token in top_5_tokens:
        print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
                         

In [None]:
predict_top_10('Yet they lie wholly outside the scope of Parliamentary [MASK].', 'casehold/legalbert')

In [49]:
from datasets import load_metric
def evaluate_mlm(model_path, tokenizer_path, title):



    model = AutoModelForMaskedLM.from_pretrained("./models/mlm_model")

    # Check if cuda available
    if torch.cuda.is_available():
        model.to('cuda')
    else:
        model.to('cpu')
    
    model.eval()

    print('Evaluating ' + f'[{title}]')
    print('============================================')

    perplexity_metric = load_metric("perplexity")

    with torch.no_grad():
        for batch in tokenized_test_dataset:
            inputs = {k: v.to(device) for k, v in batch.items() if k in model.forward.__code__.co_varnames}
            outputs = model(**inputs)
            loss = outputs.loss
            total_eval_loss += loss.item()
            nb_eval_steps += 1

            # Update perplexity metric
            perplexity_metric.add_batch(predictions=outputs.logits, references=inputs["labels"])

    # Calculate the average loss and final perplexity
    avg_loss = total_eval_loss / nb_eval_steps
    final_perplexity = perplexity_metric.compute()["perplexity"]

    print(f"Average Loss: {avg_loss}")
    print(f"Perplexity: {final_perplexity}")

    return avg_loss, final_perplexity


In [None]:
t

In [50]:
evaluate_mlm(None, 'casehold/legalbert', 'Title')

Evaluating [Title]


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


AttributeError: 'list' object has no attribute 'to'