In [2]:
import os
import json
import re
import string
import random
import time
import datetime

import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

from argparse import Namespace
from tqdm.notebook import tqdm
from datasets import Dataset

import transformers
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import pipeline

import torch.nn.functional as F
import torch
from torch.utils.data import DataLoader, TensorDataset

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

2024-03-25 14:27:02.123021: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
evaluate = False # if in evaluation mode, set to true 

In [4]:
args = Namespace(
    data_path = 'processed_data/casehold_processed.csv',
    pretuned_model_path = "./models/mlm_model_manual",
    # model_save_path = 'models/sentence_pair_classification',
    model_save_path = './models/mlm_te_test',
    num_samples=15000,
    batch_size = 16,
    learn_rate = 1e-5,
    epochs = 5,
    device='cpu',
    train_split=0.7,
    model_state_file='casehold_state.pth',
    patience = 3,
    freeze=False
)

In [5]:
casehold_df = pd.read_csv(args.data_path)

## Setup Tokenizer

In [6]:
tokenizer = BertTokenizer.from_pretrained('casehold/legalbert')

## 1 Sentence Pair Classification
- Utilize binary labels (related or not related) for each pair

In [7]:
class_df = casehold_df[:args.num_samples]

Configure Split 

In [8]:
class_df['split'] = 'train'

num_val_rows = int(len(class_df) * (1 - args.train_split)//2) - 1

# 15% for validation and test each , remaining 70% for train
class_df.loc[:num_val_rows, 'split'] = 'val'
class_df.loc[num_val_rows: num_val_rows + num_val_rows, 'split'] = 'test'

print('Number of train samples : ' + str((class_df['split'] == 'train').sum()))
print('Number of val samples : ' + str((class_df['split'] == 'val').sum()))
print('Number of test samples : ' + str((class_df['split'] == 'test').sum()))


class_df.head()

Number of train samples : 10501
Number of val samples : 2249
Number of test samples : 2250


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  class_df['split'] = 'train'


Unnamed: 0.1,Unnamed: 0,context,holding,binary_label,relevance_label,split
0,0,"Drapeau’s cohorts, the cohort would be a “vict...",holding that possession of a pipe bomb is a cr...,1,1.0,val
1,1,"Drapeau’s cohorts, the cohort would be a “vict...",holding that bank robbery by force and violenc...,0,0.652,val
2,2,"Drapeau’s cohorts, the cohort would be a “vict...",holding that sexual assault of a child qualifi...,0,0.647,val
3,3,"Drapeau’s cohorts, the cohort would be a “vict...",holding for the purposes of 18 usc 924e that ...,0,0.67,val
4,4,"Drapeau’s cohorts, the cohort would be a “vict...",holding that a court must only look to the sta...,0,0.639,val


## Data Preparation : creation of datasets and dataloader objects

In [9]:
####################################################
############## Setup Train Dataloader ##############
####################################################

encoded_data_train = [tokenizer.encode_plus(row['context'], row['holding'], add_special_tokens=True, max_length=512, pad_to_max_length=True, truncation=True) for index,row in class_df[class_df['split'] == 'train'].iterrows()]
input_ids_train = [item['input_ids'] for item in encoded_data_train]
attention_masks_train = [item['attention_mask'] for item in encoded_data_train]
labels_train = [row['binary_label'] for index,row in class_df[class_df['split'] == 'train'].iterrows()]

# Convert to tensors
input_ids_train = torch.tensor(input_ids_train)
attention_masks_train = torch.tensor(attention_masks_train)
labels_train = torch.tensor(labels_train)

# Create a dataset
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)

dataloader_train = DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True) # NOTE : maybe set pin_memory=True



In [10]:
####################################################
############## Setup Val Dataloader ################
####################################################

encoded_data_val = [tokenizer.encode_plus(row['context'], row['holding'], add_special_tokens=True, max_length=512, pad_to_max_length=True, truncation=True) for index,row in class_df[class_df['split'] == 'val'].iterrows()]
input_ids_val = [item['input_ids'] for item in encoded_data_val]
attention_masks_val = [item['attention_mask'] for item in encoded_data_val]
labels_val = [row['binary_label'] for index,row in class_df[class_df['split'] == 'val'].iterrows()]

# Convert to tensors
input_ids_val = torch.tensor(input_ids_val)
attention_masks_val = torch.tensor(attention_masks_val)
labels_val = torch.tensor(labels_val)

# Create a dataset
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

dataloader_val = DataLoader(dataset_val, batch_size=args.batch_size, shuffle=True) # NOTE : maybe set pin_memory=True

In [11]:
###################################################
############## Setup Test Dataloader ##############
###################################################

encoded_data_test = [tokenizer.encode_plus(row['context'], row['holding'], add_special_tokens=True, max_length=512, pad_to_max_length=True, truncation=True) for index,row in class_df[class_df['split'] == 'test'].iterrows()]
input_ids_test = [item['input_ids'] for item in encoded_data_test]
attention_masks_test = [item['attention_mask'] for item in encoded_data_test]
labels_test = [row['binary_label'] for index,row in class_df[class_df['split'] == 'test'].iterrows()]

# Convert to tensors
input_ids_test = torch.tensor(input_ids_test)
attention_masks_test = torch.tensor(attention_masks_test)
labels_test = torch.tensor(labels_test)

# Create a dataset
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

dataloader_test = DataLoader(dataset_test, batch_size=args.batch_size, shuffle=True) # NOTE : maybe set pin_memory=True

## Training 

### Initialize Model, Optimizer and Learning Rate Scheduler for Fine-Tuning Routine

In [12]:
# Initialize model and optimizer
if not evaluate:
    model = BertForSequenceClassification.from_pretrained(args.pretuned_model_path, num_labels=2)

    if args.freeze: 
        print('freeze')
        for param in model.base_model.parameters():
            param.requires_grad  = False
        model.classifier = torch.nn.Linear(model.config.hidden_size, 2)

    if args.freeze: 
        optimizer = AdamW(model.classifier.parameters(), lr=args.learn_rate)
    else : 
        optimizer = AdamW(model.parameters(), lr=args.learn_rate)

    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=len(dataloader_train) * 0.05, num_training_steps=len(dataloader_train) * args.epochs)

    print(torch.cuda.memory_allocated())
    print(torch.cuda.memory_reserved())

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./models/mlm_model_manual and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0
0




### Accuracy Function for Fine-Tuning Routine

In [13]:

def calculate_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

### Configure hardware acceleration 

In [14]:
if not evaluate:
  torch.cuda.empty_cache()
  if torch.cuda.is_available():
    args.device = 'cuda'

  model.to(args.device)
  print(args.device)

cuda


### Fine-Tuning Setup and Routine

In [None]:
if not evaluate:
  # Setup progress bars 
  train_progress = tqdm(total=0, desc='Train Batches', leave=True)
  validation_progress = tqdm(total=0, desc='Validation Batches', leave=True)
  epoch_progress = tqdm(total=args.epochs, desc='Epoch', leave=True)

  # Configure early stopping 
  best_val_accuracy = 0.0
  patience = 3
  num_epochs_no_improvement = 0

  # Intialize loss function (Using cross entropy but with two classes for binary)
  loss_fn = torch.nn.CrossEntropyLoss()

  # Fine-Tuning Loop 
  for epoch in range(args.epochs):

    model.train() 
    total_train_loss = 0
    total_train_accuracy = 0 #NEW

    train_progress.reset(total=len(dataloader_train))
    validation_progress.reset(total=len(dataloader_val))

    # Iterate through training batches 
    for step, batch in enumerate(dataloader_train):
      b_input_ids, b_input_mask, b_labels = batch
      b_input_ids, b_input_mask, b_labels = b_input_ids.to(args.device), b_input_mask.to(args.device), b_labels.to(args.device)

      model.zero_grad()
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

      loss = outputs.loss # Extract loss 
      total_train_loss += loss.item()
      loss.backward() # Perform backpropogation using gradients of loss 

      logits = outputs.logits.detach().cpu().numpy()#NEW
      label_ids = b_labels.to('cpu').numpy()#NEW
      total_train_accuracy += calculate_accuracy(logits, label_ids)#NEW

      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
      optimizer.step()
      scheduler.step()

      train_progress.update(1)

    avg_train_loss = total_train_loss / len(dataloader_train)
    print(f'Epoch {epoch}: Average Training Loss: {avg_train_loss}')

    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0

    # Iterate through validation data
    for batch in dataloader_val:
      b_input_ids, b_input_mask, b_labels = batch
      b_input_ids, b_input_mask, b_labels = b_input_ids.to(args.device), b_input_mask.to(args.device), b_labels.to(args.device)

      with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

      logits = outputs.logits.detach().cpu().numpy() # Extract raw prediction logits 
      label_ids = b_labels.to('cpu').numpy() # move logits to cpu so that they can be passed to calculate accuracy function which uses numpy 
      total_eval_loss += loss_fn(outputs.logits.squeeze(-1), b_labels).item() # compute cross entropy loss 

      total_eval_accuracy += calculate_accuracy(logits, label_ids)

      validation_progress.update(1)

    avg_val_accuracy = total_eval_accuracy / len(dataloader_val)
    print(f'Epoch {epoch}: Validation Accuracy: {avg_val_accuracy}')

    # Checkpointing and Early Stopping
    if avg_val_accuracy > best_val_accuracy:
        print(f'Validation accuracy improved from {best_val_accuracy} to {avg_val_accuracy}. Saving model...')
        best_val_accuracy = avg_val_accuracy
        num_epochs_no_improvement = 0
        # Save the model using save_pretrained
        model.save_pretrained(args.model_save_path)
    else:
        num_epochs_no_improvement += 1
        if num_epochs_no_improvement >= args.patience:
            print("Early stopping triggered.")
            break  # Exit the training loop

    epoch_progress.update(1)


    

## Evaluation

### Manual Checking

In [19]:
def classify_sentence_pair(sentence1, sentence2, model, tokenizer):
    # Prepare the input sentence pair
    inputs = tokenizer(sentence1, sentence2, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # Convert logits to probabilities
    # takes shape [[x ,y]] where x is probability of unrelated and y is probability or related
    probabilities = torch.softmax(logits, dim=1)

    print(probabilities)

    # Convert probabilities to binary predictions
    predicted_class_id = torch.argmax(probabilities, dim=1).item()

    return predicted_class_id  # 1 for related, 0 for unrelated


In [20]:
# sentence1 = "The legislation was passed in 1999."
# sentence2 = "Legislation enacted in 1999 started a major reform."

sentence1 = ("They also rely on Oswego Laborers’ Local 214 Pension Fund v. Marine "
            "Midland Bank, 85 N.Y.2d 20, 623 N.Y.S.2d 529, 647 N.E.2d 741 (1996), which "
            "held that a plaintiff 'must demonstrate that the acts or practices have a "
            "broader impact on consumers at large.' Defs.’ Mem. at 14 (quoting Oswego "
            "Laborers’, 623 N.Y.S.2d 529, 647 N.E.2d at 744). As explained above, how-"
            "ever, Plaintiffs have adequately alleged that Defendants’ unauthorized "
            "use of the DEL MONICO’s name in connection with non-Ocinomled "
            "restaurants and products caused consumer harm or injury to the public, "
            "and that they had a broad impact on consumers at large inasmuch as "
            "such use was likely to cause consumer confusion. See, e.g., CommScope, "
            "Inc. of N.C. v. CommScope (U.S.A) Int’l Grp. Co., 809 F. Supp.2d 33, 38 "
            "(N.D.N.Y 2011) (<HOLDING>); New York City Triathlon, LLC v. NYC Triathlon" )

sentence2 = "holding that plaintiff stated a 349 claim where plaintiff alleged facts plausibly suggesting that defendant intentionally registered its corporate name to be confusingly similar to plaintiffs CommScope trademark"
sentence3 = "A logit is the raw output of the model's final layer, and it's a real number that can be positive, negative, or zero."
sentence4 = "holding that plaintiff stated a 349 claim where plaintiff alleged facts plausibly suggesting that defendant intentionally registered its corporate name to be confusingly similar to plaintiffs CommScope trademark, despite being unauthorized to do so."

prediction = classify_sentence_pair(sentence1, sentence2, model, tokenizer)
print(prediction)
print("Classified as:", "Related" if prediction == 1 else "Unrelated")

print('\n')

prediction = classify_sentence_pair(sentence1, sentence3, model, tokenizer)
print(prediction)
print("Classified as:", "Related" if prediction == 1 else "Unrelated")

print('\n')

prediction = classify_sentence_pair(sentence1, sentence4, model, tokenizer)
print(prediction)
print("Classified as:", "Related" if prediction == 1 else "Unrelated")


tensor([[0.3599, 0.6401]])
1
Classified as: Related


tensor([[0.9652, 0.0348]])
0
Classified as: Unrelated


tensor([[0.2597, 0.7403]])
1
Classified as: Related


### Evaluation metrics on Test data

In [12]:
def evaluate_sequence_pair_class(model_path,  title):
    '''
    Routine for evaluating model for sequence pair classification
    '''

    progress = tqdm(total=len(dataloader_test), desc='Train Batches', leave=True)

    # load model and tokenizer
    # model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path)


    # Check if cuda available
    if torch.cuda.is_available():
        # model.to('cuda')
        args.device = 'cuda'
    else:
        # model.to('cpu')
        args.device = 'cpu'

    print(args.device)

    model.to(args.device)

    model.eval()

    predictions, true_labels, all_logits = [], [], []

    print('Evaluating ' + f'[{title}]')
    print('============================================')


    with torch.no_grad(): # disable calculating gradients (more efficient for evaluation)
        for batch in dataloader_test:
            progress.update(1)

            input_ids, attention_mask, labels = tuple(t.to(args.device) for t in batch)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).flatten() # find index of max value in logits tensor (where each index corresponds to a binary class


            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
            all_logits.extend(logits.cpu().numpy())


    all_logits = np.concatenate([logits[np.newaxis, :] for logits in all_logits], axis=0)

    all_neg = all_logits[:,0]
    all_pos = all_logits[:,1]

    print("Average logit for negative class:", np.mean(all_neg))
    print("Average logit for positive class:", np.mean(all_pos))

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    print(f'Accuracy: {accuracy}')



### Textual Entailment Model

In [18]:
evaluate_sequence_pair_class('jimmyjz1127/single_te', 'Single TE Model')

Train Batches:   0%|          | 0/141 [00:00<?, ?it/s]

cuda
Evaluating [None]
Average logit for negative class: 1.3472387
Average logit for positive class: -1.2677445
Accuracy: 0.8293333333333334


### Selective Question & Answering Model

In [11]:
evaluate_sequence_pair_class('jimmyjz1127/qa_model', 'None')

Train Batches:   0%|          | 0/141 [00:00<?, ?it/s]

cuda
Evaluating [None]
Average logit for negative class: 4.690757
Average logit for positive class: -4.308231
Accuracy: 0.7995555555555556


### Sequential Multi-Task Model (Textual Entailment + Selective Question & Answering)

In [13]:
evaluate_sequence_pair_class('jimmyjz1127/multi_sequential', 'Multi-Task Sequential')

Train Batches:   0%|          | 0/141 [00:00<?, ?it/s]

cuda
Evaluating [None]
Average logit for negative class: 0.7068491
Average logit for positive class: -0.18210216
Accuracy: 0.7164444444444444


### Parallel Multi-Task Model (Textual Entailment + Selective Question & Answering)

In [15]:
evaluate_sequence_pair_class('jimmyjz1127/combined/multi_parallel', 'Multi-Task Parallel')

Train Batches:   0%|          | 0/141 [00:00<?, ?it/s]

cuda
Evaluating [None]
Average logit for negative class: 0.8133309
Average logit for positive class: -0.7324584
Accuracy: 0.8044444444444444


### Vanilla bert-base-uncased Model 

In [17]:
evaluate_sequence_pair_class('bert-base-uncased',  'bert-case-uncased')

Train Batches:   0%|          | 0/141 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda
Evaluating [bert-case-uncased]
Average logit for negative class: 0.43700904
Average logit for positive class: 0.18648519
Accuracy: 0.7373333333333333


### Parallel Multi-Task Model with MLM Further Pretraining

In [20]:
evaluate_sequence_pair_class('jimmyjz1127/multi_parallel_mlm',  'Parallel Multi-Task with MLM')

Train Batches:   0%|          | 0/141 [00:00<?, ?it/s]

cuda
Evaluating [Sequence Pair Classificaiton Evaluation Metrics]
Average logit for negative class: 1.0163242
Average logit for positive class: -1.1829501
Accuracy: 0.86


### casehold/legalbert Model (Chaldikis et al.)

In [21]:
evaluate_sequence_pair_class('casehold/legalbert',  'legalbert')

Train Batches:   0%|          | 0/141 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at casehold/legalbert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda
Evaluating [legalbert]
Average logit for negative class: 0.70902157
Average logit for positive class: 0.67972016
Accuracy: 0.6346666666666667


: 