In [1]:
import os
import json
import re
import string
import random
import time
import datetime

import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

from argparse import Namespace
from tqdm.notebook import tqdm

# from datasets import Dataset

import transformers
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import pipeline
from transformers import BertTokenizer, DataCollatorForLanguageModeling

import torch.nn.functional as F
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, Dataset

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

In [4]:
args = Namespace(
    data_path = './raw_data/ir_data/privacy_policy/policy_train_data.csv',
    pretuned_model_path = 'bert-base-uncased',
    model_save_path='./models/qa_model',
    num_samples=15000,
    batch_size = 16,
    learn_rate = 2e-5,
    epochs = 5,
    device='cpu',
    train_split=0.7,
    patience = 3,
    freeze=False
)

## Data Preparation 

In [8]:
tokenizer = BertTokenizer.from_pretrained('casehold/legalbert')

In [6]:
df = pd.read_csv(args.data_path, sep='\t')[:args.num_samples]

In [7]:
df.head()

Unnamed: 0,Folder,DocID,QueryID,SentID,Split,Query,Segment,Label
0,../../Dataset/Train/com.cake.browser,Cake Web Browser _1,Cake Web Browser _1_0,Cake Web Browser _1_0_0,train,do you keep the data of mine and upload to you...,"This privacy policy, with our Terms of Servic...",Irrelevant
1,../../Dataset/Train/com.cake.browser,Cake Web Browser _1,Cake Web Browser _1_0,Cake Web Browser _1_0_1,train,do you keep the data of mine and upload to you...,We encourage you to read this privacy policy c...,Irrelevant
2,../../Dataset/Train/com.cake.browser,Cake Web Browser _1,Cake Web Browser _1_0,Cake Web Browser _1_0_2,train,do you keep the data of mine and upload to you...,By using our application or other online servi...,Irrelevant
3,../../Dataset/Train/com.cake.browser,Cake Web Browser _1,Cake Web Browser _1_0,Cake Web Browser _1_0_3,train,do you keep the data of mine and upload to you...,"When we post changes to this privacy policy, ...",Irrelevant
4,../../Dataset/Train/com.cake.browser,Cake Web Browser _1,Cake Web Browser _1_0,Cake Web Browser _1_0_4,train,do you keep the data of mine and upload to you...,We encourage you to review this privacy policy...,Irrelevant


In [9]:
df['split'] = 'train'

num_val_rows = int(len(df) * (1 - args.train_split)//2) - 1

# 15% for validation and test each , remaining 70% for train
df.loc[:num_val_rows, 'split'] = 'val'
df.loc[num_val_rows: num_val_rows + num_val_rows, 'split'] = 'test'

print('Number of train samples : ' + str((df['split'] == 'train').sum()))
print('Number of val samples : ' + str((df['split'] == 'val').sum()))
print('Number of test samples : ' + str((df['split'] == 'test').sum()))


df.head()

Number of train samples : 10501
Number of val samples : 2249
Number of test samples : 2250


Unnamed: 0,Folder,DocID,QueryID,SentID,Split,Query,Segment,Label,split
0,../../Dataset/Train/com.cake.browser,Cake Web Browser _1,Cake Web Browser _1_0,Cake Web Browser _1_0_0,train,do you keep the data of mine and upload to you...,"This privacy policy, with our Terms of Servic...",Irrelevant,val
1,../../Dataset/Train/com.cake.browser,Cake Web Browser _1,Cake Web Browser _1_0,Cake Web Browser _1_0_1,train,do you keep the data of mine and upload to you...,We encourage you to read this privacy policy c...,Irrelevant,val
2,../../Dataset/Train/com.cake.browser,Cake Web Browser _1,Cake Web Browser _1_0,Cake Web Browser _1_0_2,train,do you keep the data of mine and upload to you...,By using our application or other online servi...,Irrelevant,val
3,../../Dataset/Train/com.cake.browser,Cake Web Browser _1,Cake Web Browser _1_0,Cake Web Browser _1_0_3,train,do you keep the data of mine and upload to you...,"When we post changes to this privacy policy, ...",Irrelevant,val
4,../../Dataset/Train/com.cake.browser,Cake Web Browser _1,Cake Web Browser _1_0,Cake Web Browser _1_0_4,train,do you keep the data of mine and upload to you...,We encourage you to review this privacy policy...,Irrelevant,val


In [11]:
mapping = {'Irrelevant' : 0, 'Relevant' : 1}
df['Label'] = df['Label'].map(mapping)
df.head()

Unnamed: 0,Folder,DocID,QueryID,SentID,Split,Query,Segment,Label,split
0,../../Dataset/Train/com.cake.browser,Cake Web Browser _1,Cake Web Browser _1_0,Cake Web Browser _1_0_0,train,do you keep the data of mine and upload to you...,"This privacy policy, with our Terms of Servic...",0,val
1,../../Dataset/Train/com.cake.browser,Cake Web Browser _1,Cake Web Browser _1_0,Cake Web Browser _1_0_1,train,do you keep the data of mine and upload to you...,We encourage you to read this privacy policy c...,0,val
2,../../Dataset/Train/com.cake.browser,Cake Web Browser _1,Cake Web Browser _1_0,Cake Web Browser _1_0_2,train,do you keep the data of mine and upload to you...,By using our application or other online servi...,0,val
3,../../Dataset/Train/com.cake.browser,Cake Web Browser _1,Cake Web Browser _1_0,Cake Web Browser _1_0_3,train,do you keep the data of mine and upload to you...,"When we post changes to this privacy policy, ...",0,val
4,../../Dataset/Train/com.cake.browser,Cake Web Browser _1,Cake Web Browser _1_0,Cake Web Browser _1_0_4,train,do you keep the data of mine and upload to you...,We encourage you to review this privacy policy...,0,val


In [12]:
####################################################
############## Setup Train Dataloader ##############
####################################################

encoded_data_train = [tokenizer.encode_plus(row['Query'], row['Segment'], add_special_tokens=True, max_length=512, pad_to_max_length=True, truncation=True) for index,row in df[df['split'] == 'train'].iterrows()]
input_ids_train = [item['input_ids'] for item in encoded_data_train]
attention_masks_train = [item['attention_mask'] for item in encoded_data_train]
labels_train = [row['Label'] for index,row in df[df['split'] == 'train'].iterrows()]

# Convert to tensors
input_ids_train = torch.tensor(input_ids_train)
attention_masks_train = torch.tensor(attention_masks_train)
labels_train = torch.tensor(labels_train)

# Create a dataset
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)

dataloader_train = DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True) # NOTE : maybe set pin_memory=True



In [21]:
####################################################
############## Setup Val Dataloader ################
####################################################

encoded_data_val = [tokenizer.encode_plus(row['Query'], row['Segment'], add_special_tokens=True, max_length=512, pad_to_max_length=True, truncation=True) for index,row in df[df['split'] == 'val'].iterrows()]
input_ids_val = [item['input_ids'] for item in encoded_data_val]
attention_masks_val = [item['attention_mask'] for item in encoded_data_val]
labels_val = [row['Label'] for index,row in df[df['split'] == 'val'].iterrows()]

# Convert to tensors
input_ids_val = torch.tensor(input_ids_val)
attention_masks_val = torch.tensor(attention_masks_val)
labels_val = torch.tensor(labels_val)

# Create a dataset
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

dataloader_val = DataLoader(dataset_val, batch_size=args.batch_size, shuffle=True) # NOTE : maybe set pin_memory=True



In [15]:
###################################################
############## Setup Test Dataloader ##############
###################################################

encoded_data_test = [tokenizer.encode_plus(row['Query'], row['Segment'], add_special_tokens=True, max_length=512, pad_to_max_length=True, truncation=True) for index,row in df[df['split'] == 'test'].iterrows()]
input_ids_test = [item['input_ids'] for item in encoded_data_test]
attention_masks_test = [item['attention_mask'] for item in encoded_data_test]
labels_test = [row['Label'] for index,row in df[df['split'] == 'test'].iterrows()]

# Convert to tensors
input_ids_test = torch.tensor(input_ids_test)
attention_masks_test = torch.tensor(attention_masks_test)
labels_test = torch.tensor(labels_test)

# Create a dataset
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

dataloader_test = DataLoader(dataset_test, batch_size=args.batch_size, shuffle=True) # NOTE : maybe set pin_memory=True

## Training

In [17]:
def compute_accuracy(y_pred, y_target):
    y_target = y_target.cpu()
    y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()#.max(dim=1)[1]
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100


def calculate_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [16]:
# Initialize model and optimizer
model = BertForSequenceClassification.from_pretrained(args.pretuned_model_path)

if args.freeze: 
    for param in model.base_model.parameters():
        param.requires_grad  = False


if args.freeze: 
    optimizer = AdamW(model.classifier.parameters(), lr=args.learn_rate)
else : 
    optimizer = AdamW(model.parameters(), lr=args.learn_rate)

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=len(dataloader_train) * 0.05, num_training_steps=len(dataloader_train) * args.epochs)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
torch.cuda.empty_cache()
if torch.cuda.is_available():
  args.device = 'cuda'

model.to(args.device)
print(args.device)

cuda


In [22]:
train_progress = tqdm(total=0, desc='Train Batches', leave=True)
validation_progress = tqdm(total=0, desc='Validation Batches', leave=True)
epoch_progress = tqdm(total=args.epochs, desc='Epoch', leave=True)

best_val_accuracy = 0.0
patience = 3
num_epochs_no_improvement = 0


loss_fn = torch.nn.CrossEntropyLoss()

for epoch in range(args.epochs):

  model.train()
  total_train_loss = 0
  total_train_accuracy = 0 #NEW

  train_progress.reset(total=len(dataloader_train))
  validation_progress.reset(total=len(dataloader_val))

  # Training Loop
  for step, batch in enumerate(dataloader_train):
    b_input_ids, b_input_mask, b_labels = batch
    b_input_ids, b_input_mask, b_labels = b_input_ids.to(args.device), b_input_mask.to(args.device), b_labels.to(args.device)

    model.zero_grad()
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

    loss = outputs.loss
    total_train_loss += loss.item()
    loss.backward()

    logits = outputs.logits.detach().cpu().numpy()#NEW
    label_ids = b_labels.to('cpu').numpy()#NEW
    total_train_accuracy += calculate_accuracy(logits, label_ids)#NEW

    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()

    train_progress.update(1)

  avg_train_loss = total_train_loss / len(dataloader_train)
  print(f'Epoch {epoch}: Average Training Loss: {avg_train_loss}')

  model.eval()
  total_eval_accuracy = 0
  total_eval_loss = 0

  # Validation Loop
  for batch in dataloader_val:
    b_input_ids, b_input_mask, b_labels = batch
    b_input_ids, b_input_mask, b_labels = b_input_ids.to(args.device), b_input_mask.to(args.device), b_labels.to(args.device)

    with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    logits = outputs.logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    total_eval_loss += loss_fn(outputs.logits.squeeze(-1), b_labels).item() # perhaps just outputs.loss (need to include labels as parameter in model() above)

    total_eval_accuracy += calculate_accuracy(logits, label_ids)

    validation_progress.update(1)

  avg_val_accuracy = total_eval_accuracy / len(dataloader_val)
  print(f'Epoch {epoch}: Validation Accuracy: {avg_val_accuracy}')

  # Checkpointing and Early Stopping
  if avg_val_accuracy > best_val_accuracy:
      print(f'Validation accuracy improved from {best_val_accuracy} to {avg_val_accuracy}. Saving model...')
      best_val_accuracy = avg_val_accuracy
      num_epochs_no_improvement = 0
      # Save the model using save_pretrained
      model.save_pretrained(args.model_save_path)
  else:
      num_epochs_no_improvement += 1
      if num_epochs_no_improvement >= args.patience:
          print("Early stopping triggered.")
          break  # Exit the training loop

  epoch_progress.update(1)


  

Train Batches: 0it [00:00, ?it/s]

Validation Batches: 0it [00:00, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 0: Average Training Loss: 0.16887832330072036
Epoch 0: Validation Accuracy: 0.9490248226950354
Validation accuracy improved from 0.0 to 0.9490248226950354. Saving model...
Epoch 1: Average Training Loss: 0.08187438722951848
Epoch 1: Validation Accuracy: 0.9535559495665878
Validation accuracy improved from 0.9490248226950354 to 0.9535559495665878. Saving model...
Epoch 2: Average Training Loss: 0.0449700141061355
Epoch 2: Validation Accuracy: 0.9583333333333334
Validation accuracy improved from 0.9535559495665878 to 0.9583333333333334. Saving model...
Epoch 3: Average Training Loss: 0.016689012151450622
Epoch 3: Validation Accuracy: 0.9605496453900709
Validation accuracy improved from 0.9583333333333334 to 0.9605496453900709. Saving model...
Epoch 4: Average Training Loss: 0.0064789490723486136
Epoch 4: Validation Accuracy: 0.9547872340425532


## Evaluation

In [26]:
from transformers import BertTokenizer, BertModel, BertConfig
def evaluate_sequence_pair_class(model_path,  title):
    '''
    Routine for evaluating model for sequence pair classification
    '''

    progress = tqdm(total=len(dataloader_test), desc='Train Batches', leave=True)

    # load model and tokenizer
    # model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)


    # Check if cuda available
    if torch.cuda.is_available():
        # model.to('cuda')
        args.device = 'cuda'
    else:
        # model.to('cpu')
        args.device = 'cpu'

    print(args.device)

    model.to(args.device)

    model.eval()

    predictions, true_labels = [], []

    print('Evaluating ' + f'[{title}]')
    print('============================================')


    with torch.no_grad(): # disable calculating gradients (more efficient for evaluation)
        for batch in dataloader_test:
            progress.update(1)

            input_ids, attention_mask, labels = tuple(t.to(args.device) for t in batch)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).flatten() # find index of max value in logits tensor (where each index corresponds to a binary class)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, acc = precision_recall_fscore_support(true_labels, predictions, average='binary')
    print(f'Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1 Score: {f1}')



In [24]:
evaluate_sequence_pair_class('./models/qa_model',  'Sequence Pair Classificaiton Evaluation Metrics')

Train Batches:   0%|          | 0/141 [00:00<?, ?it/s]

cuda
Evaluating [Sequence Pair Classificaiton Evaluation Metrics]
2
2
None
Accuracy: 0.9742222222222222
Precision: 0.696969696969697
Recall: 0.323943661971831
F1 Score: 0.4423076923076923


In [27]:
evaluate_sequence_pair_class('./models/sentence_pair_classification',  'Sequence Pair Classificaiton Evaluation Metrics')

Train Batches:   0%|          | 0/141 [00:00<?, ?it/s]

cuda
Evaluating [Sequence Pair Classificaiton Evaluation Metrics]
Accuracy: 0.9582222222222222
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
