In [103]:
#Loading libraries
import numpy as np
import pandas as pd
import os
import re

import sklearn
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
import torch
from torch import nn
from torch.utils.data import random_split, DataLoader,Dataset
import torch.optim as optim

from tqdm import tqdm

In [108]:
root = "/content/drive/MyDrive/Colab Notebooks/"

max_len = 512   #maximum token length
batch_size = 8
epochs = 3

Data Cleaning

In [105]:
df = pd.read_csv(os.path.join(root, "Legal Text", "legal_text_classification.csv"))

In [106]:
df.describe()

Unnamed: 0,case_id,case_outcome,case_title,case_text
count,24985,24985,24985,24809
unique,24985,10,18581,17920
top,Case1,cited,Minister for Immigration and Ethnic Affairs v ...,submitted that this Court should hold that the...
freq,1,12219,70,42


In [5]:
df.case_outcome.value_counts()

case_outcome
cited            12219
referred to       4384
applied           2448
followed          2256
considered        1712
discussed         1024
distinguished      608
related            113
affirmed           113
approved           108
Name: count, dtype: int64

In [6]:
df.isna().sum()     #Checking for missing values

case_id           0
case_outcome      0
case_title        0
case_text       176
dtype: int64

In [7]:
df = df.fillna('')   #Filling missing values with empty string

In [8]:
df.isna().sum()

case_id         0
case_outcome    0
case_title      0
case_text       0
dtype: int64

Data Processing

In [9]:
df['case_text_sum'] = df["case_title"] + " " + df["case_text"]    #Creating a new column by concatenating the case title and text

In [10]:
df.describe()

Unnamed: 0,case_id,case_outcome,case_title,case_text,case_text_sum
count,24985,24985,24985,24985.0,24985
unique,24985,10,18581,17921.0,24940
top,Case1,cited,Minister for Immigration and Ethnic Affairs v ...,,Gudjala People # 2 v Native Title Registrar [2...
freq,1,12219,70,176.0,5


In [11]:
df_cleaned = df.iloc[:,[1,4]]      #removing columns except label and case summed text

In [12]:
df_cleaned = df_cleaned.drop_duplicates()

In [13]:
df_cleaned.describe()

Unnamed: 0,case_outcome,case_text_sum
count,24940,24940
unique,10,24940
top,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...
freq,12198,1


In [14]:
chars = '''!-{}@#$%'"\|/^&*_'''   #List of characters to ne removed from text

In [15]:
def pre_process(text):
    text = re.sub(r"https?://\S+","",text)   #remove URL
    text = text.strip()  # remove leading and trailing whitespace
    text = " ".join(text.split()) #remove extra whitespace
    text = "".join([x for x in text if x not in chars])  #removing unneccesary characters
    return text.lower() #converting to lowercase and returning

In [None]:
df_cleaned["case_text_processed"] = df_cleaned["case_text_sum"].apply(pre_process)   #Pre processing text

In [17]:
df_cleaned.describe()

Unnamed: 0,case_outcome,case_text_sum,case_text_processed
count,24940,24940,24940
unique,10,24940,24938
top,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,nabd of 2002 v minister of immigration amp; mu...
freq,12198,1,2


In [18]:
df_cleaned[df_cleaned['case_text_processed'].duplicated(keep = False)]  #Duplication coming due to URL removal

Unnamed: 0,case_outcome,case_text_sum,case_text_processed
3630,cited,NABD of 2002 v Minister of Immigration &amp; M...,nabd of 2002 v minister of immigration amp; mu...
5762,cited,NABD of 2002 v Minister of Immigration &amp; M...,nabd of 2002 v minister of immigration amp; mu...
24039,referred to,"Construction, Forestry, Mining and Energy Unio...","construction, forestry, mining and energy unio..."
24043,referred to,"Construction, Forestry, Mining and Energy Unio...","construction, forestry, mining and energy unio..."


In [19]:
df_processed = df_cleaned.drop(columns=['case_text_sum']).drop_duplicates()    #Dropping duplicates

In [20]:
df_processed.reset_index(drop=True, inplace=True)   #resetting index

In [21]:
df_processed.describe()

Unnamed: 0,case_outcome,case_text_processed
count,24938,24938
unique,10,24938
top,cited,alpine hardwood (aust) pty ltd v hardys pty lt...
freq,12197,1


Tokenizing and Encoding

In [22]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')   #Downloading and assigning Bert tokenizer from HuggingFace

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [23]:
def bert_encoding(text):
    encoded_dict = tokenizer.encode_plus(
                                text,
                                add_special_tokens=True,  #For adding [CLS] and [SEP] tokens in each sequence
                                max_length=max_len,
                                padding='max_length',   #For adding 0 tokens at end for padding
                                return_attention_mask=True,
                                return_tensors='pt',
                                truncation=True     #For truncating sequences greater than maximum seq length
                           )
    return encoded_dict['input_ids'], encoded_dict['attention_mask']

In [24]:
tqdm.pandas()

In [25]:
res = df_processed["case_text_processed"].progress_apply(bert_encoding)  #Tokenizing and encoding text for Bert

100%|██████████| 24938/24938 [05:54<00:00, 70.43it/s] 


In [26]:
input_ids, attention_masks = zip(*list(res))   #unzipping res variable

In [27]:
input_ids = torch.cat(input_ids, dim=0)    #For converting iterable of tensors to tensor
attention_masks = torch.cat(attention_masks, dim=0)

Data Loading

In [28]:
le = LabelEncoder()
df_processed['labels'] = le.fit_transform(df_processed['case_outcome'])  #Convert case outcome to target labels

In [29]:
target_tensor = torch.tensor(df_processed['labels'],dtype = torch.int64)

In [30]:
class Cus_Dataset(Dataset):    #Custom Dataset class
    def __init__(self, input_ids, attention_masks, target_tensor):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.target = target_tensor

    def __len__(self):
        return len(self.target)

    def __getitem__(self,index):
        return self.input_ids[index], self.attention_masks[index], self.target[index]

In [31]:
dataset = Cus_Dataset(input_ids, attention_masks, target_tensor) #Creating custom dataset object for conversion to data loader

In [32]:
gen = torch.Generator()
gen.manual_seed(0)  #for geenrating a fixed split
train_df, test_df = random_split(dataset, [0.8, 0.2], generator=gen)   #Train test split

In [33]:
#Creating Data Loaders for train and test dataset
train_data_loader = DataLoader(train_df, batch_size=batch_size, shuffle=True)
test_data_loader = DataLoader(test_df, batch_size=batch_size, shuffle=True)

Fine Tuning pre-trained BERT model for classification

In [34]:
#Download and assign Bert model from Huggingface
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=10)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
# Define optimizer and learning rate scheduler
optimizer = optim.Adam(model.parameters())
total_steps = len(train_data_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
#Assigning model to device i.e. cpu or gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()    #Activate layers for training model

In [None]:
#Fine Tuning loop
for epoch in range(epochs):
    total_train_loss = 0

    for batch in tqdm(train_data_loader, desc=f"Epoch {epoch + 1}"):
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch

        optimizer.zero_grad()      #For zeroing previous parameter gradients

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()   #for parameter gradients
        #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()   #changing parameters through optimizer
        scheduler.step()    #changing learning rate

    avg_train_loss = total_train_loss / len(train_data_loader)
    print(f"Average training loss: {avg_train_loss}")

# Save the fine-tuned model
model.save_pretrained(os.path.join(root, "fine_tuned_bert_model"))
tokenizer.save_pretrained(os.path.join(root, "fine_tuned_bert_model"))

Epoch 1: 100%|██████████| 2494/2494 [30:23<00:00,  1.37it/s]


Average training loss: 1.6992024598283202


Epoch 2: 100%|██████████| 2494/2494 [30:23<00:00,  1.37it/s]


Average training loss: 1.631990006006806


Epoch 3: 100%|██████████| 2494/2494 [30:23<00:00,  1.37it/s]


Average training loss: 1.5934051329743508


Epoch 4:  59%|█████▉    | 1475/2494 [17:59<12:17,  1.38it/s]

Evaluation

In [107]:
#Reading fine tuned saved Bert model and tokenizer
tokenizer = BertTokenizer.from_pretrained(os.path.join(root, "fine_tuned_bert_model"))
model = BertForSequenceClassification.from_pretrained(os.path.join(root, "fine_tuned_bert_model"))

In [None]:
model.eval()   #For deactivating some layers in the model as it is used for inferencing only

In [100]:
#Evaluation loop
f1_per_batch = []
accuracy_per_batch = []

for batch_no, batch in enumerate(test_data_loader):

  batch = tuple(t.to(device) for t in batch)
  input_ids, attention_mask, labels = batch

  with torch.no_grad():                                       #For not calculating any gradients
    outputs = model(input_ids, attention_mask=attention_mask)

  y_predicted = torch.argmax(outputs.logits, dim=1).tolist()     #Getting predicted labels
  predicted_labels = le.inverse_transform(y_predicted)

  y_true = labels.tolist()

  #Calculating f1 and accuracy score for the predicted values
  f1 = f1_score(y_true = y_true, y_pred = y_predicted, average = 'weighted')
  accuracy = accuracy_score(y_true,y_predicted)

  f1_per_batch.append(f1)
  accuracy_per_batch.append(accuracy)

  if batch_no + 1 == 10:
    break

print(f"F1 Score : {sum(f1_per_batch)/len(f1_per_batch)}")
print(f"Accuracy : {sum(accuracy_per_batch)/len(accuracy_per_batch)}")

F1 Score : 0.27821345321345314
Accuracy : 0.4125
