# legal document classifier

In [7]:
!pip install -q transformers datasets tokenizers

## dependencies



In [13]:
import tensorflow as tf
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score

## dataset


In [9]:
# setup and configuration: place the Kaggle credentials file where the kaggle CLI is run from below
# (expects kaggle.json to already be present in the working directory)
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/kaggle.json
# restrict the credentials file to owner read/write only
!chmod 600 ~/.kaggle/kaggle.json

In [10]:
# downloading data: fetch the dataset archive (a .zip) with the kaggle CLI
!kaggle datasets download -d amohankumar/legal-text-classification-dataset

Dataset URL: https://www.kaggle.com/datasets/amohankumar/legal-text-classification-dataset
License(s): apache-2.0
Downloading legal-text-classification-dataset.zip to /content
  0% 0.00/14.9M [00:00<?, ?B/s]
100% 14.9M/14.9M [00:00<00:00, 1.53GB/s]


In [11]:
# unzippiong the data
!unzip legal-text-classification-dataset.zip

Archive:  legal-text-classification-dataset.zip
  inflating: legal_text_classification.csv  


In [12]:
# list the working directory to confirm the archive and the extracted CSV are present
!ls

kaggle.json		       legal-text-classification-dataset.zip
legal_text_classification.csv  sample_data


In [14]:
# reading data into a data frame
# (relative path: the CSV is extracted into the working directory, so this
#  also works when the working directory is not Colab's /content)
data = pd.read_csv("legal_text_classification.csv")
data.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...


In [15]:
# (rows, columns) of the freshly loaded frame
data.shape

(24985, 4)

In [16]:
# number of missing values in each column
data.isnull().sum()

Unnamed: 0,0
case_id,0
case_outcome,0
case_title,0
case_text,176


In [17]:
# discard every row that has a missing value in any column, then re-count the nulls
data = data.dropna(axis=0, how="any")
data.isna().sum()

Unnamed: 0,0
case_id,0
case_outcome,0
case_title,0
case_text,0


In [18]:
# (rows, columns) after the rows with missing values were dropped
data.shape

(24809, 4)

In [19]:
# count rows that are exact duplicates of an earlier row
data.duplicated().sum()

np.int64(0)

In [20]:
# preview the first rows after cleaning
data.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...


In [21]:
# list the column names
data.columns

Index(['case_id', 'case_outcome', 'case_title', 'case_text'], dtype='object')

In [23]:
# how many cases fall under each case_outcome value
data["case_outcome"].value_counts()

Unnamed: 0_level_0,count
case_outcome,Unnamed: 1_level_1
cited,12110
referred to,4363
applied,2438
followed,2252
considered,1699
discussed,1018
distinguished,603
related,112
approved,108
affirmed,106


In [24]:
# encode each case_outcome string as an integer id (ids follow the sorted label order)
label_list = sorted(data["case_outcome"].dropna().unique().tolist())
label2id = {name: idx for idx, name in enumerate(label_list)}
id2label = {idx: name for name, idx in label2id.items()}
data["label"] = data["case_outcome"].map(label2id)
N_LABELS = len(label_list)
print("N labels:", N_LABELS)

N labels: 10


In [25]:
# preview the frame with the new integer label column
data.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text,label
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...,3
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...,3
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...,3
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...,3
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...,3


In [26]:
# drop the identifier column; errors="ignore" keeps the cell re-runnable
# once case_id has already been removed
data = data.drop(columns=["case_id"], errors="ignore")
data.head()

Unnamed: 0,case_outcome,case_title,case_text,label
0,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...,3
1,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...,3
2,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...,3
3,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...,3
4,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...,3


In [27]:
# class distribution again, this time by integer label id
data["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
3,12110
8,4363
1,2438
7,2252
4,1699
5,1018
6,603
9,112
2,108
0,106


## train, validation and test split (stratified)

In [28]:
# 80% train; the held-out 20% is then halved into validation and test (10% each).
# Both splits are stratified on the label so every subset keeps the class proportions.
train_df, temp_df = train_test_split(
    data, test_size=0.2, stratify=data["label"], random_state=21
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df["label"], random_state=21
)

print(train_df.shape, val_df.shape, test_df.shape)

(19847, 4) (2481, 4) (2481, 4)


## tokenizer and token length analysis


In [30]:
# name of the pretrained checkpoint whose tokenizer is loaded below
MODEL_NAME = "bert-base-uncased"
# load the tokenizer that belongs to MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [32]:
# inspecting token lengths to choose max_length
def token_length(texts, sample = 2000, percentiles = (50, 75, 90, 95, 99), seed = 21, encoder = None):
  """Report percentiles of the tokenized length over a random sample of texts.

  Args:
    texts: pandas Series of strings to measure.
    sample: maximum number of texts to draw; all of them are used when
      there are fewer than this.
    percentiles: percentile(s) in the range 0-100 to report.
    seed: random_state for the sampling, so repeated runs measure the same texts.
    encoder: object exposing ``encode(text, add_special_tokens=True)``;
      falls back to the notebook-level ``tokenizer`` when omitted.

  Returns:
    numpy array holding one token-length value per requested percentile
    (NaN-filled when there is nothing to measure).
  """
  if encoder is None:
    encoder = tokenizer
  n_texts = min(len(texts), sample)
  if n_texts <= 0:
    # nothing to tokenize: keep the result shape instead of failing inside np.percentile
    return np.full(np.shape(percentiles), np.nan)
  sample_texts = texts.sample(n_texts, random_state=seed).tolist()
  # add_special_tokens=True so the counts include the special tokens the tokenizer adds
  lengths = [
      len(encoder.encode(t, add_special_tokens=True)) for t in sample_texts
  ]
  return np.percentile(lengths, percentiles)

# measure the training split only, then show the resulting percentiles
token_len = token_length(texts=train_df["case_text"])
print("token length percentiles (50,75,90,95,99):", token_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (1289 > 512). Running this sequence through the model will result in indexing errors


token length percentiles (50,75,90,95,99): [ 317.    541.25 1012.2  1521.05 4118.65]
