### Installations

In [2]:
# !pip install datasets
# !pip install nltk
# !pip install transformers
# !pip install torch
import warnings
warnings.filterwarnings("ignore")

In [22]:
import numpy as np  
import pandas as pd
import spacy

In [4]:
# load sectionized_data.csv
data = pd.read_csv('sectionized_data.csv')
data

Unnamed: 0,URL_id,Header,Body
0,https://www.elitigation.sg/gd/s/2024_SGHC_86,,
1,https://www.elitigation.sg/gd/s/2024_SGHC_86,IN THE GENERAL DIVISION OF THE HIGH COURT OF T...,\nSuit No 383 of 2020\nBetween\nLim Ing Haan\n...
2,https://www.elitigation.sg/gd/s/2024_SGHC_86,JUDGMENT,\n[Damages -- Assessment - Loss of future earn...
3,https://www.elitigation.sg/gd/s/2024_SGHC_86,Lim Ing Haan\nv\nTuan 'Abdu Qayyim bin Tuan Is...,\nGeneral Division of the High Court -- Suit N...
4,https://www.elitigation.sg/gd/s/2024_SGHC_86,See Kee Oon JAD:\nIntroduction,"\n1 The plaintiff, Ms Lim Ing Haan (""Ms Lim""),..."
...,...,...,...
152511,https://www.elitigation.sg/gd/s/2022_SGCA_22,Our decision\nOverview of the disciplinary pro...,\n31 In Iskandar bin Rahmat v Law Society of S...
152512,https://www.elitigation.sg/gd/s/2022_SGCA_22,Applicable law,\n32 The law on statutory interpretation is se...
152513,https://www.elitigation.sg/gd/s/2022_SGCA_22,Is the Council empowered to refer matters back...,\n34 We begin with the plain words of s 87. As...
152514,https://www.elitigation.sg/gd/s/2022_SGCA_22,Is there a prima facie case of sufficient grav...,\n48 Having found that the Council was entitle...


### Preprocess Data

In [5]:
# drop rows with NaN values, reset index
data = data.dropna().reset_index(drop=True)
data.head()

Unnamed: 0,URL_id,Header,Body
0,https://www.elitigation.sg/gd/s/2024_SGHC_86,IN THE GENERAL DIVISION OF THE HIGH COURT OF T...,\nSuit No 383 of 2020\nBetween\nLim Ing Haan\n...
1,https://www.elitigation.sg/gd/s/2024_SGHC_86,JUDGMENT,\n[Damages -- Assessment - Loss of future earn...
2,https://www.elitigation.sg/gd/s/2024_SGHC_86,Lim Ing Haan\nv\nTuan 'Abdu Qayyim bin Tuan Is...,\nGeneral Division of the High Court -- Suit N...
3,https://www.elitigation.sg/gd/s/2024_SGHC_86,See Kee Oon JAD:\nIntroduction,"\n1 The plaintiff, Ms Lim Ing Haan (""Ms Lim""),..."
4,https://www.elitigation.sg/gd/s/2024_SGHC_86,Facts,"\n2 As an interventional cardiologist, Ms Lim ..."


#### Extracting documents into .txt files for annotation

In [6]:
# # documents = data.groupby('URL_id')['URL_id'].agg(['unique'])
# documents = data.groupby('URL_id').agg({'Header': ' '.join, 'Body': ' '.join}).reset_index()
# # documents = documents.rename('text').reset_index()
# documents

In [7]:
# training_data = documents.iloc[:int(len(documents)*0.3)]
# training_data

In [8]:
# for i in range(0, len(training_data)):
#     header_text = str(training_data['Header'][i])
#     with open(f'documents_to_annotate/header{i+1}.txt', 'w') as file:
#         file.write(header_text)
#     body_text = str(training_data['Body'][i])
#     with open(f'documents_to_annotate/body{i+1}.txt', 'w') as file:
#         file.write(body_text)

#### Aggregate all training data into a list

In [9]:
import os
notebook_dir = os.getcwd()
notebook_dir

'c:\\Users\\Selina\\OneDrive\\Documents\\GitHub\\case-judgment-analysis\\NamedEntityRecognition'

In [10]:
import json

directory = os.path.join(notebook_dir, 'content\jsonl_files')
print(directory)
training_data = []
os.chdir(directory)

for filename in os.listdir('.'):
    if filename.endswith('.jsonl'): 
        with open(filename, 'r') as file:
            data = json.load(file)
            training_data.append(data)

print(len(training_data))

c:\Users\Selina\OneDrive\Documents\GitHub\case-judgment-analysis\NamedEntityRecognition\content\jsonl_files
39


In [11]:
print(training_data[0])

{'text': '\n\nInternational Factors Leasing Pte Ltd v The Personal Representative of Tan Hock Kee & Others\n[2002] SGHC 270\n\nCase Number\n:\nSuit No 1443 of 2001, RA No 107 of 2002\nDecision Date\n:\n18 November 2002\nTribunal/Court\n:\nHigh Court\nCoram\n:\nWoo Bih Li JC\nCounsel Name(s)\n:\nSean Lim and Tan Aik How (Hin Tat & Partners) for the plaintiff; Hri Kumar and Gary Low (Drew & Napier LLC) for all the defendants\nParties\n:\n--\nCivil Procedure - Summary judgment - Application for summary judgment - Counterclaim by defendants - Application for stay of execution - Whether to grant stay of execution where counterclaim existing\nContract - Contractual terms - Use of excess payment by borrower towards reducing principal sum due\nJudgment\nGROUNDS OF DECISION\nBackground\n\n1. The Plaintiff International Factors Leasing Pte Ltd (\'IFL\') claims against the First and Second Defendants Tan Hock Kee (deceased) and THK Realty Pte Ltd (\'THK Realty\') payment of an outstanding loan an

In [43]:
for doc in training_data:
    labels = doc['label']
    entities = []
    for label in labels:
        entity = (label[0], label[1], label[2])
        entities.append(entity)
    
    doc["entities"] = entities
    doc["label"] = ""

print(training_data[0]['entities'])

[(2, 110, 'PRECEDENT'), (235, 248, 'JUDGE'), (267, 275, 'LAWYER'), (280, 291, 'LAWYER'), (336, 354, 'LAWYER'), (759, 806, 'PETITIONER'), (862, 874, 'RESPONDENT'), (890, 908, 'ORG'), (1004, 1017, 'RESPONDENT'), (1019, 1034, 'OTHER_PERSON'), (1039, 1050, 'OTHER_PERSON'), (1186, 1189, 'RESPONDENT'), (1996, 1999, 'ORG'), (2102, 2105, 'ORG'), (2302, 2305, 'ORG'), (2431, 2456, 'COURT'), (2465, 2526, 'PRECEDENT'), (2567, 2570, 'ORG'), (2700, 2703, 'ORG'), (2708, 2711, 'PETITIONER'), (2801, 2814, 'DATE'), (2948, 2960, 'DATE'), (2965, 2982, 'DATE'), (3369, 3379, 'PROVISION'), (3415, 3427, 'OTHER_PERSON'), (3432, 3442, 'ORG'), (3687, 3692, 'OTHER_PERSON'), (3832, 3835, 'ORG'), (3922, 3925, 'ORG'), (4049, 4052, 'ORG'), (4094, 4097, 'ORG'), (4156, 4164, 'OTHER_PERSON'), (4178, 4181, 'ORG'), (4396, 4399, 'ORG'), (4450, 4470, 'OTHER_PERSON'), (4503, 4506, 'ORG'), (4661, 4664, 'ORG'), (4685, 4690, 'OTHER_PERSON'), (4752, 4755, 'ORG'), (4879, 4890, 'OTHER_PERSON'), (4963, 4968, 'OTHER_PERSON'), (5000,

### spaCy

In [12]:
# !pip install -U spacy
# !pip install spacy_transformers

In [13]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

In [23]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     --------------------------------------- 0.5/587.7 MB 11.1 MB/s eta 0:00:53
     --------------------------------------- 2.5/587.7 MB 20.1 MB/s eta 0:00:30
     --------------------------------------- 4.5/587.7 MB 23.8 MB/s eta 0:00:25
      -------------------------------------- 8.1/587.7 MB 27.1 MB/s eta 0:00:22
      ------------------------------------- 10.0/587.7 MB 27.8 MB/s eta 0:00:21
      ------------------------------------- 11.5/587.7 MB 28.5 MB/s eta 0:00:21
      ------------------------------------- 13.3/587.7 MB 29.7 MB/s eta 0:00:20
      ------------------------------------- 14.8/587.7 MB 27.3 MB/s eta 0:00:22
     - ------------------------------------ 15.6/587.7 MB 24.2 MB/s eta 0:00:24
     - -----------------------

In [24]:
nlp = spacy.load("en_core_web_lg")
nlp

<spacy.lang.en.English at 0x10dbbba6460>

In [46]:
# Define a function to create spaCy DocBin objects from the annotated data
def get_spacy_doc(file, data):
    # Create a blank spaCy pipeline
    nlp = spacy.blank('en')
    db = DocBin()

    # Iterate through the data
    for text in tqdm(data):
        doc = nlp.make_doc(text['text'])
        annot = text['entities']

        ents = []
        entity_indices = []

        # Extract entities from the annotations
        for start, end, label in annot:
            skip_entity = False
            for idx in range(start, end):
                if idx in entity_indices:
                    skip_entity = True
                    break
            if skip_entity:
                continue

        entity_indices = entity_indices + list(range(start, end))
        try:
            span = doc.char_span(start, end, label=label, alignment_mode='strict')
        except:
            continue

        if span is None:
            # Log errors for annotations that couldn't be processed
            err_data = str([start, end]) + "    " + str(text) + "\n"
            file.write(err_data)
        else:
            ents.append(span)

        try:
            doc.ents = ents
            db.add(doc)
        except:
            pass

    return db

In [15]:
# # Split the annotated data into training and testing sets
# from sklearn.model_selection import train_test_split
# train, test = train_test_split(training_data, test_size=0.25)

# # Display the number of items in the training and testing sets
# len(train), len(test)

(29, 10)

In [44]:
training_data[0]

{'text': '\n\nInternational Factors Leasing Pte Ltd v The Personal Representative of Tan Hock Kee & Others\n[2002] SGHC 270\n\nCase Number\n:\nSuit No 1443 of 2001, RA No 107 of 2002\nDecision Date\n:\n18 November 2002\nTribunal/Court\n:\nHigh Court\nCoram\n:\nWoo Bih Li JC\nCounsel Name(s)\n:\nSean Lim and Tan Aik How (Hin Tat & Partners) for the plaintiff; Hri Kumar and Gary Low (Drew & Napier LLC) for all the defendants\nParties\n:\n--\nCivil Procedure - Summary judgment - Application for summary judgment - Counterclaim by defendants - Application for stay of execution - Whether to grant stay of execution where counterclaim existing\nContract - Contractual terms - Use of excess payment by borrower towards reducing principal sum due\nJudgment\nGROUNDS OF DECISION\nBackground\n\n1. The Plaintiff International Factors Leasing Pte Ltd (\'IFL\') claims against the First and Second Defendants Tan Hock Kee (deceased) and THK Realty Pte Ltd (\'THK Realty\') payment of an outstanding loan an

In [47]:
log_filepath = os.path.join(notebook_dir, 'trained_models', 'train_file.txt')
train_filepath = os.path.join(notebook_dir, 'trained_models', 'train_data.spacy')

# Open a file to log errors during annotation processing
file = open(log_filepath,'w')

# Create spaCy DocBin objects for training and testing data
db = get_spacy_doc(file, training_data)
db.to_disk(train_filepath)

# Close the error log file
file.close()

100%|██████████| 39/39 [00:17<00:00,  2.20it/s]


In [30]:
notebook_dir

'c:\\Users\\Selina\\OneDrive\\Documents\\GitHub\\case-judgment-analysis\\NamedEntityRecognition'

In [48]:
os.chdir(notebook_dir)
!python -m spacy init fill-config ./base_config.cfg ./config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# for clearing memory
import gc
gc.collect()

In [49]:
os.chdir(notebook_dir)
# train model
!python -m spacy train ./config.cfg --output ./output --paths.train ./trained_models/train_data.spacy --paths.dev ./trained_models/train_data.spacy

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00   3131.12    0.00    0.00    0.00    0.00
[38;5;3m⚠ Aborting and saving the final best model. Encountered exception:
MemoryError((508473, 768), dtype('float32'))[0m


Traceback (most recent call last):
  File "c:\Users\Selina\Anaconda3\lib\runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\Selina\Anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "c:\Users\Selina\Anaconda3\lib\site-packages\spacy\__main__.py", line 4, in <module>
    setup_cli()
  File "c:\Users\Selina\Anaconda3\lib\site-packages\spacy\cli\_util.py", line 87, in setup_cli
    command(prog_name=COMMAND)
  File "c:\Users\Selina\Anaconda3\lib\site-packages\click\core.py", line 1128, in __call__
    return self.main(*args, **kwargs)
  File "c:\Users\Selina\Anaconda3\lib\site-packages\typer\core.py", line 783, in main
    return _main(
  File "c:\Users\Selina\Anaconda3\lib\site-packages\typer\core.py", line 225, in _main
    rv = self.invoke(ctx)
  File "c:\Users\Selina\Anaconda3\lib\site-packages\click\core.py", line 1659, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "

Traceback (most recent call last):
  File "c:\Users\Selina\Anaconda3\lib\runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\Selina\Anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "c:\Users\Selina\Anaconda3\lib\site-packages\spacy\__main__.py", line 4, in <module>
    setup_cli()
  File "c:\Users\Selina\Anaconda3\lib\site-packages\spacy\cli\_util.py", line 87, in setup_cli
    command(prog_name=COMMAND)
  File "c:\Users\Selina\Anaconda3\lib\site-packages\click\core.py", line 1128, in __call__
    return self.main(*args, **kwargs)
  File "c:\Users\Selina\Anaconda3\lib\site-packages\typer\core.py", line 783, in main
    return _main(
  File "c:\Users\Selina\Anaconda3\lib\site-packages\typer\core.py", line 225, in _main
    rv = self.invoke(ctx)
  File "c:\Users\Selina\Anaconda3\lib\site-packages\click\core.py", line 1659, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00   3131.12    0.00    0.00    0.00    0.00
  5     200       1962.40  43611.36   55.81   85.71   41.38    0.56
 10     400       5644.59   1281.31   60.00   67.74   53.85    0.60
[38;5;3m⚠ Aborting and saving the final best model. Encountered exception:
MemoryError((34245, 256), dtype('float32'))[0m


### Evaluating model performance

In [50]:
df = pd.read_csv('sectionized_data.csv')
df = df.dropna().reset_index(drop=True)
sample_text = df.sample(1)
sample_text_header = sample_text['Header'].values[0]
sample_text_body = sample_text['Body'].values[0]
sample_text_header, sample_text_body

("The Prosecution's case",
 '\n12 The Prosecution\'s primary case (the "Primary Case") was that Manik had inflicted the injury on Rahim\'s left leg ("the Fatal Injury"), with what the charge specifies as a "chopper". This chopper was described by witnesses as a big knife, and is referred to in the same manner in these grounds of decision. On the Prosecution\'s case, on 24 September 2016, members of the syndicate met at the Canteen, discussing the action to be taken at a meeting with the rival syndicate later that evening. Choppers were distributed. Manik received one, which he then brought along to the anticipated meeting with the rival syndicate at Avenue 1. During the incident, Manik used that chopper to intentionally inflict the Fatal Injury on Rahim. Knowing that Rahim was a member of the rival syndicate, Manik wanted to send a message to the rival syndicate not to interfere with their business. Manik then brought the chopper he used back to the taxi. Various statements were attrib

In [51]:
colors = {"COURT":"#FFB6C1",
        "PETITIONER":"#FFDAB9",
        "RESPONDENT":"#FFA07A",
        "JUDGE":"#FFC0CB",
        "LAWYER":"#FFDEAD",
        "DATE":"#F0E68C",
        "ORGANIZATION":"#FF69B4",
        "GPE":"#20B2AA",
        "STATUTE":"#87CEFA",
        "PRECEDENT":"#ADD8E6",
        "CASE_NUMBER":"#B0E0E6",
        "WITNESS":"#87CEEB",
        "OTHER_PERSON":"#AFEEEE"}

options = {"colors":colors}

In [52]:
model = spacy.load('./output/model-best')
doc1 = model(sample_text_header)
spacy.displacy.render(doc1, style='ent', options=options, jupyter=True)

In [53]:
doc2 = model(sample_text_body)
spacy.displacy.render(doc2, style='ent', options=options, jupyter=True)