# Multi-label Legal Text Classification for CIA

## Models and Experiments

### II. Adaptive Learning with Sentence BERT Models

In [None]:
!pip install -U sentence-transformers
!pip install transformers

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import csv
import gzip

In [14]:
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.datasets import DenoisingAutoEncoderDataset
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm
import math
import logging
from datetime import datetime

In [5]:
# Move two levels up so the relative "data/..." and "models/..." paths used by
# the cells below resolve. The guard makes the cell idempotent: once the
# sentence-level CSV is reachable from the working directory, re-running the
# cell no longer climbs two further directories.
if not os.path.exists("data/01 legal_texts_pipeline_sentence.csv"):
    os.chdir("../..")
os.getcwd()

'/Users/janinedevera/Documents/School/MDS 2021-2023/Thesis/multilabel-legal-text-classification-CIA'

#### a. Prepare data

In [14]:
# training data (by sentence)
sentence_csv_path = "data/01 legal_texts_pipeline_sentence.csv"
text = pd.read_csv(sentence_csv_path)

In [15]:
# Cast the cleaned sentences to str, store them back on the frame, and keep a
# plain Python list of them for the training dataset.
sentence_col = text['sentence_clean'].astype(str)
text['sentence_clean'] = sentence_col
text_list = sentence_col.tolist()

In [16]:
# Preview the first few sentences instead of echoing the whole corpus.
text_list[:10]

['v procedur and timefram for submit comment',
 'mci/imda would like seek view comment industri member public issu question',
 'cover page includ personal/compani particular contact inform b',
 'tabl content c. summari major point d. statement interest e. comment f. conclus',
 'all submiss written clearli concis provid reason explan propos revis',
 'where feasibl respond identifi specif provis psa comment explain basi propos',
 'all submiss reach mci/imda within week later decemb p.m .. respond adher timelin late submiss consid',
 'submiss soft copi microsoft word pdf format',
 'plea submit soft copi email subject “ public consult review postal servic act submitt ’ name individu ’ views/organis name organis ’ view ” postalregul',
 'mci/imda reserv right make public part written submiss disclos ident sourc',
 'respond may request confidenti treatment part submiss respond believ proprietari confidenti commerci sensit',
 'ani inform clearli mark place separ annex',
 'respond also requir s

#### b. Pre-training with TSDAE objective for domain adaptation

In [73]:
# Build the sentence encoder: a transformer module followed by a pooling
# module configured with the 'cls' mode.
model_name = 'bert-base-uncased'
word_embedding_model = models.Transformer(model_name)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim, 'cls')
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [74]:
# Wrap the sentences in the denoising dataset and batch them for training:
# batches of 8, shuffled, with the last incomplete batch dropped.
train_data = DenoisingAutoEncoderDataset(text_list)
loader = DataLoader(
    train_data,
    batch_size=8,
    shuffle=True,
    drop_last=True,
)

In [75]:
# TSDAE objective, built on the encoder above with tie_encoder_decoder enabled
loss = losses.DenoisingAutoEncoderLoss(
    model,
    tie_encoder_decoder=True,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.11.crossattention.self.value.bias', 'bert.encoder.layer.11.crossattention.self.key.bias', 'bert.encoder.layer.3.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.10.crossattention.self.key.bias', 'bert.encoder.

In [76]:
# TSDAE training hyper-parameters: (epochs, learning rate)
num_epochs, learning_rate = 5, 0.0001

In [77]:
# Output directory for the TSDAE run, stamped with the current date and time.
tsdae_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
model_tsdae_path = f'models/tsdae-{tsdae_timestamp}'

In [None]:
# Run the TSDAE pre-training: one (loader, loss) objective for num_epochs
# epochs, with weight decay disabled, the 'constantlr' scheduler, the learning
# rate passed through optimizer_params, and model_tsdae_path as output_path.
model.fit(
    train_objectives=[(loader, loss)],
    epochs=num_epochs,
    weight_decay=0,
    scheduler='constantlr',
    optimizer_params={'lr': learning_rate},
    show_progress_bar=True,
    output_path=model_tsdae_path
)

#### c. Fine-tuning with labeled data

In [79]:
# Fetch the STS benchmark archive only when it is not already on disk.
sts_dataset_path = 'data/stsb/stsbenchmark.tsv.gz'
sts_already_downloaded = os.path.exists(sts_dataset_path)

if not sts_already_downloaded:
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

In [80]:
# Load a TSDAE checkpoint saved by an earlier pre-training run.
trained_tsdae = SentenceTransformer('models/tsdae-2023-03-22_12-39/')

# The fine-tuning cells below build the loss on `model` and call `model.fit`.
# Rebind `model` to the loaded checkpoint so fine-tuning starts from the
# domain-adapted weights rather than from whatever `model` happens to hold in
# the current kernel session.
model = trained_tsdae

In [81]:
# Fine-tuning hyper-parameters: (batch size, epochs)
train_batch_size, num_epochs = 16, 5

# Output directory for the fine-tuned model, stamped with the current date and time.
sts_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
model_save_path = f'models/sbert_stsbenchmark-{sts_timestamp}'

In [82]:
# Read the STS benchmark file and split its rows into train / dev / test samples.
logging.info("Read STSbenchmark train dataset")

train_samples, dev_samples, test_samples = [], [], []
# Rows tagged 'dev' or 'test' go to their own list; every other row is training data.
samples_by_split = {'dev': dev_samples, 'test': test_samples}

with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as sts_file:
    for record in csv.DictReader(sts_file, delimiter='\t', quoting=csv.QUOTE_NONE):
        # Normalize score to range 0 ... 1
        normalized_score = float(record['score']) / 5.0
        example = InputExample(
            texts=[record['sentence1'], record['sentence2']],
            label=normalized_score,
        )
        samples_by_split.get(record['split'], train_samples).append(example)

In [83]:
# Shuffled batches of training pairs for fine-tuning.
train_dataloader = DataLoader(
    train_samples,
    batch_size=train_batch_size,
    shuffle=True,
)

In [84]:
# train loss
# NOTE(review): the loss wraps whatever `model` is bound to at this point, and
# the later .fit() call is made on `model` as well; `trained_tsdae` is not
# referenced here. Confirm `model` is the intended starting checkpoint.
train_loss = losses.CosineSimilarityLoss(model=model)

In [85]:
# Similarity evaluator built from the dev split.
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    name='sts-dev',
)

In [86]:
# configure training: warm up over 10% of all training steps
# (batches per epoch * number of epochs)
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_train_steps * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

In [16]:
# train model
# Fine-tune `model` on the (train_dataloader, train_loss) objective, passing
# the dev-set evaluator, evaluation_steps=1000, the warm-up step count computed
# above, and model_save_path as output_path.
# NOTE(review): `train_loss` must wrap this same `model` object — verify.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

Iteration: 100%|██████████| 360/360 [13:32<00:00,  2.26s/it]
Iteration: 100%|██████████| 360/360 [13:22<00:00,  2.23s/it]
Iteration: 100%|██████████| 360/360 [14:14<00:00,  2.37s/it]
Iteration: 100%|██████████| 360/360 [14:04<00:00,  2.34s/it]
Iteration: 100%|██████████| 360/360 [15:06<00:00,  2.52s/it]
Epoch: 100%|██████████| 5/5 [1:13:49<00:00, 885.94s/it]


In [None]:
# evaluation
# Reload the model from model_save_path (this rebinds `model`), build an
# evaluator from the test split, and run it with model_save_path as output_path.
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)

#### d. Calculate similarity scores

In [19]:
# specify trained model
# NOTE(review): the directory name is a hard-coded timestamp and is not derived
# from model_save_path; update it when the fine-tuning is re-run.
model_sbert_sts = SentenceTransformer('models/sbert_stsbenchmark-2023-03-23_09-50')

In [30]:
# Two five-sentence slices of the corpus to compare position by position.
sentences1, sentences2 = text_list[10:15], text_list[16:21]

# Embed the first slice, then the second, with the fine-tuned model.
embeddings1, embeddings2 = [
    model_sbert_sts.encode(sentence_batch, convert_to_tensor=True)
    for sentence_batch in (sentences1, sentences2)
]

# Full similarity matrix between the two slices.
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Only the diagonal is printed: sentence k of the first slice against
# sentence k of the second slice.
for k, left_sentence in enumerate(sentences1):
    right_sentence = sentences2[k]
    print("{} \t\t {} \t\t Score: {:.4f}".format(left_sentence, right_sentence, cosine_scores[k][k]))

respond may request confidenti treatment part submiss respond believ proprietari confidenti commerci sensit 		 mci/imda accept submiss request confidenti treatment substanti part submiss 		 Score: 0.8164
ani inform clearli mark place separ annex 		 for avoid doubt inform provid view express consult paper purpos discus consult 		 Score: 0.4503
respond also requir substanti reason request confidenti treatment 		 noth consult paper repres constitut decis made mci/imda 		 Score: 0.5547
if mci/imda grant confidenti treatment consid publicli disclos inform 		 the consult contempl consult paper without prejudic exercis power mci/imda psa subsidiari legisl thereund 		 Score: 0.6431
if mci/imda reject request confidenti treatment return inform respond submit consid inform part review 		 page annex a – public consult paper draft postal servic amend bill 		 Score: 0.5346


In [24]:
# Inspect the shape of the second batch of embeddings.
embeddings2.shape

torch.Size([5, 768])

In [None]:
# single list of sentences
sentences = ['The cat sits outside',
             'A man is playing guitar',
             'I love pasta',
             'The new movie is awesome',
             'The cat plays in the garden',
             'A woman watches TV',
             'The new movie is so great',
             'Do you like pizza?']

# Compute embeddings with the fine-tuned model specified for this section
# (`model_sbert_sts`) rather than with whatever `model` holds in the kernel.
embeddings = model_sbert_sts.encode(sentences, convert_to_tensor=True)

# Compute cosine-similarities for each sentence with each other sentence
cosine_scores = util.cos_sim(embeddings, embeddings)

# Collect every unordered pair (i < j) once, skipping self-pairs
n_sentences = len(cosine_scores)
pairs = [
    {'index': [i, j], 'score': cosine_scores[i][j]}
    for i in range(n_sentences - 1)
    for j in range(i + 1, n_sentences)
]

# Sort scores in decreasing order
pairs = sorted(pairs, key=lambda pair: pair['score'], reverse=True)

# Show the ten most similar pairs
for pair in pairs[0:10]:
    i, j = pair['index']
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i], sentences[j], pair['score']))

##### Test data

In [7]:
# Labelled legal texts; half of the rows are sampled with a fixed seed.
labelled_texts_csv = "data/01 legal_texts_with_labels.csv"
test_data = pd.read_csv(labelled_texts_csv)
test_sample = test_data.sample(frac=0.5, random_state=1)

# definitions
oecd_definitions_csv = "data/01 oecd_definitions.csv"
oecd_defs = pd.read_csv(oecd_definitions_csv)

In [44]:
# Columns pulled from the sampled test set and from the definitions table.
# Despite the *_list names these are pandas Series, not Python lists.
test_list, category_list = test_sample['Text'], test_sample['Category']
defs_list, defs_name = oecd_defs['text_clean'], oecd_defs['Sub_Cat']

Manual cosine similarity scores

In [42]:
# Display the sampled test rows.
test_sample

Unnamed: 0.1,Unnamed: 0,Law,Paragraph,Text,Category
108,108,Concession Contract - Rio de Janeiro/Galeão In...,10.7,"In the first five years, share transfer leadin...",A2
1143,1143,Law 7565/1986 - Brazilian Aeronautical Code,Art. 247,Any clause tending to exonerate the carrier fr...,
177,177,Notice of Auction 1/2018 (fifth round of airpo...,4.13\n4.14,Bidders must submit a bid bond in the form of ...,A4
927,927,Law 7565/1986 - Brazilian Aeronautical Code,Art. 65,The owner or operator of the aircraft that pro...,
737,737,Decree 2.256/1997 Brazilian Special Registry,Art. 4 par 3,"To be registered with the REB, foreign vessels...",A2
...,...,...,...,...,...
1201,1201,Regulation (RBAC) 154/2017 by National Civil A...,Art. 154.5 a,This regulation contains standards based on An...,
1108,1108,Law 7565/1986 - Brazilian Aeronautical Code,Art. 229,The passenger has the right to reimbursement f...,
670,670,Internal Regulation - SOE's - COMPANHIA DOCAS ...,5.3.13.4.15,The public notice may establish the requiremen...,A3
32,32,Notice of Auction 2/2011 (second round of airp...,4.14\n4.15,Bidders must submit a bid bond in the form of ...,A4


In [45]:
def calculate_sim_scores(list1, list2, defs_name, labels_true, model):
    """Score every text in ``list1`` against every definition in ``list2``.

    Each input collection is encoded once and a single cosine-similarity
    matrix is computed, instead of re-encoding every definition for every
    text and growing the result frame one row at a time.

    Parameters
    ----------
    list1 : iterable of str
        Texts to score (e.g. a list or a pandas Series).
    list2 : iterable of str
        Definition texts each text is compared with.
    defs_name : iterable
        Name of each definition, aligned position-by-position with ``list2``.
    labels_true : iterable
        Label of each text, aligned position-by-position with ``list1``.
    model : object
        Encoder exposing ``encode(sentences, convert_to_tensor=True)``.

    Returns
    -------
    pandas.DataFrame
        One row per (text, definition) pair with columns ``Text``,
        ``Label_Text``, ``Name``, ``Score`` and ``Label``. Rows come out in
        reverse iteration order (last text / last definition first), which is
        the order the earlier row-prepending implementation produced.
    """
    columns = ['Text', 'Label_Text', 'Name', 'Score', 'Label']

    # Pair by position and materialise as plain lists: a pandas Series keeps
    # its original (possibly non-contiguous) index, so positional lookups on
    # it are not safe. zip() also truncates to the shorter input, as before.
    texts_with_labels = list(zip(list1, labels_true))
    defs_with_names = list(zip(list2, defs_name))
    if not texts_with_labels or not defs_with_names:
        return pd.DataFrame(columns=columns)

    texts = [text for text, _ in texts_with_labels]
    definitions = [definition for definition, _ in defs_with_names]

    # Encode each side exactly once (batched inside the model).
    text_embeddings = model.encode(texts, convert_to_tensor=True)
    def_embeddings = model.encode(definitions, convert_to_tensor=True)

    # scores[i][j] = cosine similarity of text i and definition j, as floats.
    scores = util.cos_sim(text_embeddings, def_embeddings).tolist()

    records = [
        {'Text': text, 'Label_Text': definition, 'Name': name,
         'Score': scores[i][j], 'Label': label}
        for i, (text, label) in enumerate(texts_with_labels)
        for j, (definition, name) in enumerate(defs_with_names)
    ]
    # Keep the historical row order (newest pair first).
    records.reverse()

    return pd.DataFrame(records, columns=columns)

In [46]:
# Score every sampled test text against every definition with the fine-tuned model.
results = calculate_sim_scores(
    list1=test_list,
    list2=defs_list,
    defs_name=defs_name,
    labels_true=category_list,
    model=model_sbert_sts,
)

In [47]:
# Display the similarity-score table.
results

Unnamed: 0,Text,Label_Text,Name,Score,Label
0,"In public aerodromes, if the operator does not...",when govern deregul introduc market previous e...,D3,0.317244,
1,"In public aerodromes, if the operator does not...",regul make consum le will switch supplier affe...,D2,0.244662,
2,"In public aerodromes, if the operator does not...",regul sometim limit choic avail consum for exa...,D1,0.329156,
3,"In public aerodromes, if the operator does not...",in mani countri particular supplier econom sec...,C3,0.323133,
4,"In public aerodromes, if the operator does not...",regul requir market particip publish inform pr...,C2,0.262859,
...,...,...,...,...,...
14143,"In the first five years, share transfer leadin...",regul rais cost entri exit market tend discour...,A4,0.416960,A2
14144,"In the first five years, share transfer leadin...",govern limit abil certain supplier particip bu...,A3,0.520037,A2
14145,"In the first five years, share transfer leadin...",licens permit requir oper necessarili restrict...,A2,0.487934,A2
14146,"In the first five years, share transfer leadin...",grant exclus right produc certain good provid ...,A1,0.497762,A2


In [None]:
# pandas DataFrames are written with `to_csv`; there is no `write_csv` method.
scores_csv_path = "data/scores/02 sim_scores_bert.csv"
# Create the output folder first so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(scores_csv_path), exist_ok=True)
results.to_csv(scores_csv_path)
