# Multi-label Legal Text Classification for CIA

## Data Augmentation

In [None]:
!pip install translators
!pip install transformers
!pip install sentencepiece

In [None]:
import pandas as pd
import os
import random
import time
import nltk
from functions.source_parsing import *
from multiprocessing import Pool
from tqdm import *
from transformers import MarianMTModel, MarianTokenizer
from sklearn.model_selection import train_test_split

In [2]:
# Move the working directory two levels up; later cells read and write files
# through relative "data/..." paths, which presumably live under that directory.
# NOTE(review): the target is relative, so re-running this cell climbs two more
# levels each time — run it once per kernel session.
os.chdir("../..")
# Show the resulting working directory as the cell output.
os.getcwd()

'/Users/janinedevera/Documents/School/MDS 2021-2023/Thesis/multilabel-legal-text-classification-CIA'

In [3]:
# Load the grouped legal texts; the first CSV column becomes the index.
# Only empty fields are treated as missing. `Category_New` contains the literal
# label "None", and pandas >= 2.0 lists "None" among its default NA tokens, which
# would silently turn that whole class into NaN (it would then vanish from
# value_counts() and from the labels used for stratification).
text_labels_grouped = pd.read_csv(
    "data/01 legal_texts_with_labels_grouped.csv",
    index_col=0,
    keep_default_na=False,
    na_values=[""],
)

In [4]:
# Store the character count of every text in a `Length` column, then display
# the largest count.
text_labels_grouped = text_labels_grouped.assign(
    Length=text_labels_grouped['Text'].str.len()
)
text_labels_grouped['Length'].max()

2256

In [5]:
def split_sentences(text):
    """Return the list of sentences that ``nltk.sent_tokenize`` finds in ``text``."""
    sentences = nltk.sent_tokenize(text)
    return sentences

# Upper bound on the character length of any text piece kept after splitting.
MAX_TEXT_CHARS = 500


def split_long_text(text, max_chars=MAX_TEXT_CHARS):
    """Break ``text`` into pieces of at most ``max_chars`` characters.

    The text is first split into sentences with ``split_sentences``. A sentence
    that is still longer than ``max_chars`` is cut into consecutive slices of
    ``max_chars`` characters (the last slice may be shorter); the cut is purely
    positional, so it can fall in the middle of a word.

    Args:
        text: The text to split.
        max_chars: Maximum number of characters per returned piece.

    Returns:
        List of text pieces in their original order.
    """
    pieces = []
    for sentence in split_sentences(text):
        if len(sentence) > max_chars:
            pieces.extend(
                sentence[start:start + max_chars]
                for start in range(0, len(sentence), max_chars)
            )
        else:
            pieces.append(sentence)
    return pieces


# One output row per text piece; every piece inherits the labels of its source row.
split_text_rows = []
for index, row in text_labels_grouped.iterrows():
    # Rows within the limit are kept whole (no sentence splitting at all).
    if row['Length'] > MAX_TEXT_CHARS:
        pieces = split_long_text(row['Text'])
    else:
        pieces = [row['Text']]

    for piece in pieces:
        split_text_rows.append([row['Law'], piece, row['Category'], row['Category_New']])

text_labels_split = pd.DataFrame(split_text_rows, columns=["Law", "Text", "Category", "Category_New"])


In [6]:
# Recompute character lengths on the split texts and keep only the pieces that
# are at least 5 characters long.
split_lengths = text_labels_split['Text'].str.len()
text_labels_split['Length'] = split_lengths
text_labels_split = text_labels_split[split_lengths.ge(5)]

In [19]:
# Class distribution of the split texts.
text_labels_split['Category_New'].value_counts()

A         1251
None       750
B          227
Others      71
C           59
Name: Category_New, dtype: int64

In [20]:
# Save the split texts to CSV (the DataFrame index is written as the first column).
text_labels_split.to_csv(
    path_or_buf="data/01 legal_texts_with_labels_grouped_split.csv"
)

In [24]:
# 70/30 split stratified on `Category_New`; the fixed random_state makes the
# partition repeatable.
train_df, test_df = train_test_split(
    text_labels_split,
    test_size=0.3,
    random_state=999,
    stratify=text_labels_split['Category_New'],
)

print(f"Train set size: {train_df.shape[0]}")
print(f"Test set size: {test_df.shape[0]}")


Train set size: 1650
Test set size: 708


### I. Back Translation

Define helper functions

In [21]:
# download language translators
def get_translator(lang, from_eng=True):
    """Load the Helsinki-NLP OPUS-MT model and tokenizer for one direction.

    Args:
        lang: Language code inserted into the checkpoint name (e.g. ``'fr'``).
        from_eng: When true, load ``Helsinki-NLP/opus-mt-en-<lang>``
            (English -> ``lang``); otherwise load
            ``Helsinki-NLP/opus-mt-<lang>-en`` (``lang`` -> English).

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    if from_eng:
        checkpoint = 'Helsinki-NLP/opus-mt-en-' + lang
        done_message = 'en to ' + lang + ' translator downloaded'
    else:
        checkpoint = 'Helsinki-NLP/opus-mt-' + lang + '-en'
        done_message = lang + ' to eng translator downloaded'

    # Tokenizer is loaded before the model, as in both original branches.
    translator_tokenizer = MarianTokenizer.from_pretrained(checkpoint)
    translator_model = MarianMTModel.from_pretrained(checkpoint)
    print(done_message)
    return translator_model, translator_tokenizer

# translate text
def run_translation(batch_text, model, tokenizer, language):
    """Translate a batch of texts with a model/tokenizer pair.

    Every text is prefixed with a ``>>language<<`` tag, the batch is tokenized
    with padding, passed to ``model.generate`` and decoded back to strings.

    Args:
        batch_text: Iterable of strings to translate.
        model: Object exposing ``generate(**encoded_inputs)``.
        tokenizer: Callable that encodes a list of strings (it is called with
            ``return_tensors="pt"`` and ``padding=True``) and exposes
            ``decode(tokens, skip_special_tokens=True)``.
        language: Language code placed inside the ``>>..<<`` prefix.

    Returns:
        List of translated strings, one per input text; ``[]`` for an empty batch.
    """
    batch_text = list(batch_text)
    # Nothing to translate: do not hand an empty batch to the tokenizer/model,
    # which are not guaranteed to accept one.
    if not batch_text:
        return []

    # NOTE(review): the Hugging Face MarianMT documentation describes the
    # ">>id<<" prefix as a requirement of multilingual-target checkpoints; the
    # single-pair checkpoints loaded in this notebook may not need it — confirm
    # that it does not leak into the translations.
    prefixed_texts = [">>{}<< {}".format(language, text) for text in batch_text]
    encoded = tokenizer(prefixed_texts, return_tensors="pt", padding=True)
    generated = model.generate(**encoded)
    return [tokenizer.decode(tokens, skip_special_tokens=True) for tokens in generated]

# back translation
def back_translation(model, tokenizer, model_back, tokenizer_back, batch_texts, original_language="en", temporary_language="fr"):
    """Translate ``batch_texts`` into a pivot language and back again.

    Args:
        model, tokenizer: Pair used for the original -> pivot translation.
        model_back, tokenizer_back: Pair used for the pivot -> original translation.
        batch_texts: Texts to back-translate.
        original_language: Language code passed to the return translation.
        temporary_language: Language code passed to the outbound translation.

    Returns:
        The list returned by the second ``run_translation`` call.
    """
    pivot_texts = run_translation(batch_texts, model, tokenizer, temporary_language)
    return run_translation(pivot_texts, model_back, tokenizer_back, original_language)

In [22]:
# languages
langs = [
    'ar',  # Arabic
    'bg',  # Bulgarian
    'cs',  # Czech
    'da',  # Danish
    'nl',  # Dutch
    'fi',  # Finnish
    'fr',  # French
    'de',  # German
    'hi',  # Hindi
    'cy',  # Welsh
    'id',  # Indonesian
    'it',  # Italian
    'ru',  # Russian
    'es',  # Spanish
    'sv',  # Swedish
    'sk',  # Slovak
]

In [11]:
# English -> lang models and tokenizers, keyed by language code.
models, tokenizers = {}, {}
for lang in langs:
    models[lang], tokenizers[lang] = get_translator(lang, from_eng=True)

# lang -> English models and tokenizers, loaded only after all forward pairs.
models_back, tokenizers_back = {}, {}
for lang in langs:
    models_back[lang], tokenizers_back[lang] = get_translator(lang, from_eng=False)



en to ar translator downloaded
en to bg translator downloaded
en to cs translator downloaded
en to da translator downloaded
en to nl translator downloaded
en to fi translator downloaded
en to fr translator downloaded
en to de translator downloaded
en to hi translator downloaded
en to cy translator downloaded
en to id translator downloaded
en to it translator downloaded
en to ru translator downloaded
en to es translator downloaded
en to sv translator downloaded
en to sk translator downloaded
ar to eng translator downloaded
bg to eng translator downloaded
cs to eng translator downloaded
da to eng translator downloaded
nl to eng translator downloaded
fi to eng translator downloaded
fr to eng translator downloaded
de to eng translator downloaded
hi to eng translator downloaded
cy to eng translator downloaded
id to eng translator downloaded
it to eng translator downloaded
ru to eng translator downloaded
es to eng translator downloaded
sv to eng translator downloaded
sk to eng translator dow

In [27]:
# Dedicated, seeded RNG so the pivot languages sampled for category "B" are the
# same on every run (same seed value as the train/test split's random_state).
AUGMENT_SEED = 999
# Number of pivot languages drawn for each category "B" text.
N_LANGS_CATEGORY_B = 5
lang_rng = random.Random(AUGMENT_SEED)

back_translated_rows = []
num_sentences_translated = 0
start_time = time.time()

for index, row in train_df.iterrows():
    law = row["Law"]
    category = row["Category"]
    category_new = row["Category_New"]
    text = row["Text"]

    # Amount of augmentation per class:
    #   "B"            -> a random subset of the pivot languages
    #   "C", "Others"  -> every pivot language
    #   anything else  -> no augmentation (original text only)
    if category_new == "B":
        lang_choices = lang_rng.sample(langs, N_LANGS_CATEGORY_B)
    elif category_new in ["C", "Others"]:
        lang_choices = langs
    else:
        lang_choices = []

    # The original text is always kept; back-translations are appended to it.
    back_translated_text_set = [text]
    for lang in lang_choices:
        back_translated_text_set += back_translation(
            models[lang],
            tokenizers[lang],
            models_back[lang],
            tokenizers_back[lang],
            [text],
            original_language="en",
            temporary_language=lang,
        )

    for back_translated_text in back_translated_text_set:
        back_translated_rows.append([law, back_translated_text, category, category_new])

    # Progress report every 50 processed rows (augmented or not).
    num_sentences_translated += 1
    if num_sentences_translated % 50 == 0:
        elapsed_time = time.time() - start_time
        elapsed_minutes = int(elapsed_time // 60)
        elapsed_seconds = int(elapsed_time % 60)
        print(f"{num_sentences_translated} sentences translated. Time elapsed: {elapsed_minutes} mins {elapsed_seconds} secs.")

back_translated_df = pd.DataFrame(back_translated_rows, columns=["Law", "Text", "Category", "Category_New"])




50 sentences translated. Time elapsed: 2 mins 35 secs.
100 sentences translated. Time elapsed: 7 mins 47 secs.
150 sentences translated. Time elapsed: 17 mins 44 secs.
200 sentences translated. Time elapsed: 25 mins 4 secs.
250 sentences translated. Time elapsed: 30 mins 10 secs.
300 sentences translated. Time elapsed: 36 mins 52 secs.
350 sentences translated. Time elapsed: 42 mins 40 secs.
400 sentences translated. Time elapsed: 46 mins 49 secs.
450 sentences translated. Time elapsed: 54 mins 25 secs.
500 sentences translated. Time elapsed: 58 mins 30 secs.
550 sentences translated. Time elapsed: 61 mins 47 secs.
600 sentences translated. Time elapsed: 68 mins 2 secs.
650 sentences translated. Time elapsed: 68 mins 27 secs.
700 sentences translated. Time elapsed: 73 mins 15 secs.
750 sentences translated. Time elapsed: 77 mins 0 secs.
800 sentences translated. Time elapsed: 83 mins 53 secs.
850 sentences translated. Time elapsed: 90 mins 43 secs.
900 sentences translated. Time elapse

Pre-process train and test sets

In [34]:
# Class distribution after augmentation.
back_translated_df['Category_New'].value_counts()

B         954
A         875
Others    850
C         697
None      525
Name: Category_New, dtype: int64

In [44]:
# Build the `text_clean` column in two passes. Both helpers are not defined in
# this notebook; presumably they come from the star import of
# `functions.source_parsing` — their exact behaviour is not visible here.
back_translated_df['text_clean'] = preprocess_corpus_keep_stop_words(back_translated_df['Text'])
back_translated_df['text_clean'] = [stem_lemmatize(text) for text in back_translated_df['text_clean']]

# Drop rows that repeat an earlier (Text, Category_New) pair, e.g. a
# back-translation that came out identical to its source text. De-duplication
# is on the raw `Text`, not on `text_clean`.
back_translated_df = back_translated_df.drop_duplicates(subset=['Text', 'Category_New'])

In [49]:
# Apply the same two cleaning passes to the held-out test set. In the cells
# shown, the test set is neither back-translated nor de-duplicated.
test_df['text_clean'] = preprocess_corpus_keep_stop_words(test_df['Text'])
test_df['text_clean'] = [stem_lemmatize(text) for text in test_df['text_clean']]

In [51]:
# Write the augmented training set and the test set to CSV (index included).
for frame, csv_path in (
    (back_translated_df, "data/01 train_data_augmented.csv"),
    (test_df, "data/01 test_data.csv"),
):
    frame.to_csv(csv_path)