# For preparing fine-tuning data for MLM (Masked Language Modelling)

In [1]:
import os
import json
import re
import string
import random
import time
import datetime
import numpy as np
import pandas as pd
from argparse import Namespace

# from transformers import BertTokenizer
import transformers
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import pipeline
from transformers import BertTokenizer, DataCollatorForLanguageModeling

import torch
import random
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/jz75/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Prepare configurations for data preparation

In [19]:
args = Namespace(
    raw_data_filename = 'aus_sum_cases_train.json',
    raw_data_path = "./raw_data/summary_data/",
    processed_data_path = "./processed_data",
    max_segment_length = 512,
    max_samples = 4000,
    mode = 'a', # a : append, w : write,
)

## Ingest Raw Data

Format :
{\
    &nbsp;&nbsp; query\
    &nbsp;&nbsp; text \
    &nbsp;&nbsp; topic_label\
    &nbsp;&nbsp; secondary_query\
    &nbsp;&nbsp; label\
}

In [7]:
tokenizer = BertTokenizer.from_pretrained('casehold/legalbert')

### Using NLTK for tokenization

In [87]:
def extract_sentences_json(all_sentences):

    # with open('./raw_data/r_legaladvice_train.json', 'r') as json_file:
    #     json_data = json.load(json_file)
    with open('./raw_data/summary_data/aus_sum_cases_train.json', 'r') as json_file:
        json_data = json.load(json_file)
    counter = 0

    for item in json_data:
        if counter >= args.max_samples : break 

        sentences = nltk.sent_tokenize(item['text']) + nltk.sent_tokenize(item['query'])

        i = 0 
        while i < len(sentences):
            sentence = sentences[i]
            i += 1
            counter += 1

            tokens = tokenizer.tokenize(sentence)

            if len(tokens) >= 510 or len(tokens) < 5 : continue
            if counter >= args.max_samples : break 
            all_sentences.append(sentence)

    print(counter)
    print(len(all_sentences))

In [71]:
def extract_sentences_casehold(all_sentences):
    casehold_df = pd.read_csv('./processed_data/casehold_processed.csv')
    counter = 0

    for index,row in casehold_df.iterrows():
        
        if counter >= args.max_samples : break 

        sentences = nltk.sent_tokenize(row['context'])

        i = 0 
        while i < len(sentences):
            sentence = sentences[i]
            counter+=1
            i += 1

            tokens = tokenizer.tokenize(sentence)

            if len(tokens) >= 510 or len(tokens) < 5 : continue
            if counter >= args.max_samples : break 
            all_sentences.append(sentence)
    print(counter)
    print(len(all_sentences))

In [72]:
def extract_sentences_ledgar(all_sentences):
    df = pd.read_json('./processed_data/LEDGAR_2016-2019_clean.jsonl', lines=True)

    counter = 0
    for index,row in df.iterrows(): 
        
        if counter >= args.max_samples : break 

        sentences = nltk.sent_tokenize(row['provision'])

        i = 0 
        while i < len(sentences):
            sentence = sentences[i]

            i += 1
            counter += 1 
            tokens = tokenizer.tokenize(sentence)

            if len(tokens) >= 510 or len(tokens) < 5 : continue
            if counter >= args.max_samples : break 
            all_sentences.append(sentence)

    print(counter)
    print(len(all_sentences))

In [83]:
all_sentences = []

In [88]:
extract_sentences_json(all_sentences)

4000
14263


In [85]:
extract_sentences_casehold(all_sentences)

4000
7282


In [86]:
extract_sentences_ledgar(all_sentences)

4000
11270


## Process/Prepare Raw Data

In [89]:
df = pd.DataFrame(all_sentences, columns=['sentence'])

df.head()

Unnamed: 0,sentence
0,The title says most of it.
1,I just heard back from a friend of some people...
2,We’ll call them Cafe Mooby.
3,"Cafe Mooby stayed open until the last minute, ..."
4,"From my experiences with my company, I was all..."


In [90]:
len(df)

14263

In [91]:
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,sentence
0,(c) Roth IRA's.--Paragraph (4) of section 408A...
1,"For example, the State Constitution provides t..."
2,"See Capco of Summerville, Inc. v. J.H."
3,New York - Mother Terrorizing Family
4,at 701 (<HOLDING>); see also United Parcel Ser...


In [92]:
# Save data to file
df.to_csv(args.processed_data_path + '/sentences.csv', mode='w')