# For preparing fine-tuning data for MLM (Masked Language Modelling)

In [1]:
import os
import json
import re
import string
import random
import time
import datetime
import numpy as np
import pandas as pd
from argparse import Namespace

# from transformers import BertTokenizer

import torch
import random
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/jz75/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Prepare configurations for data preparation

In [18]:
args = Namespace(
    raw_data_filename = 'aus_sum_cases_train.json',
    raw_data_path = "./raw_data/",
    processed_data_path = "./processed_data/",
    max_segment_length = 512,
    max_samples = 3000,
    mode = 'a', # a : append, w : write,
)

## Ingest Raw Data

Format :
{\
    &nbsp;&nbsp; query\
    &nbsp;&nbsp; text \
    &nbsp;&nbsp; topic_label\
    &nbsp;&nbsp; secondary_query\
    &nbsp;&nbsp; label\
}

### Using NLTK for tokenization

In [5]:
def extract_sentences_json(all_sentences):

    with open(args.raw_data_path + args.raw_data_filename, 'r') as json_file:
        json_data = json.load(json_file)
    counter = 0

    for item in json_data:
        counter += 1
        if len(all_sentences) > args.max_samples : break

        sentences = nltk.sent_tokenize(item['text']) + nltk.sent_tokenize(item['query'])

        i = 0
        while i < len(sentences):
            sentence = sentences[i]
    #         i = i + 5 # every 5 sentences for more uniform coverage of documents in data
            i += 1

            # if sentence/sample larger than 512 characters
            if len(sentence) > 512 or len(sentence.split(' ')) <= 10: continue
            # if we've reached maximum number of samples needed
            if len(all_sentences) >= args.max_samples: break

            all_sentences.append(sentence)

    print(counter)
    print(len(all_sentences))

In [6]:
def extract_sentences_casehold(all_sentences):
    casehold_df = pd.read_csv(args.processed_data_path + args.raw_data_filename)
    counter = 0

    for index,row in casehold_df.iterrows():
        counter+=1
        if len(all_sentences) >= args.max_samples: break

        sentences = nltk.sent_tokenize(row['context'])

        i = 0
        while i < len(sentences):
            sentence = sentences[i]

            i+=1

            # if sentence/sample larger than 512 characters
            if len(sentence) > 512 or len(sentence.split(' ')) <= 10: continue
            # if we've reached maximum number of samples needed
            if len(all_sentences) >= args.max_samples: break

            all_sentences.append(sentence)
    print(counter)
    print(len(all_sentences))

In [19]:

all_sentences = []

# extract sentences from data
if args.raw_data_filename.split('.').pop() == 'json':
    print('1')
    extract_sentences_json(all_sentences)
else:
    print('2')
    extract_sentences_casehold(all_sentences)



print(all_sentences[0:3])

1
18949
3000
["This Act may be cited as the ``Border Hospital Survival and Illegal \nImmigrant Care Act''.", 'The Congress finds as follows:\n            (1) Immigration is a Federal responsibility.', '(2) The Immigration and Naturalization Service does not \n        take into custody all aliens who are unlawfully present in the \n        United States.']


## Process/Prepare Raw Data

In [20]:
df = pd.DataFrame(all_sentences, columns=['sentence'])

df.head()

Unnamed: 0,sentence
0,This Act may be cited as the ``Border Hospital...
1,The Congress finds as follows:\n (1...
2,(2) The Immigration and Naturalization Service...
3,(4) The Southwest border region is ill-equippe...
4,(5) The Southwest border region has been desig...


In [14]:
df.head()

Unnamed: 0,sentence
0,"Drapeau’s cohorts, the cohort would be a “vict..."
1,Felony offenses that involve explosives qualif...
2,§ 924(e)(2)(B)(ii) (defining a “violent felony...
3,Courts have found possession of a'bomb to be a...
4,"See United States v. Newman, 125 F.3d 863 (10t..."


In [21]:
# Save data to file
df.to_csv(args.processed_data_path + '/sentences.csv', mode=args.mode)

In [22]:
dataframe = pd.read_csv(args.processed_data_path + '/sentences.csv')

dataframe = dataframe.sample(frac=1).reset_index(drop=True)

dataframe.to_csv(args.processed_data_path + '/sentences.csv', mode='w')