# Load Data from JSON
convert data to dataframe and save it to csv file

In [1]:
import pandas as pd
from pandas import DataFrame
from typing import List
import re

path_judgment_train: str = r"../data/NER_TRAIN_JUDGEMENT.json"
path_preamble_train: str = r"../data/NER_TRAIN_PREAMBLE.json"

path_judgment_dev: str = r"../data/NER_DEV_JUDGEMENT.json"
path_preamble_dev: str = r"../data/NER_DEV_PREAMBLE.json"

# load data

In [2]:
def load_data():
    """
    Read the data from the json files and merge them together to one dataframe
    :return:
    """
    train_judgment_data: DataFrame = pd.read_json(path_judgment_train)
    train_preamble_data: DataFrame = pd.read_json(path_preamble_train)

    train_data: DataFrame = train_judgment_data.append(train_preamble_data)
    train_data.index = [i for i in range(len(train_data))]
    train_data.drop(["meta"], axis=1, inplace=True)

    dev_judgment_data: DataFrame = pd.read_json(path_judgment_dev)
    dev_preamble_data: DataFrame = pd.read_json(path_preamble_dev)

    dev_data: DataFrame = dev_judgment_data.append(dev_preamble_data)
    dev_data.index = [i for i in range(len(dev_data))]
    dev_data.drop(["meta"], axis=1, inplace=True)

    return train_data, dev_data

def transform_annotations(annotations: List) -> List:
    """
    Transform the Annotations from JSON format to a list of (start, end, label) tuples
    :param annotations: A List of annotations for one Sentence
    :return: A List of Tuples in format (start, end, label)
    """
    r_annotations: List = []
    for annotation in annotations:
      for res in annotation['result']:
        label = res['value']['labels'][0]
        text = res['value']["text"]
        r_annotations.append((label, text))

    return r_annotations


def clean_up_texts(text) -> str:
    # remove all \n and  \r
    text = text.replace("\n", " ")
    text = text.replace("\r", " ")
    text = re.sub("<[^>]*>", "", text) #  replace all html tags
    text = re.sub(" +", " ", text) #  replace multiple spaces with one

    return text

def clean_up_entities(entities) -> str:
    clean_entities: List = []
    for text in entities:
        # remove all \n and  \r
        text = text.replace("\n", " ")
        text = text.replace("\r", " ")
        text = re.sub("<[^>]*>", "", text) #  replace all html tags
        text = re.sub(" +", " ", text) #  replace multiple spaces with one
        clean_entities.append(text)

    return clean_entities

In [3]:
train_data, dev_data = load_data()

# prepare train_data
train_data["string"] = train_data["data"].map(lambda x: x["text"])
train_data["string"] = train_data["string"].map(clean_up_texts)
train_data['raw_entities'] = train_data["annotations"].map(transform_annotations)
train_data['entities'] = train_data["raw_entities"].map(lambda x: [i[1] for i in x])
train_data['entities'] = train_data["entities"].map(clean_up_entities)
train_data["entity_names"] = train_data["raw_entities"].map(lambda x: [i[0] for i in x])


dev_data["string"] = dev_data["data"].map(lambda x: x["text"])
dev_data["string"] = dev_data["string"].map(clean_up_texts)
dev_data['entities'] = dev_data["annotations"].map(transform_annotations)
dev_data['raw_entities'] = dev_data["annotations"].map(transform_annotations)
dev_data['entities'] = dev_data["raw_entities"].map(lambda x: [i[1] for i in x])
dev_data['entities'] = dev_data["entities"].map(clean_up_entities)
dev_data["entity_names"] = dev_data["raw_entities"].map(lambda x: [i[0] for i in x])


# drop unnecessary columns
train_data.drop(["annotations", "data", "id", "raw_entities"], axis=1, inplace=True)
dev_data.drop(["annotations", "data", "id", "raw_entities"], axis=1, inplace=True)
train_data.head()

  train_data: DataFrame = train_judgment_data.append(train_preamble_data)
  dev_data: DataFrame = dev_judgment_data.append(dev_preamble_data)


Unnamed: 0,string,entities,entity_names
0,(7) On specific query by the Bench about an e...,"[Hongkong Bank, Rahul & Co.]","[ORG, ORG]"
1,He was also asked whether Agya CRA No.326-DB o...,"[Agya, Kaur, Tarlochan Singh]","[OTHER_PERSON, OTHER_PERSON, OTHER_PERSON]"
2,"5.2 CW3 Mr Vijay Mishra , Deputy Manager, HDF...","[Vijay Mishra, HDFC Bank, Noida, UP, HDFC Bank]","[WITNESS, ORG, GPE, ORG]"
3,You are hereby asked not to carry out any cons...,[],[]
4,The pillion rider T.V. Satyanarayana Murthy al...,[T.V. Satyanarayana Murthy],[OTHER_PERSON]


In [4]:
dev_data.head()

Unnamed: 0,string,entities,entity_names
0,"True, our Constitution has no 'due process' cl...","[Constitution, R.C. Cooper v. Union of India, ...","[STATUTE, PRECEDENT, PRECEDENT]"
1,(See Principles of Statutory Interpretation by...,[G.P. Singh],[JUDGE]
2,"Their Lordships have said -- ""It is a sound ru...","[England, Heydon]","[GPE, OTHER_PERSON]"
3,"In para 13 of the plaint, it has been further ...","[29/12/2004, Delhi]","[DATE, GPE]"
4,Counsel for appellants contended that who is t...,"[Rule 2, Section 172, Hyderabad Land Revenue A...","[PROVISION, PROVISION, STATUTE]"


In [5]:
classes = train_data["entity_names"].explode().unique().tolist()
classes_lookup = {classes[i]: i for i in range(0, len(classes))}
classes_lookup

{'ORG': 0,
 'OTHER_PERSON': 1,
 'WITNESS': 2,
 'GPE': 3,
 nan: 4,
 'STATUTE': 5,
 'DATE': 6,
 'PROVISION': 7,
 'COURT': 8,
 'PRECEDENT': 9,
 'CASE_NUMBER': 10,
 'PETITIONER': 11,
 'JUDGE': 12,
 'RESPONDENT': 13,
 'LAWYER': 14}

In [6]:
word_mapping = {
    'ORG': "an organization",
    'OTHER_PERSON': "a person",
    'WITNESS': "a witness",
    'GPE': "a location",
    'STATUTE': "a statute",
    'DATE': "a date",
    'PROVISION': "a provision",
    'COURT': "a curt",
    'PRECEDENT': "a precedent",
    'CASE_NUMBER': "a case number",
    'PETITIONER': "a petitioner",
    'JUDGE': "a judge",
    'RESPONDENT': "a respondent",
    'LAWYER': "a lawyer"
}

In [7]:
general_template = "<s> {} is {}" #
negative_template = "<s> no entities found"
def create_template_single(text, entities, entity_names):
    texts = []
    labels = []

    if len(entities) == 0:
        return [text], [negative_template]

    all_entities = [entity for entity in entities]
    all_tokens = [name for name in entity_names]

    assert len(all_entities) == len(all_tokens)
    for entity, token in zip(all_entities, all_tokens):
        texts.append(text)
        labels.append(general_template.format(entity, word_mapping[token]))

        # remove entity from texts
        text = text.replace(entity, "")
        text = re.sub("  +", " ", text)

    return texts, labels

In [8]:
all_texts = []
all_labels = []
all_texts_multi = []
all_labels_multi = []

for i, row in train_data.iterrows():
    texts, labels = create_template_single(text=row["string"], entities=row["entities"], entity_names=row["entity_names"])
    all_texts.extend(texts)
    all_labels.extend(labels)

In [9]:
data_single = DataFrame({"Source sentence": all_texts, "Answer sentence": all_labels})

In [10]:
data_single.head()

Unnamed: 0,Source sentence,Answer sentence
0,(7) On specific query by the Bench about an e...,<s> Hongkong Bank is an organization
1,(7) On specific query by the Bench about an e...,<s> Rahul & Co. is an organization
2,He was also asked whether Agya CRA No.326-DB o...,<s> Agya is a person
3,He was also asked whether CRA No.326-DB of 199...,<s> Kaur is a person
4,He was also asked whether CRA No.326-DB of 199...,<s> Tarlochan Singh is a person


In [11]:
all_texts = []
all_labels = []
all_texts_multi = []
all_labels_multi = []

for i, row in dev_data.iterrows():
    texts, labels = create_template_single(text=row["string"], entities=row["entities"], entity_names=row["entity_names"])
    all_texts.extend(texts)
    all_labels.extend(labels)

In [12]:
data_dev_single =  DataFrame({"Source sentence": all_texts, "Answer sentence": all_labels})

In [13]:
data_dev_single.head()

Unnamed: 0,Source sentence,Answer sentence
0,"True, our Constitution has no 'due process' cl...",<s> Constitution is a statute
1,"True, our has no 'due process' clause or the V...","<s> R.C. Cooper v. Union of India, (1970) 1 SC..."
2,"True, our has no 'due process' clause or the V...","<s> Maneka Gandhi v. Union of India, (1978) 1 ..."
3,(See Principles of Statutory Interpretation by...,<s> G.P. Singh is a judge
4,"Their Lordships have said -- ""It is a sound ru...",<s> England is a location


In [14]:
path = "../data/bart_train_data_single.csv"
path_2 = "../data/bart_dev_data_single.csv"
data_single.to_csv(path, index=False)
data_dev_single.to_csv(path_2, index=False)

# Create TestData

In [15]:
entity_name_mapping = {
    'ORG': "organization",
    'OTHER_PERSON': "person",
    'WITNESS': "witness",
    'GPE': "location",
    'STATUTE': "statute",
    'DATE': "date",
    'PROVISION': "provision",
    'COURT': "curt",
    'PRECEDENT': "precedent",
    'CASE_NUMBER': "case number",
    'PETITIONER': "petitioner",
    'JUDGE': "judge",
    'RESPONDENT': "respondent",
    'LAWYER': "lawyer"
}

def transform_test_data_names(names: List)-> List:
    if len(names) == 0:
        return ["no entities found"]
    else:
        return [entity_name_mapping[name]for name in names]

def transform_test_data_entities(entities: List)-> List:
    if len(entities) == 0:
        return ["no entities found"]
    else:
        return entities

In [16]:
test_data: DataFrame = dev_data.rename(columns={"string": "Source sentence", "entity_names": "names"}, inplace=False)
test_data["names"] = test_data["names"].map(transform_test_data_names)
test_data["entities"] = test_data["entities"].map(transform_test_data_entities)

test_data.head()

Unnamed: 0,Source sentence,entities,names
0,"True, our Constitution has no 'due process' cl...","[Constitution, R.C. Cooper v. Union of India, ...","[statute, precedent, precedent]"
1,(See Principles of Statutory Interpretation by...,[G.P. Singh],[judge]
2,"Their Lordships have said -- ""It is a sound ru...","[England, Heydon]","[location, person]"
3,"In para 13 of the plaint, it has been further ...","[29/12/2004, Delhi]","[date, location]"
4,Counsel for appellants contended that who is t...,"[Rule 2, Section 172, Hyderabad Land Revenue A...","[provision, provision, statute]"


In [17]:
path_test_data = "../data/bart_test_data.csv"
test_data.to_csv(path_test_data, index=False)

In [41]:
# Transform entities to IoB
sentences = test_data["Source sentence"].to_list()
entities = test_data["entities"].to_list()
entity_names = test_data["names"].to_list()

def get_iob_class_mask(e, c):
    c = c.replace(" ", "")
    class_token = "I-" + c.upper()
    return " ".join([class_token for _ in e])

def transform_to_iob(texts, entities, names):
    all_iob = []
    all_iob_text = []
    for text, entity, name in zip(texts, entities, names):
        original_text = text
        clean_text = re.sub('[^A-Za-z0-9 ()\[\]]+', ' ', text)
        clean_text = re.sub(' +', ' ', clean_text)

        # clean all unnecessary punctuation
        iob_list = clean_text.split(" ")

        for e, n in zip(entity, name):

            # if no entities were found skip
            if e == "no entities found":
                continue

            # clean entity
            clean_entity = re.sub('[^A-Za-z0-9 ()\[\]]+', ' ', e)
            clean_entity = re.sub(' +', ' ', clean_entity)

            # find the exact match in the texts
            match = re.search(re.escape(clean_entity), clean_text)
            if match is None:
                a = 0
            start, end = match.span()
            submatch = clean_text[start: end]

            # find where the entity is located by counting the whitespaces up to that point
            text_before_entity: str = clean_text[: start]
            num_spaces: int = text_before_entity.count(" ")
            number_entities = len(submatch.split(" "))

            for _ in range(number_entities):
                iob_list[num_spaces] = "I-" + n.replace(" ", "").upper()
                num_spaces += 1

                if num_spaces >= len(iob_list):
                    break

        all_iob_text.append(iob_list)
        all_iob.append(["O" if not token.startswith("I-") else token for token in iob_list])

    return all_iob, all_iob_text

results, texts = transform_to_iob(sentences, entities, entity_names)

In [42]:
for r in results:
    print(r)

['O', 'O', 'I-STATUTE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'O', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'I-PRECEDENT', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-JUDGE', 'I-JUDGE', 'I-JUDGE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

In [43]:
a = []
for t in results:
    a.extend(t)

print(set(a))

{'O', 'I-CURT', 'I-PETITIONER', 'I-PERSON', 'I-CASENUMBER', 'I-STATUTE', 'I-PROVISION', 'I-ORGANIZATION', 'I-JUDGE', 'I-RESPONDENT', 'I-DATE', 'I-WITNESS', 'I-PRECEDENT', 'I-LOCATION', 'I-LAWYER'}
