# Load Data from JSON
convert data to dataframe and save it to csv file

In [1]:
import pandas
import pandas as pd
from pandas import DataFrame
from typing import List
import spacy, srsly
from spacy.training import docs_to_json, offsets_to_biluo_tags, biluo_tags_to_spans

path_judgment_train: str = r"../data/NER_TRAIN_JUDGEMENT.json"
path_preamble_train: str = r"../data/NER_TRAIN_PREAMBLE.json"

path_judgment_dev: str = r"../data/NER_DEV_JUDGEMENT.json"
path_preamble_dev: str = r"../data/NER_DEV_PREAMBLE.json"

# load data

In [2]:
def load_data():
    """
    Read the data from the json files and merge them together to one dataframe
    :return:
    """
    train_judgment_data: DataFrame = pd.read_json(path_judgment_train)
    train_preamble_data: DataFrame = pd.read_json(path_preamble_train)

    train_data: DataFrame = train_judgment_data.append(train_preamble_data)
    train_data.index = [i for i in range(len(train_data))]
    train_data.drop(["meta"], axis=1, inplace=True)

    dev_judgment_data: DataFrame = pd.read_json(path_judgment_dev)
    dev_preamble_data: DataFrame = pd.read_json(path_preamble_dev)

    dev_data: DataFrame = dev_judgment_data.append(dev_preamble_data)
    dev_data.index = [i for i in range(len(dev_data))]
    dev_data.drop(["meta"], axis=1, inplace=True)

    return train_data, dev_data

def transform_annotations(annotations: List) -> List:
    """
    Transform the Annotations from JSON format to a list of (start, end, label) tuples
    :param annotations: A List of annotations for one Sentence
    :return: A List of Tuples in format (start, end, label)
    """
    r_annotations: List = []
    for annotation in annotations:
      for res in annotation['result']:
        start = res['value']['start']
        end = res['value']['end']
        label = res['value']['labels'][0]
        r_annotations.append((start, end, label))

    return r_annotations

In [3]:
train_data, dev_data = load_data()

train_data["string"] = train_data["data"].map(lambda x: x["text"])
train_data['entities'] = train_data["annotations"].map(transform_annotations)

dev_data["string"] = dev_data["data"].map(lambda x: x["text"])
dev_data['entities'] = dev_data["annotations"].map(transform_annotations)

# drop unnecessary columns
train_data.drop(["annotations", "data", "id"], axis=1, inplace=True)
dev_data.drop(["annotations", "data", "id"], axis=1, inplace=True)
train_data.head()

  train_data: DataFrame = train_judgment_data.append(train_preamble_data)
  dev_data: DataFrame = dev_judgment_data.append(dev_preamble_data)


Unnamed: 0,string,entities
0,\n\n(7) On specific query by the Bench about a...,"[(90, 103, ORG), (267, 278, ORG)]"
1,"He was also asked whether Agya <span class=""hi...","[(26, 30, OTHER_PERSON), (101, 105, OTHER_PERS..."
2,"\n5.2 CW3 Mr Vijay Mishra , Deputy Manager, H...","[(13, 25, WITNESS), (44, 60, ORG), (62, 64, GP..."
3,You are hereby asked not to carry out any cons...,[]
4,The pillion rider T.V. Satyanarayana Murthy al...,"[(18, 43, OTHER_PERSON)]"


In [4]:
dev_data.head()

Unnamed: 0,string,entities
0,"True, our Constitution has no 'due process' cl...","[(10, 22, STATUTE), (108, 155, PRECEDENT), (16..."
1,(See Principles of Statutory Interpretation by...,"[(55, 65, JUDGE)]"
2,"Their Lordships have said -- ""It is a sound r...","[(101, 108, GPE), (134, 140, OTHER_PERSON)]"
3,"In para 13 of the plaint, it has been further ...","[(252, 262, DATE), (313, 318, GPE)]"
4,Counsel for appellants contended that who is t...,"[(169, 175, PROVISION), (275, 286, PROVISION),..."


In [5]:
print(len(train_data))
print(len(dev_data))

10995
1074


# Convert the raw data to IOB Format

In [6]:
nlp = spacy.load('en_core_web_sm')
train_docs = []
for index, row in train_data.iterrows():
    doc = nlp(row['string']) # the string data
    tags = offsets_to_biluo_tags(doc, row['entities'])
    entities = biluo_tags_to_spans(doc, tags)
    doc.ents = entities
    train_docs.append(doc)

In [7]:
dev_docs = []
for index, row in dev_data.iterrows():
    doc = nlp(row['string']) # the string data
    tags = offsets_to_biluo_tags(doc, row['entities'])
    entities = biluo_tags_to_spans(doc, tags)
    doc.ents = entities
    dev_docs.append(doc)


Digitally signed by:RAJENDER SINGH KARKI Signing..." with entities "[(22, 42, 'OTHER_PERSON'), (56, 66, 'DATE')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
 Likewise, In the power of attorney (exhibit P/11..." with entities "[(70, 84, 'OTHER_PERSON'), (98, 109, 'OTHER_PERSON...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

 (c) The aforesaid cheque was presented by the c..." with entities "[(93, 104, 'ORG'), (118, 127, 'GPE'), (226, 236, '...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
    Commercial Complex, Raj Bhavan Road, Hyderaba..." with entities "[(42, 51, 'GPE')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to c

In [8]:
print(type(train_docs[0]))
print(train_docs[0])
print(train_docs[0].ents)

<class 'spacy.tokens.doc.Doc'>


(7) On specific query by the Bench about an entry of Rs. 1,31,37,500 on deposit side of Hongkong Bank account of which a photo copy is appearing at p. 40 of assessee's paper book, learned authorised representative submitted that it was related to loan from broker, Rahul & Co. on the basis of his submission a necessary mark is put by us on that photo copy.
(Hongkong Bank, Rahul & Co.)


# Unpack the iob tokens from above
since docs is a list of <class 'spacy.tokens.doc.Doc'> it will be unpacked into a normal list

In [9]:
unpacked_train_data = []
for paragraph in docs_to_json(train_docs)['paragraphs']:
    tokenized = []
    iob = []
    for sentence in paragraph['sentences']:
        for token in sentence['tokens']:
            iob.append(token['ner'])
            tokenized.append(token['orth'])

    unpacked_train_data.append((tokenized, iob))
train_df: DataFrame= pd.DataFrame(unpacked_train_data, columns=['sentence', 'entities'])
train_data["sentence"] = train_df["sentence"]
train_data["biluo"] = train_df["entities"]
train_data.head()

Unnamed: 0,string,entities,sentence,biluo
0,\n\n(7) On specific query by the Bench about a...,"[(90, 103, ORG), (267, 278, ORG)]","[\n\n, (, 7, ), On, specific, query, by, the, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"He was also asked whether Agya <span class=""hi...","[(26, 30, OTHER_PERSON), (101, 105, OTHER_PERS...","[He, was, also, asked, whether, Agya, <, span,...","[O, O, O, O, O, U-OTHER_PERSON, O, O, O, O, O,..."
2,"\n5.2 CW3 Mr Vijay Mishra , Deputy Manager, H...","[(13, 25, WITNESS), (44, 60, ORG), (62, 64, GP...","[ \n, 5.2, CW3, Mr, Vijay, Mishra, ,, Deputy, ...","[O, O, O, O, B-WITNESS, L-WITNESS, O, O, O, O,..."
3,You are hereby asked not to carry out any cons...,[],"[You, are, hereby, asked, not, to, carry, out,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,The pillion rider T.V. Satyanarayana Murthy al...,"[(18, 43, OTHER_PERSON)]","[The, pillion, rider, T.V., Satyanarayana, Mur...","[O, O, O, B-OTHER_PERSON, I-OTHER_PERSON, L-OT..."


In [10]:
unpacked_dev_data = []
for paragraph in docs_to_json(dev_docs)['paragraphs']:
    tokenized = []
    iob = []
    for sentence in paragraph['sentences']:
        for token in sentence['tokens']:
            iob.append(token['ner'])
            tokenized.append(token['orth'])

    unpacked_dev_data.append((tokenized, iob))
dev_df: DataFrame= pd.DataFrame(unpacked_dev_data, columns=['sentence', 'entities'])
dev_data["sentence"] = dev_df["sentence"]
dev_data["biluo"] = dev_df["entities"]
dev_data.head()


Unnamed: 0,string,entities,sentence,biluo
0,"True, our Constitution has no 'due process' cl...","[(10, 22, STATUTE), (108, 155, PRECEDENT), (16...","[True, ,, our, Constitution, has, no, ', due, ...","[O, O, O, U-STATUTE, O, O, O, O, O, O, O, O, O..."
1,(See Principles of Statutory Interpretation by...,"[(55, 65, JUDGE)]","[(, See, Principles, of, Statutory, Interpreta...","[O, O, O, O, O, O, O, O, B-JUDGE, L-JUDGE, O, ..."
2,"Their Lordships have said -- ""It is a sound r...","[(101, 108, GPE), (134, 140, OTHER_PERSON)]","[Their, Lordships, have, said, --, , "", It, i...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"In para 13 of the plaint, it has been further ...","[(252, 262, DATE), (313, 318, GPE)]","[In, para, 13, of, the, plaint, ,, it, has, be...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,Counsel for appellants contended that who is t...,"[(169, 175, PROVISION), (275, 286, PROVISION),...","[Counsel, for, appellants, contended, that, wh...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [11]:
print(train_data["entities"][0])
print(train_data["biluo"][0])

[(90, 103, 'ORG'), (267, 278, 'ORG')]
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'L-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'L-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


## Check, whether the created tokens fit to the biluo tags

In [12]:
# check whether the IOB Tokens match the actual text
def filter_tokens(df: DataFrame, number_samples: int):
    """
    Check some samples of the data to check whether the samples are matching with the biluo tags
    :param df: dataframe containing data
    :param number_samples: number of samples
    :return: None
    """

    for index, row in df.iterrows():
        sentence = row["sentence"]
        iob = row["biluo"]
        print(sentence)

        assert len(sentence) == len(iob)
        for i, iob_token in enumerate(iob):
            if iob_token != 'O':
                word_token: str = sentence[i]
                print(word_token, iob_token)

        if index == number_samples:
            break

filter_tokens(train_data, 5)
filter_tokens(dev_data, 5)

['\n\n', '(', '7', ')', 'On', 'specific', 'query', 'by', 'the', 'Bench', 'about', 'an', 'entry', 'of', 'Rs', '.', '1,31,37,500', 'on', 'deposit', 'side', 'of', 'Hongkong', 'Bank', 'account', 'of', 'which', 'a', 'photo', 'copy', 'is', 'appearing', 'at', 'p.', '40', 'of', 'assessee', "'s", 'paper', 'book', ',', 'learned', 'authorised', 'representative', 'submitted', 'that', 'it', 'was', 'related', 'to', 'loan', 'from', 'broker', ',', 'Rahul', '&', 'Co.', 'on', 'the', 'basis', 'of', 'his', 'submission', 'a', 'necessary', 'mark', 'is', 'put', 'by', 'us', 'on', 'that', 'photo', 'copy', '.']
Hongkong B-ORG
Bank L-ORG
Rahul B-ORG
& I-ORG
Co. L-ORG
['He', 'was', 'also', 'asked', 'whether', 'Agya', '<', 'span', 'class="hidden_text', '"', 'id="span_5', '"', '>', 'CRA', 'No.326', '-', 'DB', 'of', '1998', '6</span', '>', 'Kaur', ',', 'mother', '-', 'in', '-', 'law', 'of', 'the', 'deceased', 'lived', 'separately', 'from', 'Tarlochan', 'Singh', '.']
Agya U-OTHER_PERSON
Kaur U-OTHER_PERSON
Tarlochan 

# Convert biluo to IOB

In [13]:
def biluo_to_iob(tags):
    """
    Reformat bilou tags to iob tags
    :param tags:  a list of tags
    :return: the list of iob tags
    """
    iob_tags = []
    for i, tag in enumerate(tags):
        if tag == "O":
            iob_tags.append(tag)
        elif tag.startswith("B-"):
            iob_tags.append("B" + tag[1:])
        elif tag.startswith("I-"):
            iob_tags.append(tag)
        elif tag.startswith("L-"):
            iob_tags.append("I" + tag[1:])
        elif tag.startswith("U-"):
            iob_tags.append("B" + tag[1:])
        else:
            raise ValueError("Invalid tag: {}".format(tag))
    return iob_tags

In [14]:
train_data['iob'] = train_data['biluo'].apply(lambda x: biluo_to_iob(x))
train_data.head()

Unnamed: 0,string,entities,sentence,biluo,iob
0,\n\n(7) On specific query by the Bench about a...,"[(90, 103, ORG), (267, 278, ORG)]","[\n\n, (, 7, ), On, specific, query, by, the, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"He was also asked whether Agya <span class=""hi...","[(26, 30, OTHER_PERSON), (101, 105, OTHER_PERS...","[He, was, also, asked, whether, Agya, <, span,...","[O, O, O, O, O, U-OTHER_PERSON, O, O, O, O, O,...","[O, O, O, O, O, B-OTHER_PERSON, O, O, O, O, O,..."
2,"\n5.2 CW3 Mr Vijay Mishra , Deputy Manager, H...","[(13, 25, WITNESS), (44, 60, ORG), (62, 64, GP...","[ \n, 5.2, CW3, Mr, Vijay, Mishra, ,, Deputy, ...","[O, O, O, O, B-WITNESS, L-WITNESS, O, O, O, O,...","[O, O, O, O, B-WITNESS, I-WITNESS, O, O, O, O,..."
3,You are hereby asked not to carry out any cons...,[],"[You, are, hereby, asked, not, to, carry, out,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,The pillion rider T.V. Satyanarayana Murthy al...,"[(18, 43, OTHER_PERSON)]","[The, pillion, rider, T.V., Satyanarayana, Mur...","[O, O, O, B-OTHER_PERSON, I-OTHER_PERSON, L-OT...","[O, O, O, B-OTHER_PERSON, I-OTHER_PERSON, I-OT..."


In [15]:
dev_data['iob'] = dev_data['biluo'].apply(lambda x: biluo_to_iob(x))
dev_data.head()

Unnamed: 0,string,entities,sentence,biluo,iob
0,"True, our Constitution has no 'due process' cl...","[(10, 22, STATUTE), (108, 155, PRECEDENT), (16...","[True, ,, our, Constitution, has, no, ', due, ...","[O, O, O, U-STATUTE, O, O, O, O, O, O, O, O, O...","[O, O, O, B-STATUTE, O, O, O, O, O, O, O, O, O..."
1,(See Principles of Statutory Interpretation by...,"[(55, 65, JUDGE)]","[(, See, Principles, of, Statutory, Interpreta...","[O, O, O, O, O, O, O, O, B-JUDGE, L-JUDGE, O, ...","[O, O, O, O, O, O, O, O, B-JUDGE, I-JUDGE, O, ..."
2,"Their Lordships have said -- ""It is a sound r...","[(101, 108, GPE), (134, 140, OTHER_PERSON)]","[Their, Lordships, have, said, --, , "", It, i...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"In para 13 of the plaint, it has been further ...","[(252, 262, DATE), (313, 318, GPE)]","[In, para, 13, of, the, plaint, ,, it, has, be...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,Counsel for appellants contended that who is t...,"[(169, 175, PROVISION), (275, 286, PROVISION),...","[Counsel, for, appellants, contended, that, wh...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


## Save data to CSV file

In [16]:
to_drop = ["entities", "biluo"]
train_data.drop(to_drop, inplace=True, axis=1)
train_data.rename({"string": "sentence", "sentence": "words"}, axis=1, inplace=True)
train_data.head()

Unnamed: 0,sentence,words,iob
0,\n\n(7) On specific query by the Bench about a...,"[\n\n, (, 7, ), On, specific, query, by, the, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"He was also asked whether Agya <span class=""hi...","[He, was, also, asked, whether, Agya, <, span,...","[O, O, O, O, O, B-OTHER_PERSON, O, O, O, O, O,..."
2,"\n5.2 CW3 Mr Vijay Mishra , Deputy Manager, H...","[ \n, 5.2, CW3, Mr, Vijay, Mishra, ,, Deputy, ...","[O, O, O, O, B-WITNESS, I-WITNESS, O, O, O, O,..."
3,You are hereby asked not to carry out any cons...,"[You, are, hereby, asked, not, to, carry, out,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,The pillion rider T.V. Satyanarayana Murthy al...,"[The, pillion, rider, T.V., Satyanarayana, Mur...","[O, O, O, B-OTHER_PERSON, I-OTHER_PERSON, I-OT..."


### make train test split

In [17]:
train: DataFrame = train_data.sample(frac=0.8,random_state=3771)
test: DataFrame = train_data.drop(train.index)

print(len(train))
print(len(test))

8796
2199


In [18]:
save_path_train = "../data/train_data_iob.csv"
save_path_test = "../data/test_data_iob.csv"
train.to_csv(save_path_train)
test.to_csv(save_path_test)

In [19]:
to_drop = ["entities", "biluo"]
dev_data.drop(to_drop, inplace=True, axis=1)
dev_data.rename({"string": "sentence", "sentence": "words"}, axis=1, inplace=True)
dev_data.head()

Unnamed: 0,sentence,words,iob
0,"True, our Constitution has no 'due process' cl...","[True, ,, our, Constitution, has, no, ', due, ...","[O, O, O, B-STATUTE, O, O, O, O, O, O, O, O, O..."
1,(See Principles of Statutory Interpretation by...,"[(, See, Principles, of, Statutory, Interpreta...","[O, O, O, O, O, O, O, O, B-JUDGE, I-JUDGE, O, ..."
2,"Their Lordships have said -- ""It is a sound r...","[Their, Lordships, have, said, --, , "", It, i...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"In para 13 of the plaint, it has been further ...","[In, para, 13, of, the, plaint, ,, it, has, be...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,Counsel for appellants contended that who is t...,"[Counsel, for, appellants, contended, that, wh...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [20]:
save_path = "../data/dev_data_iob.csv"
dev_data.to_csv(save_path)

In [21]:
data = pd.read_csv(save_path, index_col=0)
data["iob"] = data["iob"].map(lambda x: eval(x))
data.head()

Unnamed: 0,sentence,words,iob
0,"True, our Constitution has no 'due process' cl...","['True', ',', 'our', 'Constitution', 'has', 'n...","[O, O, O, B-STATUTE, O, O, O, O, O, O, O, O, O..."
1,(See Principles of Statutory Interpretation by...,"['(', 'See', 'Principles', 'of', 'Statutory', ...","[O, O, O, O, O, O, O, O, B-JUDGE, I-JUDGE, O, ..."
2,"Their Lordships have said -- ""It is a sound r...","['Their', 'Lordships', 'have', 'said', '--', '...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"In para 13 of the plaint, it has been further ...","['In', 'para', '13', 'of', 'the', 'plaint', ',...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,Counsel for appellants contended that who is t...,"['Counsel', 'for', 'appellants', 'contended', ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [22]:
len(train)

8796

In [23]:
len(test)

2199