# NLP Information Extraction: Preprocessing

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from tqdm.auto import tqdm
from pprint import pprint
import json


# Seed for the randomized steps; passed as random_state to the train/validation split below.
random_seed = 42

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load data/train.json into a DataFrame.
data = pd.read_json('data/train.json')

In [4]:
# Preview the first row to check which columns were loaded.
data.head(1)

Unnamed: 0,id,text,label,extracted_part
0,809436509,Извещение о проведении открытого конкурса в эл...,обеспечение исполнения контракта,{'text': ['Размер обеспечения исполнения контр...


In [5]:
# Flag rows whose first extracted 'text' entry is falsy (i.e. nothing was extracted).
data['empty_extraction'] = data['extracted_part'].apply(lambda part: not part['text'][0])
# Combine the label and the emptiness flag into one key so the split can stratify on both.
data['stratify_column'] = data.apply(
    lambda rec: ' '.join((rec.label, str(rec.empty_extraction))),
    axis=1,
)

In [6]:
# Hold out 10% of the rows for validation, keeping the label/emptiness mix equal in both parts.
train_df, val_df = train_test_split(
    data,
    test_size=0.1,
    random_state=random_seed,
    shuffle=True,
    stratify=data['stratify_column'],
)

In [7]:
# Preview one training row, including the two helper columns added for stratification.
train_df.head(1)

Unnamed: 0,id,text,label,extracted_part,empty_extraction,stratify_column
1152,713863700,Извещение о проведении электронного аукциона д...,обеспечение гарантийных обязательств,{'text': ['Размер обеспечения гарантийных обяз...,False,обеспечение гарантийных обязательств False


In [8]:
# Preview one validation row.
val_df.head(1)

Unnamed: 0,id,text,label,extracted_part,empty_extraction,stratify_column
1611,496636639,ПРОЕКТ Договор № п. Волоконовка «___» ________...,обеспечение гарантийных обязательств,"{'text': [''], 'answer_start': [0], 'answer_en...",True,обеспечение гарантийных обязательств True


In [9]:
# Convert each split to a Dataset (after discarding the shuffled pandas index)
# and collect them in a DatasetDict keyed by split name.
raw_dataset = DatasetDict({
    split_name: Dataset.from_pandas(split_df.reset_index(drop=True))
    for split_name, split_df in (('train', train_df), ('val', val_df))
})
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'extracted_part', 'empty_extraction', 'stratify_column'],
        num_rows: 1619
    })
    val: Dataset({
        features: ['id', 'text', 'label', 'extracted_part', 'empty_extraction', 'stratify_column'],
        num_rows: 180
    })
})

In [10]:
# Persist both splits under data/raw.
raw_dataset.save_to_disk('data/raw')

                                                                                               

## Label adjustment (inclusion of trailing underscores)

In [11]:
# Load the train and labelled test files as plain Python objects (json, not pandas),
# so the answer offsets can be edited record by record.
with open('data/train.json', 'rb') as train_file:
    train = json.load(train_file)
with open('data/test_with_labels.json', 'rb') as test_file:
    test_with_labels = json.load(test_file)

In [12]:
def find_first_non_underscore(text, answer_end):
    """Return the first index at or after ``answer_end`` that is not an underscore.

    Scans ``text`` rightwards starting at ``answer_end``. If only underscores
    remain, ``len(text)`` is returned; if ``answer_end`` is already at or past
    the end of ``text``, it is returned unchanged.
    """
    for pos in range(answer_end, len(text)):
        if text[pos] != '_':
            return pos
    # Either the scan ran off the end of the text, or there was nothing to scan.
    return max(answer_end, len(text))

# Counterpart for answer_start: scan leftwards (towards the beginning of the text) instead.
def find_last_non_underscore(text, answer_start):
    """Return the nearest index at or before ``answer_start`` that is not an underscore.

    Scans ``text`` leftwards starting at ``answer_start``. Returns -1 when every
    character from ``answer_start`` back to the beginning is an underscore.
    """
    pos = answer_start
    while pos >= 0:
        if text[pos] != '_':
            break
        pos -= 1
    return pos

def replace_underscores(dataset):
    """Widen each record's answer span to take in the underscores touching it.

    For every record the first ``answer_end`` / ``answer_start`` offsets in
    ``extracted_part`` are inspected:

    * if the character at ``answer_end`` is an underscore, the end is moved
      right to the first non-underscore position (or to ``len(text)``);
    * if the character at ``answer_start`` is an underscore, the start is moved
      left to the first underscore of that contiguous run.

    Offsets outside ``0 <= offset < len(text)`` are left untouched, so a span
    that ends exactly at the end of the text (or an empty text) is not an error.

    Args:
        dataset: iterable of dict records, each with a ``'text'`` entry and an
            ``'extracted_part'`` dict holding ``'answer_start'`` and
            ``'answer_end'`` lists.

    Returns:
        The same ``dataset`` object; the records are modified in place.

    Side effects:
        Prints how many end offsets and start offsets were changed.
    """
    end_replacements = 0
    start_replacements = 0

    for d in dataset:
        text = str(d['text'])
        span = d['extracted_part']
        answer_end = span['answer_end'][0]
        answer_start = span['answer_start'][0]

        # The bounds check matters: answer_end equals len(text) when the answer
        # runs to the very end of the text, and indexing there would raise.
        if 0 <= answer_end < len(text) and text[answer_end] == '_':
            new_end = find_first_non_underscore(text, answer_end)
            if new_end != answer_end:
                span['answer_end'][0] = new_end
                end_replacements += 1

        if 0 <= answer_start < len(text) and text[answer_start] == '_':
            # The helper returns the last NON-underscore index (or -1), so the
            # underscore run itself begins one position to the right of it.
            new_start = find_last_non_underscore(text, answer_start) + 1
            if new_start != answer_start:
                span['answer_start'][0] = new_start
                start_replacements += 1

        # NOTE(review): extracted_part['text'] is not refreshed to match the
        # widened offsets - confirm downstream code reads only the offsets.

    print(f'end_replacements: {end_replacements}')
    print(f'start_replacements: {start_replacements}')

    return dataset

In [13]:
# Adjust the answer offsets in both datasets; each call prints its own replacement counts.
train = replace_underscores(train)
test_with_labels = replace_underscores(test_with_labels)

end_replacements: 30
start_replacements: 0
end_replacements: 4
start_replacements: 0


In [14]:
# Write the adjusted records back out as JSON.
# ensure_ascii=False emits the non-ASCII text as-is, so the files are opened with an
# explicit UTF-8 encoding instead of whatever the platform's locale default happens to be.
with open('data/train_preprocessed.json', 'w', encoding='utf-8') as f:
    json.dump(train, f, ensure_ascii=False, indent=4)
with open('data/test_with_labels_preprocessed.json', 'w', encoding='utf-8') as f:
    json.dump(test_with_labels, f, ensure_ascii=False, indent=4)