### Converting datasets from BRAT-standoff to jsonlines format

In [None]:
!pip install -q jsonlines
import os
import random
from collections import Counter
from pathlib import Path

import jsonlines
import numpy as np
import pandas as pd
import spacy
from IPython.core.display import display, HTML
from spacy import displacy
from spacy.tokens import *
from tqdm.notebook import tqdm

In [None]:
def get_all_labels(directory, count=False):
    """Collect the entity labels used by the BRAT annotations in a directory.

    For every ``*.txt`` file directly inside ``directory`` the ``.ann`` file
    with the same stem is parsed with ``get_ann`` and the label of each
    text-bound annotation is recorded. Only the annotation files are read;
    no tokenization is needed to list labels.

    Args:
        directory: Directory holding ``.txt``/``.ann`` file pairs.
        count: When exactly ``True``, return label frequencies instead of the
            set of distinct labels.

    Returns:
        A ``dict`` mapping each label to its number of occurrences (in order
        of first appearance) if ``count`` is ``True``; otherwise a ``set`` of
        the distinct labels.
    """
    labels = []
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if os.path.isfile(f) and filename.endswith('.txt'):
            # The label is always the first field of an entry, however many
            # (start, end) fragments follow it.
            labels.extend(entry[0] for entry in get_ann(f[:-3] + 'ann'))
    if count is True:
        return dict(Counter(labels))
    return set(labels)
  

def get_colors(directory):
    """Map every label found by ``get_all_labels`` to a random hex colour.

    Args:
        directory: A dataset directory, or the path of a ``.txt`` file inside
            one (in which case the file's parent directory is scanned).

    Returns:
        A dict ``label -> '#RRGGBB'`` where each channel is drawn with
        ``random.randint(100, 220)``.
    """
    source_dir = os.path.dirname(directory) if directory.endswith('.txt') else directory
    palette = {}
    for label in get_all_labels(source_dir):
        # Three independent draws, in R, G, B order.
        channels = tuple(random.randint(100, 220) for _ in range(3))
        palette[label] = '#%02X%02X%02X' % channels
    return palette

def get_ann(path, with_token=False):
    """Parse the text-bound (``T``) annotations of a BRAT ``.ann`` file.

    Each line starting with ``T`` is expected to look like
    ``T<id>\\t<LABEL> <start> <end>[;<start> <end>...]\\t<covered text>``.
    Lines that do not start with ``T`` are ignored.

    Args:
        path: Path of the ``.ann`` file. It is read as UTF-8, the same
            encoding ``get_text`` uses for the matching ``.txt`` file.
        with_token: When true, the covered text (third tab-separated column)
            is appended to each entry.

    Returns:
        A list of entries ``[label, start1, end1, start2, end2, ...]`` with
        all offsets kept as strings, optionally followed by the covered text.
    """
    entries = []
    # The context manager guarantees the handle is closed even on a parse error.
    with open(path, 'r', encoding='utf8') as ann_file:
        for raw_line in ann_file:
            line = raw_line.rstrip('\n')
            if not line.startswith('T'):
                continue
            columns = line.split('\t')[1:]
            # Discontinuous spans separate fragments with ';'
            # ("10 15;20 25"); flatten them into one list of offsets.
            fields = columns[0].replace(';', ' ').split(' ')
            if with_token:
                fields.append(columns[1])
            entries.append(fields)
    return entries


def get_doc_file(path):
    """Build a spaCy ``Doc`` for a text file with its BRAT entities as spans.

    The text is loaded with ``get_text`` and the annotations with ``get_ann``
    from the ``.ann`` file sharing the same stem. Every (start, end) fragment
    of every annotation becomes its own labelled span, so a discontinuous
    entity with any number of fragments yields one span per fragment.

    Args:
        path: Path of the ``.txt`` file.

    Returns:
        The tokenized ``Doc`` with the spans stored under
        ``doc.spans['custom']``, or ``None`` when ``path`` is not an existing
        ``.txt`` file.

    Raises:
        ValueError: If an annotation does not carry a non-empty, even number
            of offsets after its label.
    """
    if not (os.path.isfile(path) and path.endswith('.txt')):
        return None

    nlp = spacy.blank('ru')
    doc = nlp(get_text(path))
    spans = []
    for entry in get_ann(path[:-3] + 'ann'):
        label, offsets = entry[0], entry[1:]
        if not offsets or len(offsets) % 2:
            raise ValueError(f'malformed annotation in {path}: {entry}')
        # Pair the flat offset list up as (start, end) fragments.
        for start, end in zip(offsets[0::2], offsets[1::2]):
            span = doc.char_span(int(start), int(end), label=label)
            # char_span yields None when the character offsets cannot be
            # turned into a token span; such fragments are skipped.
            if span is not None:
                spans.append(span)
    doc.spans['custom'] = spans
    return doc

def get_doc_dir(directory):
    """Return one annotated ``Doc`` per ``.txt`` file found in ``directory``.

    Entries are visited in ``os.listdir`` order; anything that is not a
    regular file ending in ``.txt`` is ignored. Each document is produced by
    ``get_doc_file``.
    """
    candidates = (
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith('.txt')
    )
    return [get_doc_file(candidate) for candidate in candidates if os.path.isfile(candidate)]

In [None]:
def max_len(filename):
    """Find the longest token sequence stored in a jsonlines file.

    Every object in the file is expected to have ``'tokens'`` and
    ``'entity_mentions'`` keys. Each time a new longest record is met, its
    length, tokens and entity mentions are printed.

    Args:
        filename: Path of the jsonlines file to scan.

    Returns:
        The largest ``len(obj['tokens'])`` seen, or ``0`` for an empty file.
    """
    longest = 0
    with jsonlines.open(filename, mode='r') as reader:
        for obj in reader:
            n_tokens = len(obj['tokens'])
            if n_tokens > longest:
                longest = n_tokens
                print(n_tokens)
                print(obj['tokens'])
                print(obj['entity_mentions'])
    # Hand the result back to the caller instead of only printing it.
    return longest

def get_text(path):
    """Read a UTF-8 text file as a single line of text.

    Every newline and every non-breaking space (``\\xa0``) is replaced by one
    ordinary space, so those replacements do not change the length of the
    text; trailing spaces are then stripped.

    Args:
        path: Path of the text file.

    Returns:
        The normalized text.
    """
    # Context manager so the handle is always closed.
    with open(path, 'r', encoding='utf8') as text_file:
        text = text_file.read()
    text = text.replace('\n', ' ').replace('\xa0', ' ')
    return text.rstrip(' ')

def get_sentences(path):
    """Split a UTF-8 text file into normalized 'sentences'.

    Files whose path contains ``'bio'`` are split on single newlines; all
    other files are split on blank lines (``'\\n\\n'``). Empty chunks are
    dropped, then in each remaining chunk non-breaking spaces and newlines
    are replaced by ordinary spaces and trailing spaces are stripped.

    Args:
        path: Path of the text file.

    Returns:
        The list of cleaned chunks, in file order.
    """
    separator = '\n' if 'bio' in path else '\n\n'
    # Context manager so the handle is always closed.
    with open(path, 'r', encoding='utf8') as text_file:
        chunks = text_file.read().split(separator)
    return [
        chunk.replace('\xa0', ' ').replace('\n', ' ').rstrip(' ')
        for chunk in chunks
        if chunk != ''
    ]

def get_sentence_anns(path, filter):
    """Produce per-sentence annotation records for one BRAT document.

    The document is tokenized as a whole by ``get_doc_file`` and split into
    sentences by ``get_sentences``. Each sentence is tokenized once, and every
    document span lying entirely inside the sentence's token window is
    re-expressed with sentence-relative token indices.

    Args:
        path: Path of the ``.txt`` file.
        filter: Collection of labels to keep, or ``None`` to keep every
            label. When a collection is given, a span labelled ``"DISO"``
            that is not in it is kept under the label ``"DISEASE"``; any
            other label not in the collection is dropped.

    Returns:
        A list of ``sent_to_json`` records, one per sentence that has at
        least one entity mention. Empty when ``path`` is not an existing
        ``.txt`` file.
    """
    if not (os.path.isfile(path) and path.endswith('.txt')):
        return []

    tokenizer = spacy.blank("ru")
    doc = get_doc_file(path)
    is_bio = 'bio' in path
    sent_anns = []
    bound_lo = 0
    for sentence in get_sentences(path):
        # Tokenize each sentence once and reuse the result for both the
        # window size and the emitted tokens.
        sent_tokens = [t.text for t in tokenizer(sentence)]
        bound_hi = bound_lo + len(sent_tokens)
        sent_spans = []
        for span in doc.spans['custom']:
            if span.start < bound_lo or span.end > bound_hi:
                continue
            label = str(span.label_)
            if filter is None or label in filter:
                out_label = span.label_
            elif label == "DISO":
                out_label = "DISEASE"
            else:
                continue
            sent_spans.append(
                [out_label, span.start - bound_lo, span.end - bound_lo, span.text]
            )
        sent_ann = sent_to_json(sent_tokens, sent_spans)
        if sent_ann['entity_mentions'] != []:
            sent_anns.append(sent_ann)
        # NOTE(review): outside the 'bio' corpus one extra document token is
        # skipped between sentences - presumably the whitespace token left by
        # the blank-line separator; confirm against the tokenizer output.
        bound_lo = bound_hi if is_bio else bound_hi + 1
    return sent_anns


def sent_to_json(tokens, spans):
    """Assemble the JSON-serializable record for a single 'sentence'.

    Args:
        tokens: The sentence tokens, stored as given.
        spans: Iterable of ``[label, start, end, text]`` sequences.

    Returns:
        ``{"tokens": tokens, "entity_mentions": [...]}`` where each mention
        has the label as ``str`` and ``start``/``end`` coerced to ``int``.
    """
    mentions = [
        {
            "entity_type": str(span[0]),
            "start": int(span[1]),
            "end": int(span[2]),
            "text": span[3],
        }
        for span in spans
    ]
    return {"tokens": tokens, "entity_mentions": mentions}

def file_to_json(path, filter):
    """Assemble the JSON-serializable record for a whole document.

    Args:
        path: Path of the ``.txt`` file.
        filter: Collection of labels to keep, or ``None`` to keep every
            label. When a collection is given, a span labelled ``"DISO"``
            that is not in it is emitted as ``"DISEASE"``; any other label
            not in the collection is dropped.

    Returns:
        ``{"tokens": [...], "entity_mentions": [...]}`` with document-level
        token indices, or ``None`` when ``path`` is not an existing ``.txt``
        file.
    """
    if not path.endswith('.txt') or not os.path.isfile(path):
        return None

    doc = get_doc_file(path)
    token_texts = [token.text for token in doc]
    mentions = []
    for span in doc.spans['custom']:
        label = str(span.label_)
        if filter is None or label in filter:
            entity_type = label
        elif label == "DISO":
            entity_type = "DISEASE"
        else:
            continue
        mentions.append({
            "entity_type": entity_type,
            "start": int(span.start),
            "end": int(span.end),
            "text": span.text,
        })
    return {"tokens": token_texts, "entity_mentions": mentions}

def valid_labeling(path, filter, debug=False):
    """Validate sentence-wise labeling against the full-document labeling.

    A file containing three or more consecutive newlines is rejected right
    away. Otherwise the number of entity mentions summed over the
    per-sentence records (``get_sentence_anns``) must equal the number of
    mentions in the whole-document record (``file_to_json``).

    Args:
        path: Path of the ``.txt`` file.
        filter: Label filter forwarded to ``get_sentence_anns`` and
            ``file_to_json``.
        debug: When true, print both sets of mentions and the two counts.

    Returns:
        ``(True, sent_anns)`` when the counts agree, ``(False, None)``
        otherwise.
    """
    # Read as UTF-8, the encoding get_text uses for the same files. Any run
    # of four or five newlines already contains a run of three, so a single
    # check is enough.
    with open(path, 'r', encoding='utf8') as reader:
        if '\n\n\n' in reader.read():
            # print('corrupted file detected with name', path)
            return False, None

    sent_anns = get_sentence_anns(path, filter)
    sum_ent = 0
    for ann in sent_anns:
        sum_ent += len(ann['entity_mentions'])

        if debug:
            for ent in ann['entity_mentions']:
                print(ent)

    doc_mentions = file_to_json(path, filter)['entity_mentions']

    if debug:
        print('################################')
        for mention in doc_mentions:
            print(mention)
        print(sum_ent, len(doc_mentions))

    if sum_ent == len(doc_mentions):
        return True, sent_anns
    return False, None

def dir_to_jsonlines(directory, filename, filter=None):
    """Convert every valid BRAT document in a directory to one jsonlines file.

    Each ``.txt`` file in ``directory`` is checked with ``valid_labeling``;
    the per-sentence records of valid files are written to ``filename`` and
    invalid files are only counted. A summary line is printed at the end.

    Args:
        directory: Directory holding ``.txt``/``.ann`` file pairs.
        filename: Path of the jsonlines file to write (overwritten).
        filter: Label filter forwarded to ``valid_labeling``; ``None`` keeps
            every label.
    """
    wrong_cnt = 0
    # The with-block closes the writer on exit; no explicit close is needed.
    with jsonlines.open(filename, mode='w') as writer:
        # A separate loop name keeps the output ``filename`` parameter intact.
        for entry in tqdm(os.listdir(directory)):
            f = os.path.join(directory, entry)
            if not (os.path.isfile(f) and entry.endswith('.txt')):
                continue
            valid, sent_anns = valid_labeling(f, filter)
            if valid:
                writer.write_all(sent_anns)
            else:
                wrong_cnt += 1
                # print(f'file {f} was labeled wrong, please check')

    print(f'Conversion of {directory} done, {wrong_cnt} files were invalid')

In [None]:
# Locations of the train/test/dev splits of both corpora.
nerel_train, nerel_test, nerel_dev = (
    'datasets/NEREL-v1.1/train',
    'datasets/NEREL-v1.1/test',
    'datasets/NEREL-v1.1/dev',
)
nerel_bio_train, nerel_bio_test, nerel_bio_dev = (
    'datasets/nerel-bio-v1.0/train',
    'datasets/nerel-bio-v1.0/test',
    'datasets/nerel-bio-v1.0/dev',
)

# Label frequencies measured on the training splits.
nerel_labels = get_all_labels(nerel_train, count=True)
nerel_bio_labels = get_all_labels(nerel_bio_train, count=True)

# Keep only labels occurring more than 50 times in the training split.
filtered_nerel_labels = {
    label for label, occurrences in nerel_labels.items() if occurrences > 50
}
filtered_nerel_bio_labels = {
    label for label, occurrences in nerel_bio_labels.items() if occurrences > 50
}

In [None]:
# Export each corpus with its own frequent labels. An existing output
# directory is taken to mean the export was already done and is left alone.
out_path_nerel = 'datasets/outputs/nerel'
if not os.path.exists(out_path_nerel):
    # makedirs also creates 'datasets/outputs' when it is missing, where
    # os.mkdir would raise FileNotFoundError.
    os.makedirs(out_path_nerel)
    dir_to_jsonlines(nerel_train, f'{out_path_nerel}/train.jsonl', filter=filtered_nerel_labels)
    dir_to_jsonlines(nerel_test, f'{out_path_nerel}/test.jsonl', filter=filtered_nerel_labels)
    dir_to_jsonlines(nerel_dev, f'{out_path_nerel}/dev.jsonl', filter=filtered_nerel_labels)
else:
    print(f'Directory {out_path_nerel} already exists')

out_path_nerel_bio = 'datasets/outputs/nerel_bio'
if not os.path.exists(out_path_nerel_bio):
    os.makedirs(out_path_nerel_bio)
    dir_to_jsonlines(nerel_bio_train, f'{out_path_nerel_bio}/train.jsonl', filter=filtered_nerel_bio_labels)
    dir_to_jsonlines(nerel_bio_test, f'{out_path_nerel_bio}/test.jsonl', filter=filtered_nerel_bio_labels)
    dir_to_jsonlines(nerel_bio_dev, f'{out_path_nerel_bio}/dev.jsonl', filter=filtered_nerel_bio_labels)
else:
    print(f'Directory {out_path_nerel_bio} already exists')

Directory datasets/outputs/nerel already exists
Directory datasets/outputs/nerel_bio already exists


In [None]:
# Labels frequent in both corpora, plus 'DISEASE'. The set union keeps
# 'DISEASE' unique even if it is already part of the intersection.
common_labels = sorted((filtered_nerel_labels & filtered_nerel_bio_labels) | {'DISEASE'})
print('Common labels btwn NEREL and NEREL-BIO:', common_labels)

# Export both corpora restricted to the shared label set. An existing output
# directory is taken to mean the export was already done and is left alone.
out_path_common_labels_nerel = 'datasets/outputs/nerel_common_labels'
if not os.path.exists(out_path_common_labels_nerel):
    # makedirs also creates 'datasets/outputs' when it is missing, where
    # os.mkdir would raise FileNotFoundError.
    os.makedirs(out_path_common_labels_nerel)
    dir_to_jsonlines(nerel_train, f'{out_path_common_labels_nerel}/train.jsonl', filter=common_labels)
    dir_to_jsonlines(nerel_test, f'{out_path_common_labels_nerel}/test.jsonl', filter=common_labels)
    dir_to_jsonlines(nerel_dev, f'{out_path_common_labels_nerel}/dev.jsonl', filter=common_labels)
else:
    print(f'Directory {out_path_common_labels_nerel} already exists')


out_path_common_labels_nerel_bio = 'datasets/outputs/nerel_bio_common_labels'
if not os.path.exists(out_path_common_labels_nerel_bio):
    os.makedirs(out_path_common_labels_nerel_bio)
    dir_to_jsonlines(nerel_bio_train, f'{out_path_common_labels_nerel_bio}/train.jsonl', filter=common_labels)
    dir_to_jsonlines(nerel_bio_test, f'{out_path_common_labels_nerel_bio}/test.jsonl', filter=common_labels)
    dir_to_jsonlines(nerel_bio_dev, f'{out_path_common_labels_nerel_bio}/dev.jsonl', filter=common_labels)
else:
    print(f'Directory {out_path_common_labels_nerel_bio} already exists')

Common labels btwn NEREL and NEREL-BIO: ['AGE', 'CITY', 'COUNTRY', 'DATE', 'DISEASE', 'FACILITY', 'LOCATION', 'NUMBER', 'ORDINAL', 'ORGANIZATION', 'PERCENT', 'PERSON', 'PRODUCT', 'PROFESSION', 'STATE_OR_PROVINCE', 'TIME']
Directory datasets/outputs/nerel_common_labels already exists
Directory datasets/outputs/nerel_bio_common_labels already exists


### Misc

In [None]:
import pandas as pd

# Column data for the three label inventories: one 'Category' column each.
categories_nerel = {'Category': list(filtered_nerel_labels)}
categories_nerel_bio = {'Category': list(filtered_nerel_bio_labels)}
categories_common = {'Category': common_labels}

# One single-column DataFrame per inventory.
df_categories_nerel = pd.DataFrame(categories_nerel)
df_categories_nerel_bio = pd.DataFrame(categories_nerel_bio)
df_categories_common = pd.DataFrame(categories_common)

In [None]:
# Display the table of labels kept for NEREL (built from filtered_nerel_labels).
df_categories_nerel

Unnamed: 0,Category
0,WORK_OF_ART
1,RELIGION
2,PERSON
3,NUMBER
4,PRODUCT
5,PROFESSION
6,MONEY
7,STATE_OR_PROVINCE
8,EVENT
9,DISEASE


In [None]:
# Display the table of labels kept for NEREL-BIO (built from filtered_nerel_bio_labels).
df_categories_nerel_bio

Unnamed: 0,Category
0,DEVICE
1,NUMBER
2,PERSON
3,PRODUCT
4,PROFESSION
5,LABPROC
6,MENTALPROC
7,FINDING
8,STATE_OR_PROVINCE
9,ANATOMY


In [None]:
# Display the table of labels shared by both corpora (built from common_labels).
df_categories_common

Unnamed: 0,Category
0,AGE
1,CITY
2,COUNTRY
3,DATE
4,DISEASE
5,FACILITY
6,LOCATION
7,NUMBER
8,ORDINAL
9,ORGANIZATION


In [None]:
# NOTE: the commented-out code below was marked WRONG by its author; review it before re-enabling.
# def filter_keys(dict_, lst):
#     new_dict = {}
#     for key in dict_:
#         if key in lst:
#             new_dict[key] = dict_[key]
#     return new_dict

# nerel_labels_test = get_all_labels(nerel_test, count=True)
# nerel_bio_labels_test = get_all_labels(nerel_bio_test, count=True)

# nerel_bio_labels_test['DISEASE'] = nerel_bio_labels_test['DISO']
# del nerel_bio_labels_test['DISO']

# test_counts_nerel = filter_keys(nerel_labels_test, common_labels)
# test_counts_nerel_bio = filter_keys(nerel_bio_labels_test, common_labels)

# print(dict(sorted(test_counts_nerel.items())))
# print(dict(sorted(test_counts_nerel_bio.items())))

{'AGE': 138, 'CITY': 239, 'COUNTRY': 456, 'DATE': 523, 'DISEASE': 57, 'FACILITY': 64, 'LOCATION': 61, 'NUMBER': 230, 'ORDINAL': 107, 'ORGANIZATION': 675, 'PERCENT': 7, 'PERSON': 961, 'PRODUCT': 53, 'PROFESSION': 855, 'STATE_OR_PROVINCE': 112, 'TIME': 47}
{'AGE': 64, 'CITY': 7, 'COUNTRY': 137, 'DATE': 201, 'DISEASE': 932, 'FACILITY': 37, 'LOCATION': 39, 'NUMBER': 529, 'ORDINAL': 89, 'ORGANIZATION': 108, 'PERCENT': 256, 'PERSON': 745, 'PRODUCT': 10, 'PROFESSION': 42, 'STATE_OR_PROVINCE': 12, 'TIME': 6}


In [None]:
# path_text = '/content/drive/MyDrive/NEREL/NEREL-v1.1/test/113534_text.txt'
# path_ann = '/content/drive/MyDrive/NEREL/NEREL-v1.1/test/113534_text.ann'

# path_text = '/content/drive/MyDrive/NEREL/NEREL-v1.1/test/195702_text.txt'
# path_ann = '/content/drive/MyDrive/NEREL/NEREL-v1.1/test/195702_text.ann'

# path_text = '/content/drive/MyDrive/NEREL/NEREL-v1.1/train/133570_text.txt'
# path_ann = '/content/drive/MyDrive/NEREL/NEREL-v1.1/train/133570_text.ann'

path_text = '/content/drive/MyDrive/NEREL/nerel-bio-v1.0/test/22271960_ru.txt'
path_ann = '/content/drive/MyDrive/NEREL/nerel-bio-v1.0/test/22271960_ru.ann'


# file_to_json and get_sentence_anns take the label filter as a required
# argument; None keeps every label, so the raw annotations can be inspected.
json = file_to_json(path_text, None)
tokens = json['tokens']
# toks_nums = []
# for i in range(len(tokens)):
#   toks_nums.append(tokens[i] + ';' + str(i))
# print(toks_nums)
text = get_text(path_text)
anns = get_sentence_anns(path_text, None)

# Print each sentence as 'token;index' pairs followed by its mentions, so the
# sentence-relative start/end indices can be checked by eye.
for ann in anns:
    print([word + ';' + str(j) for j, word in enumerate(ann['tokens'])])
    print(ann['entity_mentions'])
    print('-----------------------------------------------------------------------------------------')
# valid_labeling(path_text, None, debug=True)

['Резюме;0', 'Цель;1', ':;2', 'Определить;3', ',;4', 'в;5', 'какой;6', 'степени;7', 'грипп;8', 'способствует;9', 'заражению;10', 'лиц;11', 'всех;12', 'возрастов;13', 'в;14', 'Бангладеш;15', 'тяжелой;16', 'острой;17', 'респираторной;18', 'инфекцией;19', '(;20', 'ТОРИ;21', ');22', ',;23', 'которая;24', 'является;25', 'основной;26', 'причиной;27', 'детской;28', 'смертности;29', '.;30']
[{'entity_type': 'DISO', 'start': 17, 'end': 20, 'text': 'острой респираторной инфекцией'}, {'entity_type': 'DISO', 'start': 18, 'end': 20, 'text': 'респираторной инфекцией'}, {'entity_type': 'DISO', 'start': 21, 'end': 22, 'text': 'ТОРИ'}, {'entity_type': 'PERSON', 'start': 11, 'end': 12, 'text': 'лиц'}, {'entity_type': 'DISO', 'start': 16, 'end': 20, 'text': 'тяжелой острой респираторной инфекцией'}, {'entity_type': 'PHYS', 'start': 28, 'end': 30, 'text': 'детской смертности'}, {'entity_type': 'DISO', 'start': 8, 'end': 9, 'text': 'грипп'}, {'entity_type': 'COUNTRY', 'start': 15, 'end': 16, 'text': 'Бангл