## SciERC to BIO schema

In [24]:
import pandas as pd
import itertools

In [36]:
data_train = pd.read_json("../../../../Downloads/processed_data/json/train.json", lines=True)
data_test = pd.read_json("../../../../Downloads/processed_data/json/test.json", lines=True)
dev = pd.read_json("../../../../Downloads/processed_data/json/dev.json", lines=True)

In [37]:
data_train.shape, data_test.shape, dev.shape

((350, 5), (100, 5), (50, 5))

In [19]:
data.head()

Unnamed: 0,clusters,sentences,ner,relations,doc_key
0,"[[[17, 20], [23, 23]]]","[[English, is, shown, to, be, trans-context-fr...","[[[0, 0, Material], [10, 10, OtherScientificTe...","[[], [[29, 29, 31, 32, CONJUNCTION], [48, 49, ...",J87-1003
1,"[[[62, 64], [90, 91], [96, 98], [112, 114]], [...","[[In, this, paper, ,, a, novel, method, to, le...","[[[6, 6, Method], [10, 12, OtherScientificTerm...","[[[6, 6, 10, 12, USED-FOR], [10, 12, 14, 16, U...",CVPR_2003_18_abs
2,"[[[154, 154], [214, 214]], [[40, 44], [85, 85]...","[[In, this, paper, ,, we, present, a, digital,...","[[[7, 13, Method], [15, 21, Method], [23, 25, ...","[[[7, 13, 15, 21, USED-FOR], [15, 21, 23, 25, ...",INTERSPEECH_2013_31_abs
3,"[[[3, 3], [110, 110]]]","[[We, propose, a, method, that, automatically,...","[[[3, 3, Generic], [7, 7, OtherScientificTerm]...","[[[3, 3, 7, 7, USED-FOR], [7, 7, 20, 23, USED-...",I05-5008
4,"[[[35, 35], [69, 69]]]","[[Graph, unification, remains, the, most, expe...","[[[0, 1, Task], [8, 10, Task]], [[16, 17, Meth...","[[[0, 1, 8, 10, PART-OF]], [[16, 17, 22, 23, P...",C92-2068


In [23]:
data.head().sentences[0]

[['English',
  'is',
  'shown',
  'to',
  'be',
  'trans-context-free',
  'on',
  'the',
  'basis',
  'of',
  'coordinations',
  'of',
  'the',
  'respectively',
  'type',
  'that',
  'involve',
  'strictly',
  'syntactic',
  'cross-serial',
  'agreement',
  '.'],
 ['The',
  'agreement',
  'in',
  'question',
  'involves',
  'number',
  'in',
  'nouns',
  'and',
  'reflexive',
  'pronouns',
  'and',
  'is',
  'syntactic',
  'rather',
  'than',
  'semantic',
  'in',
  'nature',
  'because',
  'grammatical',
  'number',
  'in',
  'English',
  ',',
  'like',
  'grammatical',
  'gender',
  'in',
  'languages',
  'such',
  'as',
  'French',
  ',',
  'is',
  'partly',
  'arbitrary',
  '.'],
 ['The',
  'formal',
  'proof',
  ',',
  'which',
  'makes',
  'crucial',
  'use',
  'of',
  'the',
  'Interchange',
  'Lemma',
  'of',
  'Ogden',
  'et',
  'al.',
  ',',
  'is',
  'so',
  'constructed',
  'as',
  'to',
  'be',
  'valid',
  'even',
  'if',
  'English',
  'is',
  'presumed',
  'to',
  'con

In [18]:
' '.join(data.head().sentences[0][0]), ' '.join(data.head().sentences[0][1])

('English is shown to be trans-context-free on the basis of coordinations of the respectively type that involve strictly syntactic cross-serial agreement .',
 'The agreement in question involves number in nouns and reflexive pronouns and is syntactic rather than semantic in nature because grammatical number in English , like grammatical gender in languages such as French , is partly arbitrary .')

In [15]:
data.head().ner[0]

[[[0, 0, 'Material'],
  [10, 10, 'OtherScientificTerm'],
  [17, 20, 'OtherScientificTerm']],
 [[23, 23, 'Generic'],
  [29, 29, 'OtherScientificTerm'],
  [31, 32, 'OtherScientificTerm'],
  [42, 43, 'OtherScientificTerm'],
  [45, 45, 'Material'],
  [48, 49, 'OtherScientificTerm'],
  [51, 51, 'Material'],
  [54, 54, 'Material']],
 [[70, 71, 'Method'], [86, 86, 'Material']]]

#### Data Preparation

* new column: 'abstract_full' with joined sentences
* new column: 'ner_annotation_full' with joined annotations

In [38]:
def features_full(data):
    
    data['abstract_full'] = data['sentences'].apply(lambda x: list(itertools.chain.from_iterable(x)))
    data['ner_annotation_full'] = data['ner'].apply(lambda x: list(itertools.chain.from_iterable(x)))
    
    return data

In [40]:
data_train = features_full(data_train)
data_test = features_full(data_test)
dev = features_full(dev)

#### Pseudocode
Initialize a new sequence for each entity type (e.g., Task, Method, Material) and set the initial state to "O" (outside entity).

Iterate over each token in the text and check if it corresponds to the start or continuation of an entity.

If a token corresponds to the start of an entity, mark it with the "B-" prefix, followed by the entity type. For example, "B-Task" for the start of a Task entity.

For subsequent tokens within the same entity, mark them with the "I-" prefix, followed by the entity type. For example, "I-Task" for the continuation of a Task entity.

Tokens that are not part of any entity should be labeled as "O" (outside entity).

In [142]:
def transform_to_BIO(text, annotations):
    bio_labels = ["O"] * len(text)
    
    for start, end, entity_type in annotations:
        bio_labels[start] = "B-" + entity_type.lower()
        for i in range(start+1, end):
            bio_labels[i] = "I-" + entity_type.lower()
    
    return bio_labels

In [148]:
def generateBIOlabels(data):
    
    data['ner_BIO_full'] = data.apply(lambda x: transform_to_BIO(x['abstract_full'], x['ner_annotation_full']), axis=1)
    
    return data

In [151]:
data_train = generateBIOlabels(data_train)
data_test = generateBIOlabels(data_test)
dev = generateBIOlabels(dev)

In [155]:
data_train

Unnamed: 0,clusters,sentences,ner,relations,doc_key,abstract_full,ner_annotation_full,ner_BIO_full
0,"[[[17, 20], [23, 23]]]","[[English, is, shown, to, be, trans-context-fr...","[[[0, 0, Material], [10, 10, OtherScientificTe...","[[], [[29, 29, 31, 32, CONJUNCTION], [48, 49, ...",J87-1003,"[English, is, shown, to, be, trans-context-fre...","[[0, 0, Material], [10, 10, OtherScientificTer...","[B-material, O, O, O, O, O, O, O, O, O, B-othe..."
1,"[[[62, 64], [90, 91], [96, 98], [112, 114]], [...","[[In, this, paper, ,, a, novel, method, to, le...","[[[6, 6, Method], [10, 12, OtherScientificTerm...","[[[6, 6, 10, 12, USED-FOR], [10, 12, 14, 16, U...",CVPR_2003_18_abs,"[In, this, paper, ,, a, novel, method, to, lea...","[[6, 6, Method], [10, 12, OtherScientificTerm]...","[O, O, O, O, O, O, B-method, O, O, O, B-others..."
2,"[[[154, 154], [214, 214]], [[40, 44], [85, 85]...","[[In, this, paper, ,, we, present, a, digital,...","[[[7, 13, Method], [15, 21, Method], [23, 25, ...","[[[7, 13, 15, 21, USED-FOR], [15, 21, 23, 25, ...",INTERSPEECH_2013_31_abs,"[In, this, paper, ,, we, present, a, digital, ...","[[7, 13, Method], [15, 21, Method], [23, 25, T...","[O, O, O, O, O, O, O, B-method, I-method, I-me..."
3,"[[[3, 3], [110, 110]]]","[[We, propose, a, method, that, automatically,...","[[[3, 3, Generic], [7, 7, OtherScientificTerm]...","[[[3, 3, 7, 7, USED-FOR], [7, 7, 20, 23, USED-...",I05-5008,"[We, propose, a, method, that, automatically, ...","[[3, 3, Generic], [7, 7, OtherScientificTerm],...","[O, O, O, B-generic, O, O, O, B-otherscientifi..."
4,"[[[35, 35], [69, 69]]]","[[Graph, unification, remains, the, most, expe...","[[[0, 1, Task], [8, 10, Task]], [[16, 17, Meth...","[[[0, 1, 8, 10, PART-OF]], [[16, 17, 22, 23, P...",C92-2068,"[Graph, unification, remains, the, most, expen...","[[0, 1, Task], [8, 10, Task], [16, 17, Method]...","[B-task, O, O, O, O, O, O, O, B-task, I-task, ..."
...,...,...,...,...,...,...,...,...
345,"[[[97, 99], [128, 129], [181, 182]], [[93, 93]...","[[Learning, video, representation, is, not, a,...","[[[0, 2, Task]], [], [], [[67, 67, OtherScient...","[[], [], [], [], [[70, 71, 78, 83, USED-FOR]],...",IJCAI_2016_423_abs,"[Learning, video, representation, is, not, a, ...","[[0, 2, Task], [67, 67, OtherScientificTerm], ...","[B-task, I-task, O, O, O, O, O, O, O, O, O, O,..."
346,"[[[36, 38], [64, 68], [70, 70], [159, 159]], [...","[[For, mobile, speech, application, ,, speaker...","[[[1, 3, Task], [5, 8, Metric], [10, 11, Metri...","[[[5, 8, 1, 3, FEATURE-OF], [5, 8, 10, 11, CON...",ICASSP_2016_14_abs,"[For, mobile, speech, application, ,, speaker,...","[[1, 3, Task], [5, 8, Metric], [10, 11, Metric...","[O, B-task, I-task, O, O, B-metric, I-metric, ..."
347,"[[[91, 91], [95, 98], [121, 121]], [[39, 41], ...","[[In, this, paper, ,, we, want, to, show, how,...","[[[10, 11, Method], [15, 25, Method], [39, 41,...","[[[10, 11, 15, 25, PART-OF], [39, 41, 44, 54, ...",A97-1027,"[In, this, paper, ,, we, want, to, show, how, ...","[[10, 11, Method], [15, 25, Method], [39, 41, ...","[O, O, O, O, O, O, O, O, O, O, B-method, O, O,..."
348,"[[[29, 29], [46, 46], [97, 97]], [[48, 48], [6...","[[CriterionSM, Online, Essay, Evaluation, Serv...","[[[0, 4, Task], [15, 17, OtherScientificTerm],...","[[[15, 17, 0, 4, PART-OF], [21, 22, 15, 17, HY...",N04-1024,"[CriterionSM, Online, Essay, Evaluation, Servi...","[[0, 4, Task], [15, 17, OtherScientificTerm], ...","[B-task, I-task, I-task, I-task, O, O, O, O, O..."


In [158]:
def join_tokens(data):
    
    data['abstract_full_text'] = data['abstract_full'].apply(lambda x: ' '.join(x))
    data['ner_BIO_full_text'] = data['ner_BIO_full'].apply(lambda x: ' '.join(x))
    
    return data

In [159]:
data_train = join_tokens(data_train)
data_test = join_tokens(data_test)
dev = join_tokens(dev)

In [163]:
# export data
data_train.to_csv('../data/SciERC/data_train.csv')
data_test.to_csv('../data/SciERC/data_test.csv')
dev.to_csv('../data/SciERC/dev.csv')