In [1]:
import json
import csv
import pandas
import pandas as pd

# original json files
# Opened as UTF-8 explicitly so decoding does not depend on the platform's
# default locale encoding (the CSV output of this notebook is UTF-8 as well).
f = open('./Dataset/train_refind_official.json', encoding='utf-8') # training set
fd = open('./Dataset/dev_refind_official.json', encoding='utf-8') # dev set
ft = open('./Dataset/test_refind_official.json', encoding='utf-8') # test set


# Integer id -> relation name. The position of a name in the list below is its
# integer id (0 = 'no_relation', ..., 21 = 'pers:univ:member_of').
# The extra '' -> '' entry maps an empty label onto itself.
int_to_label = {'': ''}
int_to_label.update(enumerate([
    'no_relation',
    'org:date:formed_on',
    'org:gpe:operations_in',
    'pers:org:member_of',
    'pers:org:employee_of',
    'pers:gov_agy:member_of',
    'org:org:acquired_by',
    'org:money:loss_of',
    'org:gpe:headquartered_in',
    'pers:univ:employee_of',
    'org:date:acquired_on',
    'pers:univ:attended',
    'org:gpe:formed_in',
    'org:money:profit_of',
    'org:money:cost_of',
    'org:org:subsidiary_of',
    'org:org:shares_of',
    'pers:org:founder_of',
    'pers:title:title',
    'org:money:revenue_of',
    'org:org:agreement_with',
    'pers:univ:member_of',
]))

# Inverse lookup: relation name -> integer id.
label_mapping = dict(zip(int_to_label.values(), int_to_label.keys()))

# Parse the three splits. json.load consumes the whole stream, so the file
# handles are not needed afterwards; close them even if parsing fails so they
# are not leaked for the lifetime of the kernel.
try:
    raw_data = json.load(f)    # training set
    raw_datad = json.load(fd)  # dev set
    raw_datat = json.load(ft)  # test set
finally:
    f.close()
    fd.close()
    ft.close()

# Field types of each record, as handled by data_preprocessor below:
#   str:             rel_group, e1_type, e2_type
#   list of str:     token, spacy_pos, spacy_ner, spacy_deprel, sdp
#   numbers:         e1_start, e1_end, e2_start, e2_end
#   list of numbers: spacy_head, sdp_tok_idx

def _as_tag_list(value):
    """Return ``value`` as a list, decoding a stringified list such as "['A', 'B']"."""
    if isinstance(value, str):
        # Drop the outer "['" and "']" and split on the "', '" separators.
        return value[2:-2].split("', '")
    return value


def _render_column(record, c):
    """Render the feature column ``c`` of ``record`` as one text segment."""
    if c in ('spacy_pos', 'spacy_ner', 'spacy_deprel', 'sdp'):
        # lists of strings -> space separated
        return ' '.join(record[c])
    if c in ('rel_group', 'e1_type', 'e2_type'):
        # plain strings are used as-is
        return record[c]
    if c in ('spacy_head', 'sdp_tok_idx'):
        # lists of numbers -> comma separated
        return ', '.join(str(i) for i in record[c])
    if c in ('e1_start', 'e1_end', 'e2_start', 'e2_end'):
        # single numbers: emit the record's value, not the column name
        return str(record[c])
    raise ValueError(f'Wrong column name: {c}.')


def data_preprocessor(cols, raw_data, repNER=False):
    """Flatten raw records into ``{'mergedText', 'label'}`` rows.

    For each record the tokens are joined with spaces, followed by one segment
    per entry of ``cols``; segments are separated by ' [SEP] [CLS] ' and the
    text ends with ' [SEP]'.

    Args:
        cols: names of the extra record fields to append, in order. Supported:
            'spacy_pos', 'spacy_ner', 'spacy_deprel', 'sdp', 'rel_group',
            'e1_type', 'e2_type', 'spacy_head', 'sdp_tok_idx', 'e1_start',
            'e1_end', 'e2_start', 'e2_end'.
        raw_data: iterable of record dicts. Every record must contain
            'relation', 'token', 'spacy_ner' and 'spacy_pos'. The records are
            not modified.
        repNER: when True, every token whose 'spacy_ner' tag is not 'O' is
            replaced by '[<tag>]'.

    Returns:
        ``(rows, label_count)``: ``rows`` is a list of dicts with keys
        'mergedText' and 'label' (the relation looked up in the module-level
        ``label_mapping``); ``label_count`` maps each relation name to the
        number of records carrying it.

    Raises:
        ValueError: if ``cols`` contains an unsupported column name.
        KeyError: if a record's relation is missing from ``label_mapping``.
    """
    label_count = {}
    train_data = []

    for item in raw_data:
        # Work on a shallow copy so normalising fields never alters the caller's data.
        record = dict(item)

        # to deal with formatting issues in private test set, where certain lists are stored as strings
        for c in ('spacy_ner', 'spacy_pos'):
            record[c] = _as_tag_list(record[c])

        label_name = record['relation']
        label_count[label_name] = label_count.get(label_name, 0) + 1

        # process token first
        tokens = list(record['token'])
        if repNER:
            # replace words with the NER token in the original dataset 'spacy_ner' column
            for idx, tag in enumerate(record['spacy_ner']):
                if tag != 'O':
                    tokens[idx] = '[' + tag + ']'

        segments = [' '.join(tokens)]
        segments.extend(_render_column(record, c) for c in cols)
        merged_text = ' [SEP] [CLS] '.join(segments) + ' [SEP]'

        train_data.append({'mergedText': merged_text, 'label': label_mapping[label_name]})
    return train_data, label_count

# Run configuration.
# Extra record fields appended after the token text; the alternative setting
# that was tried is ['spacy_ner', 'spacy_pos'].
repNER = True
cols = ['spacy_pos']

# Tag embedded in the output file names (presumably encodes the settings above — confirm).
version = 'TPrN'

print(f"preprocess_data-{version}")


preprocess_data-TPrN


In [2]:
# Export the training split to CSV.
data = raw_data
OUT_FILENAME = 'data_train-'+version+'.csv'

merged_data, label_count = data_preprocessor(cols, data, repNER=repNER)

# newline='' leaves line-ending handling to the csv module; the with-block
# closes the file on exit, so no explicit close() is needed.
with open(OUT_FILENAME, 'w', encoding='UTF8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['mergedText', 'label'])
    writer.writeheader()
    writer.writerows(merged_data)

print(OUT_FILENAME)

data_train-TPrN.csv


In [3]:
# Export the dev split to CSV.
data = raw_datad
OUT_FILENAME = 'data_dev-'+version+'.csv'

merged_data, label_count = data_preprocessor(cols, data, repNER=repNER)

# newline='' leaves line-ending handling to the csv module; the with-block
# closes the file on exit, so no explicit close() is needed.
with open(OUT_FILENAME, 'w', encoding='UTF8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['mergedText', 'label'])
    writer.writeheader()
    writer.writerows(merged_data)

print(OUT_FILENAME)

data_dev-TPrN.csv


In [4]:
# Export the test split to CSV.
data = raw_datat
OUT_FILENAME = 'data_test-'+version+'.csv'

merged_data, label_count = data_preprocessor(cols, data, repNER=repNER)

# newline='' leaves line-ending handling to the csv module; the with-block
# closes the file on exit, so no explicit close() is needed.
with open(OUT_FILENAME, 'w', encoding='UTF8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['mergedText', 'label'])
    writer.writeheader()
    writer.writerows(merged_data)

print(OUT_FILENAME)

data_test-TPrN.csv
