Converting the DDI2013 XML format to TSV
(for SpERT)

In [None]:
import csv
import logging
import os
import re
import bioc
from lxml import etree

In [None]:
def create_ddi_bert(gold_directory, output):
    """Convert the DDI2013 XML files under ``gold_directory`` into one TSV file.

    Every file found while walking ``gold_directory`` is parsed as XML. For
    each ``/document/sentence`` element, its ``entity`` children are indexed
    by id, and one TSV row is written for every ``pair`` child whose ``ddi``
    attribute is not ``'false'``. Pairs marked ``'false'`` are only counted.

    Args:
        gold_directory: Root directory that is walked recursively; every file
            in it is handed to the XML parser.
        output: Path of the tab-separated file to create (overwritten if it
            already exists).

    Side effects:
        Writes ``output``, logs a warning for every malformed entity or pair,
        and prints the number of positive relations seen, negative pairs
        skipped and rows written.
    """
    relation_count = 0   # positive pairs encountered
    written_count = 0    # rows actually written
    false_count = 0      # negative pairs skipped

    # newline='' lets the csv writer control line endings itself; the with
    # block guarantees the file is flushed and closed even on a parse error.
    with open(output, 'w', newline='') as fp:
        writer = csv.writer(fp, delimiter='\t', lineterminator='\n')
        writer.writerow(['index', 'sentence', 'entity-1', 'entity-1-type', 'entity-2', 'entity-2-type', 'label' ])

        for root, dirs, files in os.walk(gold_directory):
            # os.walk yields entries in filesystem order; sorting makes the
            # row order of the output reproducible across machines.
            dirs.sort()
            for name in sorted(files):
                pathname = os.path.join(root, name)
                tree = etree.parse(pathname)

                for stag in tree.xpath('/document/sentence'):
                    sentence_text = stag.get('text')

                    # Index the sentence's entities by their id.
                    entities = {}
                    for etag in stag.xpath('entity'):
                        entity_id = etag.get('id')
                        char_offset = etag.get('charOffset')
                        # Only the leading "start-end" span is used; a missing
                        # attribute is treated like a non-matching one.
                        m = re.match(r'(\d+)-(\d+)', char_offset or '')
                        if m is None:
                            logging.warning('{}:{}: charOffset does not match. {}'.format(
                                output, entity_id, char_offset))
                            continue
                        start = int(m.group(1))
                        end = int(m.group(2)) + 1  # charOffset end is inclusive
                        expected_text = etag.get('text')
                        actual_text = sentence_text[start:end]
                        if expected_text != actual_text:
                            logging.warning('{}:{}: Text does not match. Expected {}. Actual {}'.format(
                                output, entity_id, repr(expected_text), repr(actual_text)))
                        entities[entity_id] = {
                            'start': start,
                            'end': end,
                            'type': etag.get('type'),
                            'id': entity_id,
                            'text': actual_text
                        }

                    # Emit one row per positive pair.
                    for rtag in stag.xpath('pair'):
                        if rtag.get('ddi') == 'false':
                            false_count += 1
                            continue
                        label = 'DDI-{}'.format(rtag.get('type'))
                        relation_count += 1

                        e1 = entities.get(rtag.get('e1'))
                        e2 = entities.get(rtag.get('e2'))
                        if e1 is None or e2 is None:
                            # The entity was skipped above (or never declared);
                            # skip the pair instead of crashing on None.
                            logging.warning('{}:{}: pair references an unknown entity. Skipped.'.format(
                                output, rtag.get('id')))
                            continue

                        writer.writerow([
                            str(rtag.get('id')),
                            sentence_text,
                            e1['text'],
                            e1['type'],
                            e2['text'],
                            e2['type'],
                            label,
                        ])
                        written_count += 1

    print(f'Have {relation_count} relations')
    print(f'Have {false_count} false instances')
    print(f'Written {written_count} instances')



In [None]:
# Directory that create_ddi_bert walks for XML files, and the TSV file it writes.
gold_directory = './original/Test'
output = './test_added.tsv'

In [None]:
# Run the XML -> TSV conversion using gold_directory and output.
create_ddi_bert(gold_directory, output)

Splitting train and dev

In [None]:
# pandas is not imported anywhere else in this notebook; without this line the
# cell fails with NameError on a fresh kernel.
import pandas as pd

# NOTE(review): no cell in this notebook writes this path (the conversion cell
# only produces './test_added.tsv') -- confirm where train_added.tsv comes from.
df = pd.read_csv('Datasets/ddi2013-type/train_added.tsv',  sep='\t')

# Shuffle all rows with a fixed seed, then cut 80% / 20% into train / dev.
df_new = df.sample(frac=1, random_state=42)
split = int(0.8 * len(df_new))

train_df = df_new[:split]
valid_df = df_new[split:]

# The DataFrame index is written as a leading unnamed column on purpose: the
# train/dev TSV -> JSON cells read columns by position with that offset.
train_df.to_csv('./train_train_added.tsv', sep='\t')
valid_df.to_csv('./train_dev_added.tsv', sep='\t')

Converting tsv to json

In [None]:
import spacy 
# Lazily-loaded spaCy pipeline shared by every tokenize_json call.
_SPACY_NLP = None


def tokenize_json(text):
    """Tokenize ``text`` with spaCy's ``en_core_web_sm`` pipeline.

    The pipeline is loaded on the first call only and reused afterwards;
    loading it on every call (as before) dominates the runtime because this
    function is invoked several times per dataset row.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the string form of every token, in document order.
    """
    global _SPACY_NLP
    if _SPACY_NLP is None:
        _SPACY_NLP = spacy.load("en_core_web_sm")
    return [str(token) for token in _SPACY_NLP(text)]

In [None]:
import re
def _find_entity_span(doc, entity, entity_tok):
    """Return the ``(start, end)`` token span of ``entity`` in ``doc``, or ``None``.

    A token matches when it equals the whole entity string or the entity's
    first token; the span then covers ``len(entity_tok)`` tokens. If no token
    matches that way, any token that starts with the entity string matches
    and the span covers that single token. In both passes the LAST matching
    token wins.
    """
    first_tok = entity_tok[0] if entity_tok else None

    span = None
    for pos, tok in enumerate(doc):
        if tok == entity or tok == first_tok:
            span = (pos, pos + len(entity_tok))
    if span is not None:
        return span

    # Fallback: literal prefix match. The entity text must not be used as a
    # regular expression -- names containing '(', '+', '.' etc. would be
    # misread as pattern syntax or raise re.error.
    for pos, tok in enumerate(doc):
        if tok.startswith(entity):
            span = (pos, pos + 1)
    return span


def create_entities(text, e1, e1_type, e2 , e2_type, index):
    """Build the SpERT ``entities`` entry for one sentence / entity pair.

    The sentence and both entity strings are tokenized with ``tokenize_json``
    and each entity is located in the sentence's token list by token
    comparison (see ``_find_entity_span``).

    Args:
        text: The sentence containing both entities.
        e1: Surface text of the first entity.
        e1_type: Type label stored for the first entity.
        e2: Surface text of the second entity.
        e2_type: Type label stored for the second entity.
        index: Identifier of the row; only used in error messages.

    Returns:
        ``{"entities": [e1_dict, e2_dict]}`` where each dict has the keys
        ``'type'``, ``'start'`` and ``'end'`` (token indices, end exclusive).

    Raises:
        ValueError: If an entity cannot be located in the tokenized sentence.

    NOTE(review): when an entity occurs several times in the sentence the last
    occurrence is used, and ``end`` is not clamped to the sentence length --
    confirm this is acceptable for the downstream consumer.
    """
    # Token counts are taken from the entity strings as given, i.e. BEFORE the
    # special-case rewrites below.
    e1_tok = tokenize_json(e1)
    e2_tok = tokenize_json(e2)

    # Special cases (originally added for the train set). The rewritten string
    # is presumably the exact token the tokenizer produces around the entity
    # in the affected sentences -- TODO confirm. This is a single elif chain:
    # at most one rewrite is applied per call, and it applies to every dataset
    # passed through this function.
    if e2 == 'InsP(3)':
        e2 = 'InsP(3)-induced'
    elif e1 == 'amphetamine':
        e1 = 'ephedrine/amphetamine'
    elif e2 == 'halothane':
        e2 = 'enflurane;isoflurane;halothane;certain'
    elif e2 == 'lithium':
        e2 = 'polymyxins;lithium;magnesium'
    elif e2 == 'procainamide':
        e2 = 'salts;procainamide;and'
    elif e2 == 'gentamicin':
        e2 = 'MDP)-gentamicin'
    elif e2 == 'isoflurane':
        e2 = 'enflurane;isoflurane;halothane;certain'
    elif e2 == 'magnesium':
        e2 = 'polymyxins;lithium;magnesium'

    doc = tokenize_json(text)

    span_1 = _find_entity_span(doc, e1, e1_tok)
    if span_1 is None:
        raise ValueError(
            f'Could not locate entity 1 {e1!r} in the sentence of row {index!r}')

    span_2 = _find_entity_span(doc, e2, e2_tok)
    if span_2 is None:
        raise ValueError(
            f'Could not locate entity 2 {e2!r} in the sentence of row {index!r}')

    e1_d = {'type': e1_type, 'start': span_1[0], 'end': span_1[1]}
    e2_d = {'type': e2_type, 'start': span_2[0], 'end': span_2[1]}

    return {"entities": [e1_d, e2_d]}


In [None]:
def create_relations(label):
    """Wrap ``label`` in the single-relation structure used for a sentence.

    The relation always links entity 0 (head) to entity 1 (tail); every
    DDI2013 relation is treated as symmetric, so the direction carries no
    meaning.

    Returns:
        ``{'relations': [{'type': label, 'head': 0, 'tail': 1}]}``
    """
    single_relation = {"type": label, "head": 0, "tail": 1}
    return {'relations': [single_relation]}

In [None]:
#### FOR TRAIN
import csv
import json

# Column layout of the input file (the first column is the index pandas wrote):
##title = ['Unnamed: 0', 'index', 'sentence', 'entity-1', 'entity-1-type', 'entity-2', 'entity-2-type', 'label']

entire_content = []
cnt = 0

with open('./train_train_added.tsv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    next(reader, None)  # the first line is the header, not data

    for row in reader:
        # One SpERT document per row: tokens, the two entity spans, the
        # relation between them and the original pair id.
        final_row = {
            'tokens': tokenize_json(row[2]),
            'entities': create_entities(row[2], row[3], row[4], row[5], row[6], row[1])["entities"],
            'relations': create_relations(row[7])['relations'],
            'orig_id': row[1],
        }
        entire_content.append(final_row)

        cnt += 1
        print(f'Have {cnt} relations till now')

print(f'Have {cnt} total relations')

with open('./train_ddi_json.json', 'w') as f:
    json.dump(entire_content, f)



In [None]:
#### FOR TEST
import csv
import json

cnt = 0
entire_content = []
##title = ['index', 'sentence', 'entity-1', 'entity-1-type', 'entity-2', 'entity-2-type', 'label']

# Pair ids that are excluded from the test set.
# NOTE(review): the reason is not recorded in this notebook -- presumably
# create_entities cannot locate the entities of these rows; confirm.
SKIPPED_TEST_PAIR_IDS = {
    'DDI-MedLine.d161.s5.p5',
    'DDI-MedLine.d161.s6.p0',
    'DDI-MedLine.d161.s8.p2',
}

with open('./test_added.tsv', newline='') as csvfile:
    # Columns are read by header name. The test file is written directly by
    # create_ddi_bert and has NO leading pandas index column, so the positional
    # indices used for the train/dev files are off by one here (the label
    # would be row[7], which does not exist).
    reader = csv.DictReader(csvfile, delimiter='\t')
    for row in reader:
        pair_id = row['index']
        if pair_id in SKIPPED_TEST_PAIR_IDS:
            continue

        sentence = row['sentence']
        final_row = {}

        ### Tokens in the dictionary
        final_row['tokens'] = tokenize_json(sentence)

        ### Entities in the dictionary
        entities = create_entities(sentence, row['entity-1'], row['entity-1-type'],
                                   row['entity-2'], row['entity-2-type'], pair_id)
        final_row['entities'] = entities["entities"]

        relations = create_relations(row['label'])
        final_row['relations'] = relations['relations']

        final_row['orig_id'] = pair_id

        entire_content.append(final_row)
        cnt += 1
        print(f'Have {cnt} relations till now')


print(f'Have {cnt} total relations')

with open('./test_ddi_json.json', 'w') as f:
    json.dump(entire_content, f)


In [None]:
#### FOR DEV
import csv
import json

cnt = 0
entire_content = []
##title = ['Unnamed: 0','index', 'sentence', 'entity-1', 'entity-1-type', 'entity-2', 'entity-2-type', 'label']

# The split cell writes './train_dev_added.tsv' (all lower-case). The name must
# match exactly, otherwise this fails on case-sensitive filesystems.
with open('./train_dev_added.tsv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    next(reader, None)  # skip the header line

    for row in reader:
        final_row = {}

        ### Tokens in the dictionary
        final_row['tokens'] = tokenize_json(row[2])

        ### Entities in the dictionary
        entities = create_entities( row[2], row[3], row[4], row[5] , row[6], row[1])
        final_row['entities'] = entities["entities"]

        relations = create_relations(row[7])
        final_row['relations'] = relations['relations']

        final_row['orig_id'] = row[1]

        entire_content.append(final_row)
        cnt += 1
        print(f'Have {cnt} relations till now')


print(f'Have {cnt} total relations')

with open('./dev_ddi_json.json', 'w') as f:
    json.dump(entire_content, f)


Creating types.json

In [None]:
def elaborate_entity(entity):
    """Return the types-file entry for an entity type.

    Both the ``'short'`` and the ``'verbose'`` name are the entity type
    string itself.
    """
    return {'short': entity, 'verbose': entity}

In [None]:
def elaborate_relation(relation):
    """Return the types-file entry for a relation label.

    The ``'short'`` name is the label without its first four characters (the
    ``'DDI-'`` prefix of the labels used in this notebook), the ``'verbose'``
    name is the full label, and every relation is marked symmetric.
    """
    short_name = relation[4:]
    return {'short': short_name, 'verbose': relation, 'symmetric': True}

In [None]:
import json

# Entity and relation labels that go into the types file.
entities_list = ['drug', 'group', 'brand', 'drug_n']
relations_list = ['DDI-mechanism', 'DDI-effect', 'DDI-advise', 'DDI-int']

# Map every label to its elaborated entry, keeping the list order.
entities = {entity_name: elaborate_entity(entity_name) for entity_name in entities_list}
relations = {relation_name: elaborate_relation(relation_name) for relation_name in relations_list}

types = {'entities': entities, 'relations': relations}

with open('spert-master/DDI2013/ddi_types.json', 'w') as f:
    json.dump(types, f)
