# In [1]:
import spacy
import random
from spacy.gold import biluo_tags_from_offsets

# Load the spaCy "en_core_web_sm" pipeline; it is called on every sentence
# further down to produce the tokens that get tagged.
nlp = spacy.load("en_core_web_sm")

# Input files, read line by line with "|" as the field separator.
data_filename_adverse_effect = "DRUG-AE.rel"
data_filename_dose = "DRUG-DOSE.rel"

# In [23]:
def _biluo_to_bio(biluo_tags):
    """Convert BILUO tags to BIO while keeping every tag's entity label.

    ``U-X`` becomes ``B-X`` and ``L-X`` becomes ``I-X``. ``B-X``, ``I-X``,
    ``O`` and any other marker (e.g. the ``-`` that spaCy is documented to
    emit for misaligned tokens) are passed through unchanged.
    """
    prefix_map = {"U": "B", "L": "I"}
    return [prefix_map.get(tag[:1], tag[:1]) + tag[1:] for tag in biluo_tags]


def _closing_token_index(biluo_tags, label):
    """Return the index of the token that ends the ``label`` entity.

    The closing token is the one tagged ``U-<label>`` (single-token entity)
    or ``L-<label>`` (last token of a multi-token entity). Returns ``None``
    when no such token exists, e.g. because the span could not be aligned
    to token boundaries. If several match, the last one is returned.
    """
    closing_tags = {"U-" + label, "L-" + label}
    found = None
    for index, tag in enumerate(biluo_tags):
        if tag in closing_tags:
            found = index
    return found


def _read_relation_file(path, entity_label, relation_label):
    """Turn one ``.rel`` file into a list of tagged sentences.

    Each non-blank line must hold eight ``|``-separated fields:
    id, sentence, entity text, entity begin, entity end, drug text,
    drug begin, drug end. The begin/end offsets are used only to compute
    the span length; the span start is located with ``str.find``, so when
    the text occurs more than once in the sentence the first occurrence is
    the one that gets tagged.

    Args:
        path: File to read.
        entity_label: Label for the non-drug entity (e.g. "Adverse_Effect").
        relation_label: Relation name stored on the drug's closing token.

    Returns:
        A list with one entry per sentence. Each entry is a list of
        5-tuples of strings: token index, token text, BIO tag,
        ``str`` of the relation list, ``str`` of the head-index list.
        The closing token of the drug carries ``[relation_label]`` with the
        index of the other entity's closing token as head; every other
        token carries ``['N']`` and points at itself.

    Raises:
        ValueError: If a non-blank line does not have exactly eight fields
            or an offset field is not an integer.
    """
    import warnings

    tagged_sentences = []
    with open(path) as f:
        for line_number, raw_line in enumerate(f, start=1):
            record = raw_line.strip()
            if not record:
                # Blank lines (e.g. a trailing newline) carry no annotation.
                continue

            (_record_id, text, entity_text, entity_begin, entity_end,
             drug_text, drug_begin, drug_end) = record.split("|")

            entity_start = text.find(entity_text)
            drug_start = text.find(drug_text)
            if entity_start < 0 or drug_start < 0:
                # A -1 from find() would silently yield a bogus span.
                warnings.warn(
                    "%s:%d: annotated text not found in sentence; "
                    "record skipped" % (path, line_number)
                )
                continue

            doc = nlp(text)
            spans = [
                (entity_start,
                 entity_start + int(entity_end) - int(entity_begin),
                 entity_label),
                (drug_start,
                 drug_start + int(drug_end) - int(drug_begin),
                 "Drug"),
            ]
            biluo_tags = biluo_tags_from_offsets(doc, spans)
            bio_tags = _biluo_to_bio(biluo_tags)

            # The relation lives on the last token of the drug and points at
            # the last token of the other entity.
            drug_index = _closing_token_index(biluo_tags, "Drug")
            target_index = _closing_token_index(biluo_tags, entity_label)

            rows = []
            for index, (token, tag) in enumerate(zip(doc, bio_tags)):
                if index == drug_index and target_index is not None:
                    relations, heads = [relation_label], [target_index]
                else:
                    relations, heads = ['N'], [index]
                rows.append(
                    (str(index), str(token), tag, str(relations), str(heads))
                )
            tagged_sentences.append(rows)

    return tagged_sentences


sentences = []

# handle DRUG-AE.rel
sentences.extend(
    _read_relation_file(data_filename_adverse_effect, "Adverse_Effect", "Causes")
)

# handle DRUG-DOSE.rel
sentences.extend(
    _read_relation_file(data_filename_dose, "Dose", "Dosage")
)

# Randomise the sentence order in place so the split below is not biased by
# the order in which the input files were read.
random.shuffle(sentences)

# Despite its name, split_ratio is the index of the first test sentence:
# the first 70% of the shuffled list is used for training, the rest for test.
split_ratio = int(len(sentences) * 70 / 100)
train_set, test_set = sentences[:split_ratio], sentences[split_ratio:]


def _write_sentences(path, tagged_sentences):
    """Write tagged sentences to ``path``.

    Every sentence is emitted as a ``#<n>`` header line (``n`` counts from 0
    within this file) followed by one line per token, the token's string
    fields joined by tabs.

    Args:
        path: Output file; overwritten if it already exists.
        tagged_sentences: Iterable of sentences, each a sequence of tuples
            of strings.
    """
    # Explicit UTF-8 so non-ASCII tokens do not depend on the platform's
    # default encoding.
    with open(path, "w", encoding="utf-8") as out:
        for index, rows in enumerate(tagged_sentences):
            out.write("#" + str(index) + "\n")
            out.writelines("\t".join(row) + "\n" for row in rows)


_write_sentences("train.txt", train_set)
_write_sentences("test.txt", test_set)