In [1]:
import sys; sys.path.append('../src')

from input_output.parser import Parser
from input_output.writer import Writer
from preprocessing.tokenizer import tokenize
from preprocessing.transformations import CRF_get_tag, CRFfeatureTransformer

from models.crf import CRFClassifier
from structs import DrugEntity

In [2]:
bank_type = 'DDI'
bank_name = 'DrugBank'
test_dir = f'../resources/Test-{bank_type}/{bank_name}/'
train_dir = f'../resources/Train/{bank_name}/'

out_file_name = 'task9.2_CRF1_1.txt'
out_folder = '../out/'
out_file = f'{out_folder}{out_file_name}'

In [3]:
train = Parser(train_dir).call()

train['tokens'] = train['sentence'].apply(tokenize)
train['crf_features'] = train['tokens'].apply(CRFfeatureTransformer().fit_transform)
train['crf_tags'] = train[['tokens', 'parsed_drugs']].apply(CRF_get_tag, axis=1)

In [4]:
train.head()

Unnamed: 0,id,sentence,parsed_drugs,parsed_pairs,tokens,crf_features,crf_tags
0,DDI-DrugBank.d25.s0,Corticotropin may accentuate the electrolyte l...,[<DrugEntity DDI-DrugBank.d25.s0.e0 0-12 Corti...,[<DrugPair DDI-DrugBank.d25.s0.p0 DDI-DrugBank...,"[{'text': 'Corticotropin', 'char_offset': [0, ...","[{'form': 'Corticotropin', 'form_lower': 'cort...","[B-drug, O, O, O, O, O, O, O, B-drug, O, O]"
1,DDI-DrugBank.d7.s0,The rate of metabolism and the leukopenic acti...,[<DrugEntity DDI-DrugBank.d7.s0.e0 54-69 cyclo...,[<DrugPair DDI-DrugBank.d7.s0.p0 DDI-DrugBank....,"[{'text': 'The', 'char_offset': [0, 2]}, {'tex...","[{'form': 'The', 'form_lower': 'the', 'suf3': ...","[O, O, O, O, O, O, O, O, O, B-drug, O, O, O, O..."
2,DDI-DrugBank.d7.s1,The physician should be alert for possible com...,[<DrugEntity DDI-DrugBank.d7.s1.e0 102-117 cyc...,[<DrugPair DDI-DrugBank.d7.s1.p0 DDI-DrugBank....,"[{'text': 'The', 'char_offset': [0, 2]}, {'tex...","[{'form': 'The', 'form_lower': 'the', 'suf3': ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,DDI-DrugBank.d7.s2,"Cyclophosphamide treatment, which causes a mar...",[<DrugEntity DDI-DrugBank.d7.s2.e0 0-15 Cyclop...,[<DrugPair DDI-DrugBank.d7.s2.p0 DDI-DrugBank....,"[{'text': 'Cyclophosphamide', 'char_offset': [...","[{'form': 'Cyclophosphamide', 'form_lower': 'c...","[B-drug, O, O, O, O, O, O, O, O, O, O, O, O, O..."
4,DDI-DrugBank.d7.s3,If a patient has been treated with cyclophosph...,[<DrugEntity DDI-DrugBank.d7.s3.e0 35-50 cyclo...,[],"[{'text': 'If', 'char_offset': [0, 1]}, {'text...","[{'form': 'If', 'form_lower': 'if', 'suf3': 'I...","[O, O, O, O, O, O, O, B-drug, O, O, O, O, O, O..."


In [5]:
test = Parser(test_dir).call()

test['tokens'] = test['sentence'].apply(tokenize)
test['crf_features'] = test['tokens'].apply(CRFfeatureTransformer().fit_transform)

In [6]:
test.head()

Unnamed: 0,id,sentence,parsed_drugs,parsed_pairs,tokens,crf_features
0,DDI-DrugBank.d696.s0,Methysergide may reverse the analgesic activit...,[<DrugEntity DDI-DrugBank.d696.s0.e0 0-11 Meth...,[<DrugPair DDI-DrugBank.d696.s0.p0 DDI-DrugBan...,"[{'text': 'Methysergide', 'char_offset': [0, 1...","[{'form': 'Methysergide', 'form_lower': 'methy..."
1,DDI-DrugBank.d696.s1,Concurrent use with vasoconstrictor agents inc...,[<DrugEntity DDI-DrugBank.d696.s1.e0 20-41 vas...,[<DrugPair DDI-DrugBank.d696.s1.p0 DDI-DrugBan...,"[{'text': 'Concurrent', 'char_offset': [0, 9]}...","[{'form': 'Concurrent', 'form_lower': 'concurr..."
2,DDI-DrugBank.d756.s0,Dosages of concomitantly administered opioids ...,[<DrugEntity DDI-DrugBank.d756.s0.e0 38-44 opi...,[<DrugPair DDI-DrugBank.d756.s0.p0 DDI-DrugBan...,"[{'text': 'Dosages', 'char_offset': [0, 6]}, {...","[{'form': 'Dosages', 'form_lower': 'dosages', ..."
3,DDI-DrugBank.d756.s1,Combination with tramadol (Ultram) is associat...,[<DrugEntity DDI-DrugBank.d756.s1.e0 17-24 tra...,[<DrugPair DDI-DrugBank.d756.s1.p0 DDI-DrugBan...,"[{'text': 'Combination', 'char_offset': [0, 10...","[{'form': 'Combination', 'form_lower': 'combin..."
4,DDI-DrugBank.d756.s2,Additive sedative effects and confusional stat...,[<DrugEntity DDI-DrugBank.d756.s2.e0 63-77 lev...,[<DrugPair DDI-DrugBank.d756.s2.p0 DDI-DrugBan...,"[{'text': 'Additive', 'char_offset': [0, 7]}, ...","[{'form': 'Additive', 'form_lower': 'additive'..."


In [7]:
clf = CRFClassifier().fit(train['crf_features'], train['crf_tags'])
test['crf_tags'] = clf.predict(test['crf_features'])

In [8]:
drugs = []
for tokens, crf_tags in zip(test['tokens'], test['crf_tags']):
    current_drugs = []
    current_token = None
    for token, crf_tag in zip(tokens, crf_tags):
        if crf_tag == 'O':
            if current_token is not None:
                current_drugs.append(current_token)
                current_token = None
        else:
            if current_token == None:
                current_token = DrugEntity(
                    offsets=token['char_offset'],
                    de_type=crf_tag.split('-')[-1],
                    text=token['text']
                )
            else:
                current_token.offsets = [current_token.offsets[0], token['char_offset'][1]]
                current_token.text = current_token.text + ' ' + token['text']

    drugs.append(current_drugs)

test['drugs'] = drugs

In [9]:
test.head()

Unnamed: 0,id,sentence,parsed_drugs,parsed_pairs,tokens,crf_features,crf_tags,drugs
0,DDI-DrugBank.d696.s0,Methysergide may reverse the analgesic activit...,[<DrugEntity DDI-DrugBank.d696.s0.e0 0-11 Meth...,[<DrugPair DDI-DrugBank.d696.s0.p0 DDI-DrugBan...,"[{'text': 'Methysergide', 'char_offset': [0, 1...","[{'form': 'Methysergide', 'form_lower': 'methy...","[B-drug, O, O, O, O, O, O, B-group, I-group, O]","[<DrugEntity None 0-11 Methysergide drug>, <Dr..."
1,DDI-DrugBank.d696.s1,Concurrent use with vasoconstrictor agents inc...,[<DrugEntity DDI-DrugBank.d696.s1.e0 20-41 vas...,[<DrugPair DDI-DrugBank.d696.s1.p0 DDI-DrugBan...,"[{'text': 'Concurrent', 'char_offset': [0, 9]}...","[{'form': 'Concurrent', 'form_lower': 'concurr...","[O, O, O, B-group, I-group, O, B-group, I-grou...",[<DrugEntity None 20-41 vasoconstrictor agents...
2,DDI-DrugBank.d756.s0,Dosages of concomitantly administered opioids ...,[<DrugEntity DDI-DrugBank.d756.s0.e0 38-44 opi...,[<DrugPair DDI-DrugBank.d756.s0.p0 DDI-DrugBan...,"[{'text': 'Dosages', 'char_offset': [0, 6]}, {...","[{'form': 'Dosages', 'form_lower': 'dosages', ...","[O, O, O, O, B-group, O, O, O, O, O, O, O, O, ...","[<DrugEntity None 38-44 opioids group>, <DrugE..."
3,DDI-DrugBank.d756.s1,Combination with tramadol (Ultram) is associat...,[<DrugEntity DDI-DrugBank.d756.s1.e0 17-24 tra...,[<DrugPair DDI-DrugBank.d756.s1.p0 DDI-DrugBan...,"[{'text': 'Combination', 'char_offset': [0, 10...","[{'form': 'Combination', 'form_lower': 'combin...","[O, O, B-drug, O, B-brand, O, O, O, O, O, O, O...","[<DrugEntity None 17-24 tramadol drug>, <DrugE..."
4,DDI-DrugBank.d756.s2,Additive sedative effects and confusional stat...,[<DrugEntity DDI-DrugBank.d756.s2.e0 63-77 lev...,[<DrugPair DDI-DrugBank.d756.s2.p0 DDI-DrugBan...,"[{'text': 'Additive', 'char_offset': [0, 7]}, ...","[{'form': 'Additive', 'form_lower': 'additive'...","[O, O, O, O, O, O, O, O, O, B-drug, O, O, O, B...","[<DrugEntity None 63-77 levomepromazine drug>,..."


In [11]:
Writer(out_file).call(test, col_names=['drugs'])
pass

In [12]:
results = !java -jar ../bin/evaluateNER.jar {test_dir} {out_file}
!rm {out_folder}*.log *.txt
print('\n'.join(results[-5:-2]))

MACRO-AVERAGE MEASURES:
P	R	F1
0.71	0.55	0.61
