In [1]:
import sys; sys.path.append('../src')

from input_output.parser import Parser
from input_output.writer import Writer
from preprocessing.tokenizer import tokenize
from preprocessing.transformations import CRF_get_tag, CRFfeatureTransformer

from models.crf import CRFClassifier
from structs import DrugEntity

In [2]:
bank_type = 'DDI'
bank_name = 'DrugBank'
test_dir = f'../resources/Test-{bank_type}/{bank_name}/'
train_dir = f'../resources/Train/{bank_name}/'

out_file_name = 'task9.2_CRF1_1.txt'
out_folder = '../out/'
out_file = f'{out_folder}{out_file_name}'

In [3]:
train = Parser(train_dir).call()

train['tokens'] = train['sentence'].apply(tokenize)
train['crf_features'] = train['tokens'].apply(CRFfeatureTransformer().fit_transform)
train['crf_tags'] = train[['tokens', 'parsed_drugs']].apply(CRF_get_tag, axis=1)

In [4]:
train.crf_features[0]

[{'form': 'FLUOTHANE',
  'form_lower': 'fluothane',
  'suf3': 'ANE',
  'suf4': 'HANE',
  'is_upper': True,
  'is_title': False,
  'is_digit': False,
  'pos_tag': 'NNP',
  'rule_classification': 'brand',
  'BoS': True,
  'next_form': 'augments',
  'next_form_lower': 'augments',
  'next_suf3': 'nts',
  'next_suf4': 'ents',
  'next_is_upper': False,
  'next_is_title': False,
  'next_is_digit': False,
  'next_pos_tag': 'VBZ',
  'next_rule_classification': 'group'},
 {'form': 'augments',
  'form_lower': 'augments',
  'suf3': 'nts',
  'suf4': 'ents',
  'is_upper': False,
  'is_title': False,
  'is_digit': False,
  'pos_tag': 'VBZ',
  'rule_classification': 'group',
  'previous_form': 'FLUOTHANE',
  'previous_form_lower': 'fluothane',
  'previous_suf3': 'ANE',
  'previous_suf4': 'HANE',
  'previous_is_upper': True,
  'previous_is_title': False,
  'previous_is_digit': False,
  'previous_pos_tag': 'NNP',
  'previous_rule_classification': 'brand',
  'next_form': 'the',
  'next_form_lower': 'the'

In [5]:
test = Parser(test_dir).call()

test['tokens'] = test['sentence'].apply(tokenize)
test['crf_features'] = test['tokens'].apply(CRFfeatureTransformer().fit_transform)

In [6]:
test.head()

Unnamed: 0,id,sentence,parsed_drugs,parsed_pairs,tokens,crf_features
0,DDI-DrugBank.d635.s0,Other short-acting beta adrenergic aerosol bro...,[<DrugEntity DDI-DrugBank.d635.s0.e0 6-57 shor...,[<DrugPair DDI-DrugBank.d635.s0.p0 DDI-DrugBan...,"[{'text': 'Other', 'char_offset': [0, 4]}, {'t...","[{'form': 'Other', 'form_lower': 'other', 'suf..."
1,DDI-DrugBank.d703.s0,Interaction with Other Central Nervous System ...,[<DrugEntity DDI-DrugBank.d703.s0.e0 23-56 Cen...,[<DrugPair DDI-DrugBank.d703.s0.p0 DDI-DrugBan...,"[{'text': 'Interaction', 'char_offset': [0, 10...","[{'form': 'Interaction', 'form_lower': 'intera..."
2,DDI-DrugBank.d703.s1,"RESPIRATORY DEPRESSION, HYPOTENSION, AND PROFO...",[],[],"[{'text': 'RESPIRATORY', 'char_offset': [0, 10...","[{'form': 'RESPIRATORY', 'form_lower': 'respir..."
3,DDI-DrugBank.d712.s0,There have been no formal drug interaction stu...,[<DrugEntity DDI-DrugBank.d712.s0.e0 66-74 HER...,[],"[{'text': 'There', 'char_offset': [0, 4]}, {'t...","[{'form': 'There', 'form_lower': 'there', 'suf..."
4,DDI-DrugBank.d712.s1,Administration of paclitaxel in combination wi...,[<DrugEntity DDI-DrugBank.d712.s1.e0 18-27 pac...,[<DrugPair DDI-DrugBank.d712.s1.p0 DDI-DrugBan...,"[{'text': 'Administration', 'char_offset': [0,...","[{'form': 'Administration', 'form_lower': 'adm..."


In [7]:
clf = CRFClassifier().fit(train['crf_features'], train['crf_tags'])
test['crf_tags'] = clf.predict(test['crf_features'])

In [8]:
drugs = []
for tokens, crf_tags in zip(test['tokens'], test['crf_tags']):
    current_drugs = []
    current_token = None
    for token, crf_tag in zip(tokens, crf_tags):
        if crf_tag == 'O':
            if current_token is not None:
                current_drugs.append(current_token)
                current_token = None
        else:
            if current_token == None:
                current_token = DrugEntity(
                    offsets=token['char_offset'],
                    de_type=crf_tag.split('-')[-1],
                    text=token['text']
                )
            else:
                current_token.offsets = [current_token.offsets[0], token['char_offset'][1]]
                current_token.text = current_token.text + ' ' + token['text']

    drugs.append(current_drugs)

test['drugs'] = drugs

In [9]:
test.head()

Unnamed: 0,id,sentence,parsed_drugs,parsed_pairs,tokens,crf_features,crf_tags,drugs
0,DDI-DrugBank.d635.s0,Other short-acting beta adrenergic aerosol bro...,[<DrugEntity DDI-DrugBank.d635.s0.e0 6-57 shor...,[<DrugPair DDI-DrugBank.d635.s0.p0 DDI-DrugBan...,"[{'text': 'Other', 'char_offset': [0, 4]}, {'t...","[{'form': 'Other', 'form_lower': 'other', 'suf...","[O, O, O, O, O, B-group, O, O, O, O, O, O, O, ...",[<DrugEntity None 43-57 bronchodilators group>]
1,DDI-DrugBank.d703.s0,Interaction with Other Central Nervous System ...,[<DrugEntity DDI-DrugBank.d703.s0.e0 23-56 Cen...,[<DrugPair DDI-DrugBank.d703.s0.p0 DDI-DrugBan...,"[{'text': 'Interaction', 'char_offset': [0, 10...","[{'form': 'Interaction', 'form_lower': 'intera...","[O, O, O, B-group, I-group, I-group, I-group, ...",[<DrugEntity None 23-56 Central Nervous System...
2,DDI-DrugBank.d703.s1,"RESPIRATORY DEPRESSION, HYPOTENSION, AND PROFO...",[],[],"[{'text': 'RESPIRATORY', 'char_offset': [0, 10...","[{'form': 'RESPIRATORY', 'form_lower': 'respir...","[O, O, O, O, O, O, O, O, O, O, O, O, O]",[]
3,DDI-DrugBank.d712.s0,There have been no formal drug interaction stu...,[<DrugEntity DDI-DrugBank.d712.s0.e0 66-74 HER...,[],"[{'text': 'There', 'char_offset': [0, 4]}, {'t...","[{'form': 'There', 'form_lower': 'there', 'suf...","[O, O, O, O, O, O, O, O, O, O, B-brand, O, O, O]",[<DrugEntity None 66-74 HERCEPTIN brand>]
4,DDI-DrugBank.d712.s1,Administration of paclitaxel in combination wi...,[<DrugEntity DDI-DrugBank.d712.s1.e0 18-27 pac...,[<DrugPair DDI-DrugBank.d712.s1.p0 DDI-DrugBan...,"[{'text': 'Administration', 'char_offset': [0,...","[{'form': 'Administration', 'form_lower': 'adm...","[O, O, B-drug, O, O, O, B-brand, O, O, O, O, O...","[<DrugEntity None 18-27 paclitaxel drug>, <Dru..."


In [10]:
Writer(out_file).call(test, col_names=['drugs'])
pass

In [11]:
results = !java -jar ../bin/evaluateNER.jar {test_dir} {out_file}
!rm {out_folder}*.log *.txt
print('\n'.join(results[-5:-2]))

MACRO-AVERAGE MEASURES:
P	R	F1
0.71	0.58	0.63
