In [1]:
import sys

# Make '../src' importable; the project-local packages imported further down
# (input_output, preprocessing, models) presumably live there — path is
# relative to the kernel's working directory.
sys.path.append('../src')

from input_output.parser import Parser
from input_output.writer import Writer
from preprocessing.tokenizer import tokenize
from preprocessing.transformations import CRFfeatureTransformer, crfGetTag
from models.rules import classify_token, classify_tokens
import pandas as pd

In [2]:
# Raise pandas' display limits so wide, list-valued columns are shown with
# far less truncation when a frame is rendered.
display_limits = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.max_colwidth': 1000,
}
for option, limit in display_limits.items():
    pd.set_option(option, limit)

In [3]:
# Input corpus: the directory name is built from the task type and the bank name.
bank_type, bank_name = 'DDI', 'DrugBank'
bank_dir = f'../resources/Test-{bank_type}/{bank_name}/'

# Output location: folder and file name are kept separate, then joined.
# NOTE(review): out_file is not used in the cells visible here — presumably
# handed to Writer later; confirm.
out_folder = '../out/'
out_file_name = 'task9.2_CRF1_1.txt'
out_file = out_folder + out_file_name

In [4]:
# Parse the corpus under bank_dir into a frame, then add two derived columns:
#   tokens - tokenize() applied to each sentence
#   drugs  - classify_tokens() applied to each sentence's token list
# .assign evaluates its keyword arguments in order, so 'drugs' can read the
# freshly added 'tokens' column.
df = Parser(bank_dir).call().assign(
    tokens=lambda frame: frame['sentence'].apply(tokenize),
    drugs=lambda frame: frame['tokens'].apply(classify_tokens),
)

In [5]:
#df.head()

In [6]:
# Turn each sentence's token list into its CRF feature representation and keep
# the result in the 'crf_features' column (one entry per row of df).
# NOTE(review): fit_transform is invoked once per row, so whatever `fit` learns
# is recomputed from a single sentence each time — presumably the transformer
# is stateless; TODO confirm, otherwise fit once and apply `transform`.
crf_transformer = CRFfeatureTransformer()
df['crf_features'] = df['tokens'].apply(crf_transformer.fit_transform)


In [7]:
# Compute the 'crf_tags' column row by row: with axis=1, crfGetTag receives a
# Series holding only that row's 'tokens' and 'parsed_drugs' values.
# In the preview the result is a list of labels such as 'O', 'B-drug', 'B-brand'.
# NOTE(review): in the previewed sentence d610.s4 the first 'methadone' entity
# (49-57) ends up tagged 'O' — it sits inside the single token
# 'methadone-maintenance' (49-69). Looks like tagging relies on token and
# entity offsets lining up; confirm whether dropping such entities is intended.
df['crf_tags'] = df[['tokens', 'parsed_drugs']].apply(crfGetTag, axis=1)

In [8]:
# Preview the first five rows to eyeball the derived columns.
df.head(n=5)

Unnamed: 0,id,sentence,parsed_drugs,parsed_pairs,tokens,drugs,crf_features,crf_tags
0,DDI-DrugBank.d610.s0,Pharmacokinetic properties of abacavir were not altered by the addition of either lamivudine or zidovudine or the combination of lamivudine and zidovudine.,"[<DrugEntity DDI-DrugBank.d610.s0.e0 30-37 abacavir drug>, <DrugEntity DDI-DrugBank.d610.s0.e1 82-91 lamivudine drug>, <DrugEntity DDI-DrugBank.d610.s0.e2 96-105 zidovudine drug>, <DrugEntity DDI-DrugBank.d610.s0.e3 129-138 lamivudine drug>, <DrugEntity DDI-DrugBank.d610.s0.e4 144-153 zidovudine drug>]","[<DrugPair DDI-DrugBank.d610.s0.p0 DDI-DrugBank.d610.s0.e0 DDI-DrugBank.d610.s0.e1 None>, <DrugPair DDI-DrugBank.d610.s0.p1 DDI-DrugBank.d610.s0.e0 DDI-DrugBank.d610.s0.e2 None>, <DrugPair DDI-DrugBank.d610.s0.p2 DDI-DrugBank.d610.s0.e0 DDI-DrugBank.d610.s0.e3 None>, <DrugPair DDI-DrugBank.d610.s0.p3 DDI-DrugBank.d610.s0.e0 DDI-DrugBank.d610.s0.e4 None>, <DrugPair DDI-DrugBank.d610.s0.p4 DDI-DrugBank.d610.s0.e1 DDI-DrugBank.d610.s0.e2 None>, <DrugPair DDI-DrugBank.d610.s0.p5 DDI-DrugBank.d610.s0.e1 DDI-DrugBank.d610.s0.e3 None>, <DrugPair DDI-DrugBank.d610.s0.p6 DDI-DrugBank.d610.s0.e1 DDI-DrugBank.d610.s0.e4 None>, <DrugPair DDI-DrugBank.d610.s0.p7 DDI-DrugBank.d610.s0.e2 DDI-DrugBank.d610.s0.e3 None>, <DrugPair DDI-DrugBank.d610.s0.p8 DDI-DrugBank.d610.s0.e2 DDI-DrugBank.d610.s0.e4 None>, <DrugPair DDI-DrugBank.d610.s0.p9 DDI-DrugBank.d610.s0.e3 DDI-DrugBank.d610.s0.e4 None>]","[{'text': 'Pharmacokinetic', 'char_offset': [0, 14]}, {'text': 'properties', 'char_offset': [16, 25]}, {'text': 'of', 'char_offset': [27, 28]}, {'text': 'abacavir', 'char_offset': [30, 37]}, {'text': 'were', 'char_offset': [39, 42]}, {'text': 'not', 'char_offset': [44, 46]}, {'text': 'altered', 'char_offset': [48, 54]}, {'text': 'by', 'char_offset': [56, 57]}, {'text': 'the', 'char_offset': [59, 61]}, {'text': 'addition', 'char_offset': [63, 70]}, {'text': 'of', 'char_offset': [72, 73]}, {'text': 'either', 'char_offset': [75, 80]}, {'text': 'lamivudine', 'char_offset': [82, 91]}, {'text': 'or', 'char_offset': 
[93, 94]}, {'text': 'zidovudine', 'char_offset': [96, 105]}, {'text': 'or', 'char_offset': [107, 108]}, {'text': 'the', 'char_offset': [110, 112]}, {'text': 'combination', 'char_offset': [114, 124]}, {'text': 'of', 'char_offset': [126, 127]}, {'text': 'lamivudine', 'char_offset': [129, 138]}, {'text': 'and', 'char_offset': [140, 142]}, {'text': 'zidovudine', 'char_offset': [14...","[<DrugEntity None 0-14 Pharmacokinetic group>, <DrugEntity None 30-37 abacavir drug>, <DrugEntity None 82-91 lamivudine drug>, <DrugEntity None 96-105 zidovudine drug>, <DrugEntity None 129-138 lamivudine drug>, <DrugEntity None 144-153 zidovudine drug>]","[[0, 14, form=Pharmacokinetic, formlower=pharmacokinetic, suf3=tic, suf4=etic, isTitle, BoS, formNext=properties, formlowerNext=properties, suf3Next=ies, suf4Next=ties, isTitleNext], [16, 25, form=properties, formlower=properties, suf3=ies, suf4=ties, formPrev=Pharmacokinetic, formlowerPrev=pharmacokinetic, suf3Prev=tic, suf4Prev=etic, formNext=of, formlowerNext=of, suf3Next=of, suf4Next=of], [27, 28, form=of, formlower=of, suf3=of, suf4=of, formPrev=properties, formlowerPrev=properties, suf3Prev=ies, suf4Prev=ties, formNext=abacavir, formlowerNext=abacavir, suf3Next=vir, suf4Next=avir], [30, 37, form=abacavir, formlower=abacavir, suf3=vir, suf4=avir, formPrev=of, formlowerPrev=of, suf3Prev=of, suf4Prev=of, formNext=were, formlowerNext=were, suf3Next=ere, suf4Next=were], [39, 42, form=were, formlower=were, suf3=ere, suf4=were, formPrev=abacavir, formlowerPrev=abacavir, suf3Prev=vir, suf4Prev=avir, formNext=not, formlowerNext=not, suf3Next=not, suf4Next=not], [44, 46, form=not, form...","[O, O, O, B-drug, O, O, O, O, O, O, O, O, B-drug, O, B-drug, O, O, O, O, B-drug, O, B-drug, O]"
1,DDI-DrugBank.d610.s1,No clinically significant changes to lamivudine or zidovudine pharmacokinetics were observed following concomitant administration of abacavir.,"[<DrugEntity DDI-DrugBank.d610.s1.e0 37-46 lamivudine drug>, <DrugEntity DDI-DrugBank.d610.s1.e1 51-60 zidovudine drug>, <DrugEntity DDI-DrugBank.d610.s1.e2 133-140 abacavir drug>]","[<DrugPair DDI-DrugBank.d610.s1.p0 DDI-DrugBank.d610.s1.e0 DDI-DrugBank.d610.s1.e1 None>, <DrugPair DDI-DrugBank.d610.s1.p1 DDI-DrugBank.d610.s1.e0 DDI-DrugBank.d610.s1.e2 None>, <DrugPair DDI-DrugBank.d610.s1.p2 DDI-DrugBank.d610.s1.e1 DDI-DrugBank.d610.s1.e2 None>]","[{'text': 'No', 'char_offset': [0, 1]}, {'text': 'clinically', 'char_offset': [3, 12]}, {'text': 'significant', 'char_offset': [14, 24]}, {'text': 'changes', 'char_offset': [26, 32]}, {'text': 'to', 'char_offset': [34, 35]}, {'text': 'lamivudine', 'char_offset': [37, 46]}, {'text': 'or', 'char_offset': [48, 49]}, {'text': 'zidovudine', 'char_offset': [51, 60]}, {'text': 'pharmacokinetics', 'char_offset': [62, 77]}, {'text': 'were', 'char_offset': [79, 82]}, {'text': 'observed', 'char_offset': [84, 91]}, {'text': 'following', 'char_offset': [93, 101]}, {'text': 'concomitant', 'char_offset': [103, 113]}, {'text': 'administration', 'char_offset': [115, 128]}, {'text': 'of', 'char_offset': [130, 131]}, {'text': 'abacavir', 'char_offset': [133, 140]}, {'text': '.', 'char_offset': [141, 141]}]","[<DrugEntity None 14-24 significant group>, <DrugEntity None 37-46 lamivudine drug>, <DrugEntity None 51-60 zidovudine drug>, <DrugEntity None 62-77 pharmacokinetics group>, <DrugEntity None 103-113 concomitant group>, <DrugEntity None 133-140 abacavir drug>]","[[0, 1, form=No, formlower=no, suf3=No, suf4=No, isTitle, BoS, formNext=clinically, formlowerNext=clinically, suf3Next=lly, suf4Next=ally, isTitleNext], [3, 12, form=clinically, formlower=clinically, suf3=lly, suf4=ally, formPrev=No, formlowerPrev=no, suf3Prev=No, suf4Prev=No, formNext=significant, 
formlowerNext=significant, suf3Next=ant, suf4Next=cant], [14, 24, form=significant, formlower=significant, suf3=ant, suf4=cant, formPrev=clinically, formlowerPrev=clinically, suf3Prev=lly, suf4Prev=ally, formNext=changes, formlowerNext=changes, suf3Next=ges, suf4Next=nges], [26, 32, form=changes, formlower=changes, suf3=ges, suf4=nges, formPrev=significant, formlowerPrev=significant, suf3Prev=ant, suf4Prev=cant, formNext=to, formlowerNext=to, suf3Next=to, suf4Next=to], [34, 35, form=to, formlower=to, suf3=to, suf4=to, formPrev=changes, formlowerPrev=changes, suf3Prev=ges, suf4Prev=nges, formNext=lamivudine, formlowerNext=lamivudine, suf3Next=ine, suf4Next=dine], [37, 46, form=lamivudine,...","[O, O, O, O, O, B-drug, O, B-drug, O, O, O, O, O, O, O, B-drug, O]"
2,DDI-DrugBank.d610.s2,Abacavir has no effect on the pharmacokinetic properties of ethanol.,"[<DrugEntity DDI-DrugBank.d610.s2.e0 0-7 Abacavir drug>, <DrugEntity DDI-DrugBank.d610.s2.e1 60-66 ethanol drug>]",[<DrugPair DDI-DrugBank.d610.s2.p0 DDI-DrugBank.d610.s2.e0 DDI-DrugBank.d610.s2.e1 None>],"[{'text': 'Abacavir', 'char_offset': [0, 7]}, {'text': 'has', 'char_offset': [9, 11]}, {'text': 'no', 'char_offset': [13, 14]}, {'text': 'effect', 'char_offset': [16, 21]}, {'text': 'on', 'char_offset': [23, 24]}, {'text': 'the', 'char_offset': [26, 28]}, {'text': 'pharmacokinetic', 'char_offset': [30, 44]}, {'text': 'properties', 'char_offset': [46, 55]}, {'text': 'of', 'char_offset': [57, 58]}, {'text': 'ethanol', 'char_offset': [60, 66]}, {'text': '.', 'char_offset': [67, 67]}]","[<DrugEntity None 0-7 Abacavir drug>, <DrugEntity None 30-44 pharmacokinetic group>, <DrugEntity None 60-66 ethanol drug>]","[[0, 7, form=Abacavir, formlower=abacavir, suf3=vir, suf4=avir, isTitle, BoS, formNext=has, formlowerNext=has, suf3Next=has, suf4Next=has, isTitleNext], [9, 11, form=has, formlower=has, suf3=has, suf4=has, formPrev=Abacavir, formlowerPrev=abacavir, suf3Prev=vir, suf4Prev=avir, formNext=no, formlowerNext=no, suf3Next=no, suf4Next=no], [13, 14, form=no, formlower=no, suf3=no, suf4=no, formPrev=has, formlowerPrev=has, suf3Prev=has, suf4Prev=has, formNext=effect, formlowerNext=effect, suf3Next=ect, suf4Next=fect], [16, 21, form=effect, formlower=effect, suf3=ect, suf4=fect, formPrev=no, formlowerPrev=no, suf3Prev=no, suf4Prev=no, formNext=on, formlowerNext=on, suf3Next=on, suf4Next=on], [23, 24, form=on, formlower=on, suf3=on, suf4=on, formPrev=effect, formlowerPrev=effect, suf3Prev=ect, suf4Prev=fect, formNext=the, formlowerNext=the, suf3Next=the, suf4Next=the], [26, 28, form=the, formlower=the, suf3=the, suf4=the, formPrev=on, formlowerPrev=on, suf3Prev=on, suf4Prev=on, formNext=phar...","[B-drug, O, O, O, O, O, O, O, O, B-drug, O]"
3,DDI-DrugBank.d610.s3,Ethanol decreases the elimination of abacavir causing an increase in overall exposure . The addition of methadone has no clinically significant effect on the pharmacokinetic properties of abacavir.,"[<DrugEntity DDI-DrugBank.d610.s3.e0 0-6 Ethanol drug>, <DrugEntity DDI-DrugBank.d610.s3.e1 37-44 abacavir drug>, <DrugEntity DDI-DrugBank.d610.s3.e2 104-112 methadone drug>, <DrugEntity DDI-DrugBank.d610.s3.e3 188-195 abacavir drug>]","[<DrugPair DDI-DrugBank.d610.s3.p0 DDI-DrugBank.d610.s3.e0 DDI-DrugBank.d610.s3.e1 mechanism>, <DrugPair DDI-DrugBank.d610.s3.p1 DDI-DrugBank.d610.s3.e0 DDI-DrugBank.d610.s3.e2 None>, <DrugPair DDI-DrugBank.d610.s3.p2 DDI-DrugBank.d610.s3.e0 DDI-DrugBank.d610.s3.e3 None>, <DrugPair DDI-DrugBank.d610.s3.p3 DDI-DrugBank.d610.s3.e1 DDI-DrugBank.d610.s3.e2 None>, <DrugPair DDI-DrugBank.d610.s3.p4 DDI-DrugBank.d610.s3.e1 DDI-DrugBank.d610.s3.e3 None>, <DrugPair DDI-DrugBank.d610.s3.p5 DDI-DrugBank.d610.s3.e2 DDI-DrugBank.d610.s3.e3 None>]","[{'text': 'Ethanol', 'char_offset': [0, 6]}, {'text': 'decreases', 'char_offset': [8, 16]}, {'text': 'the', 'char_offset': [18, 20]}, {'text': 'elimination', 'char_offset': [22, 32]}, {'text': 'of', 'char_offset': [34, 35]}, {'text': 'abacavir', 'char_offset': [37, 44]}, {'text': 'causing', 'char_offset': [46, 52]}, {'text': 'an', 'char_offset': [54, 55]}, {'text': 'increase', 'char_offset': [57, 64]}, {'text': 'in', 'char_offset': [66, 67]}, {'text': 'overall', 'char_offset': [69, 75]}, {'text': 'exposure', 'char_offset': [77, 84]}, {'text': '.', 'char_offset': [86, 86]}, {'text': 'The', 'char_offset': [88, 90]}, {'text': 'addition', 'char_offset': [92, 99]}, {'text': 'of', 'char_offset': [101, 102]}, {'text': 'methadone', 'char_offset': [104, 112]}, {'text': 'has', 'char_offset': [114, 116]}, {'text': 'no', 'char_offset': [118, 119]}, {'text': 'clinically', 'char_offset': [121, 130]}, {'text': 'significant', 'char_offset': [132, 142]}, {'text': 'effect', 'char_offset': [144, 
149]...","[<DrugEntity None 0-6 Ethanol drug>, <DrugEntity None 37-44 abacavir drug>, <DrugEntity None 104-112 methadone drug>, <DrugEntity None 132-142 significant group>, <DrugEntity None 158-172 pharmacokinetic group>, <DrugEntity None 188-195 abacavir drug>]","[[0, 6, form=Ethanol, formlower=ethanol, suf3=nol, suf4=anol, isTitle, BoS, formNext=decreases, formlowerNext=decreases, suf3Next=ses, suf4Next=ases, isTitleNext], [8, 16, form=decreases, formlower=decreases, suf3=ses, suf4=ases, formPrev=Ethanol, formlowerPrev=ethanol, suf3Prev=nol, suf4Prev=anol, formNext=the, formlowerNext=the, suf3Next=the, suf4Next=the], [18, 20, form=the, formlower=the, suf3=the, suf4=the, formPrev=decreases, formlowerPrev=decreases, suf3Prev=ses, suf4Prev=ases, formNext=elimination, formlowerNext=elimination, suf3Next=ion, suf4Next=tion], [22, 32, form=elimination, formlower=elimination, suf3=ion, suf4=tion, formPrev=the, formlowerPrev=the, suf3Prev=the, suf4Prev=the, formNext=of, formlowerNext=of, suf3Next=of, suf4Next=of], [34, 35, form=of, formlower=of, suf3=of, suf4=of, formPrev=elimination, formlowerPrev=elimination, suf3Prev=ion, suf4Prev=tion, formNext=abacavir, formlowerNext=abacavir, suf3Next=vir, suf4Next=avir], [37, 44, form=abacavir, formlower=ab...","[B-drug, O, O, O, O, B-drug, O, O, O, O, O, O, O, O, O, O, B-drug, O, O, O, O, O, O, O, O, O, O, B-drug, O]"
4,DDI-DrugBank.d610.s4,"In a study of 11 HIV-infected patients receiving methadone-maintenance therapy (40 mg and 90 mg daily) with 600 mg of ZIAGEN twice daily (twice the currently recommended dose), oral methadone clearance increased 22% (90% CI 6% to 42%).","[<DrugEntity DDI-DrugBank.d610.s4.e0 49-57 methadone drug>, <DrugEntity DDI-DrugBank.d610.s4.e1 118-123 ZIAGEN brand>, <DrugEntity DDI-DrugBank.d610.s4.e2 182-190 methadone drug>]","[<DrugPair DDI-DrugBank.d610.s4.p0 DDI-DrugBank.d610.s4.e0 DDI-DrugBank.d610.s4.e1 mechanism>, <DrugPair DDI-DrugBank.d610.s4.p1 DDI-DrugBank.d610.s4.e0 DDI-DrugBank.d610.s4.e2 None>, <DrugPair DDI-DrugBank.d610.s4.p2 DDI-DrugBank.d610.s4.e1 DDI-DrugBank.d610.s4.e2 None>]","[{'text': 'In', 'char_offset': [0, 1]}, {'text': 'a', 'char_offset': [3, 3]}, {'text': 'study', 'char_offset': [5, 9]}, {'text': 'of', 'char_offset': [11, 12]}, {'text': '11', 'char_offset': [14, 15]}, {'text': 'HIV-infected', 'char_offset': [17, 28]}, {'text': 'patients', 'char_offset': [30, 37]}, {'text': 'receiving', 'char_offset': [39, 47]}, {'text': 'methadone-maintenance', 'char_offset': [49, 69]}, {'text': 'therapy', 'char_offset': [71, 77]}, {'text': '(', 'char_offset': [79, 79]}, {'text': '40', 'char_offset': [80, 81]}, {'text': 'mg', 'char_offset': [83, 84]}, {'text': 'and', 'char_offset': [86, 88]}, {'text': '90', 'char_offset': [90, 91]}, {'text': 'mg', 'char_offset': [93, 94]}, {'text': 'daily', 'char_offset': [96, 100]}, {'text': ')', 'char_offset': [101, 101]}, {'text': 'with', 'char_offset': [103, 106]}, {'text': '600', 'char_offset': [108, 110]}, {'text': 'mg', 'char_offset': [112, 113]}, {'text': 'of', 'char_offset': [115, 116]}, {'text': 'ZIAGEN', 'char_offset': ...","[<DrugEntity None 30-37 patients group>, <DrugEntity None 118-123 ZIAGEN brand>, <DrugEntity None 182-190 methadone drug>, <DrugEntity None 221-222 CI brand>]","[[0, 1, form=In, formlower=in, suf3=In, suf4=In, isTitle, BoS, formNext=a, formlowerNext=a, suf3Next=a, suf4Next=a, 
isTitleNext], [3, 3, form=a, formlower=a, suf3=a, suf4=a, formPrev=In, formlowerPrev=in, suf3Prev=In, suf4Prev=In, formNext=study, formlowerNext=study, suf3Next=udy, suf4Next=tudy], [5, 9, form=study, formlower=study, suf3=udy, suf4=tudy, formPrev=a, formlowerPrev=a, suf3Prev=a, suf4Prev=a, formNext=of, formlowerNext=of, suf3Next=of, suf4Next=of], [11, 12, form=of, formlower=of, suf3=of, suf4=of, formPrev=study, formlowerPrev=study, suf3Prev=udy, suf4Prev=tudy, formNext=11, formlowerNext=11, suf3Next=11, suf4Next=11], [14, 15, form=11, formlower=11, suf3=11, suf4=11, isDigit, formPrev=of, formlowerPrev=of, suf3Prev=of, suf4Prev=of, isDigitPrev, formNext=HIV-infected, formlowerNext=hiv-infected, suf3Next=ted, suf4Next=cted, isDigitNext], [17, 28, form=HIV-infected, formlower=hiv-infected, suf3=ted, suf4=cted, formPrev=11, formlowerPrev=11, suf3Prev=11, suf4Prev=11, for...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-brand, O, O, O, O, O, O, O, O, O, O, O, B-drug, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"


In [9]:
# Sanity check: number of CRF tags stored for the row labelled 0.
len(df.loc[0, 'crf_tags'])

23

In [10]:
# Sanity check: number of per-token feature lists stored for the row labelled 0.
len(df.loc[0, 'crf_features'])

23

In [11]:
# Sanity check: number of parsed drug entities in the row labelled 0.
len(df.loc[0, 'parsed_drugs'])

5