In [288]:
import sys; sys.path.append('../src')

from input_output.parser import Parser
from input_output.writer import Writer
from tokenizer import tokenize
from pipeline.rules.token_classifier import classify_token
import pandas as pd

In [336]:
# Raise pandas' display limits so wide / long frames are not truncated as early.
pd.set_option(
    'display.max_rows', 100,
    'display.max_columns', 50,
    'display.width', 1000,
)

In [290]:
# Corpus selection; bank_dir is assembled from the pieces below.
bank_type = 'NER'
bank_name = 'DrugBank'
# Alternative directory (currently unused): f'Test-{bank_type}'
dir_type = 'Train'
bank_dir = f'../resources/{dir_type}/{bank_name}/'


In [291]:
def get_drugs(tokens):
    """Run classify_token over every token and keep the results that are not None."""
    classified = (classify_token(token) for token in tokens)
    return [entity for entity in classified if entity is not None]


# Parse the corpus directory into a frame, then derive token and entity columns.
df = Parser(bank_dir).call()
df['tokens'] = df['sentence'].apply(tokenize)
df['drugs'] = df['tokens'].apply(get_drugs)

In [403]:
# Preview the first rows of df, including the derived 'tokens' and 'drugs' columns.
df.head()

Unnamed: 0,id,sentence,parsed_drugs,parsed_pairs,tokens,drugs
0,DDI-DrugBank.d436.s0,"No drug, nutritional supplement, food or herb ...",[],[],"[{'text': 'No', 'char_offset': '0-1'}, {'text'...",[<DrugEntity None 46-57 interactions group>]
1,DDI-DrugBank.d519.s0,No formal drug/drug interaction studies with P...,[<DrugEntity DDI-DrugBank.d519.s0.e0 45-52 Ple...,[],"[{'text': 'No', 'char_offset': '0-1'}, {'text'...",[]
2,DDI-DrugBank.d519.s1,Cytochrome P-450 is not known to be involved i...,[<DrugEntity DDI-DrugBank.d519.s1.e0 66-73 Ple...,[],"[{'text': 'Cytochrome', 'char_offset': '0-9'},...",[<DrugEntity None 11-15 P-450 brand>]
3,DDI-DrugBank.d519.s2,Plenaxis is highly bound to plasma proteins (9...,[<DrugEntity DDI-DrugBank.d519.s2.e0 0-7 Plena...,[],"[{'text': 'Plenaxis', 'char_offset': '0-7'}, {...",[<DrugEntity None 35-42 proteins group>]
4,DDI-DrugBank.d519.s3,Laboratory Tests Response to Plenaxis should b...,[<DrugEntity DDI-DrugBank.d519.s3.e0 29-36 Ple...,[<DrugPair DDI-DrugBank.d519.s3.p0 DDI-DrugBan...,"[{'text': 'Laboratory', 'char_offset': '0-9'},...","[<DrugEntity None 11-15 Tests group>, <DrugEnt..."


In [404]:
# (n_rows, n_columns) of df.
df.shape

(5675, 6)

In [294]:
# total number of parsed drug entities across all rows
# (sum of the per-row parsed_drugs list lengths; NOT the number of distinct types)
df.parsed_drugs.apply(len).sum()

12929

### Find differences between parsed and found drugs

In [392]:
def find_diffs(row, diffs=True):
    """Compare the texts of a row's found drugs against its parsed drugs.

    Args:
        row: object exposing ``parsed_drugs`` and ``drugs``, each an iterable
            of items with a ``text`` attribute.
        diffs: if truthy, return the found texts that do NOT occur among the
            parsed texts; otherwise return the found texts that do.

    Returns:
        A list of texts taken from ``row.drugs``, in their original order
        (duplicates are kept).
    """
    parsed_texts = [entity.text for entity in row.parsed_drugs]
    found_texts = [entity.text for entity in row.drugs]

    # Keep a found text when its membership in the parsed list matches the mode:
    # present for the "same" mode, absent for the "diffs" mode.
    want_present = not diffs
    return [text for text in found_texts if (text in parsed_texts) == want_present]

In [393]:
# Row-wise comparison; extra keyword arguments to apply() are forwarded to find_diffs.
diffs = df.apply(find_diffs, axis=1, diffs=True)
same = df.apply(find_diffs, axis=1, diffs=False)

In [395]:
# Reduce each row's entity list to the plain list of entity texts.
pd_txt, d_txt = (
    df[column].apply(lambda entities: [entity.text for entity in entities])
    for column in ('parsed_drugs', 'drugs')
)

In [396]:
# Side-by-side frame: parsed texts, found texts, and their per-row diff / overlap.
df_sim = pd.concat(
    [pd_txt, d_txt, diffs, same],
    axis=1,
    keys=['parsed_drugs', 'drugs', 'diffs', 'sim'],
)

In [397]:
# Preview the first rows of the text-level comparison frame.
df_sim.head()

Unnamed: 0,parsed_drugs,drugs,diffs,sim
0,[],[interactions],[interactions],[]
1,[Plenaxis],[],[],[]
2,[Plenaxis],[P-450],[P-450],[]
3,[Plenaxis],[proteins],[proteins],[]
4,"[Plenaxis, testosterone]","[Tests, testosterone, concentrations]","[Tests, concentrations]",[testosterone]


In [402]:
# Totals over all rows: summed list lengths of the 'diffs' and 'sim' columns.
print('diffs', df_sim.diffs.apply(len).sum())
print('sim', df_sim.sim.apply(len).sum())

diffs 11499
sim 8492


In [406]:
# Expand each row's 'drugs' list into columns (one pd.Series per row), then
# stack the wide result into a single long Series of found entities.
found_drugs = df.apply(lambda x: pd.Series(x['drugs']),axis=1).stack()
# Length of the flattened Series.
print(found_drugs.shape)

(19991,)


### Initial analysis of parsed drugs

In [295]:
# Expand each row's 'parsed_drugs' list into columns (one pd.Series per row),
# then stack the wide result into a single long Series of parsed entities.
drug_ents = df.apply(lambda x: pd.Series(x['parsed_drugs']),axis=1).stack()

In [303]:
# Distinct values of the `type` attribute across the parsed entities.
drug_ents.apply(lambda entity: entity.type).unique()

array(['brand', 'drug', 'group', 'drug_n'], dtype=object)

In [304]:
# `drugs` is never assigned in this notebook (it only existed as stale kernel
# state), so compute the per-entity type Series from drug_ents directly.
drug_ents.apply(lambda x: x.type).shape

(12929,)

In [305]:
# `drugs` is never assigned in this notebook (it only existed as stale kernel
# state), so count entity types from drug_ents directly.
drug_ents.apply(lambda x: x.type).value_counts()

drug      8197
group     3206
brand     1423
drug_n     103
dtype: int64

In [306]:
# Text of every parsed entity, aligned with drug_ents.
drug_names = drug_ents.apply(lambda entity: entity.text)

In [307]:
# Summary of the entity texts: count, number of unique values, most frequent value.
drug_names.describe()

count        12929
unique        2655
top       warfarin
freq           176
dtype: object

In [308]:
# `type` attribute of every parsed entity, aligned with drug_ents.
drug_types = drug_ents.apply(lambda entity: entity.type)

In [309]:
# Summary of the entity types: count, number of unique values, most frequent value.
drug_types.describe()

count     12929
unique        4
top        drug
freq       8197
dtype: object

In [310]:
# `char_offset` attribute of every parsed entity, aligned with drug_ents.
drug_chars = drug_ents.apply(lambda entity: entity.char_offset)

In [413]:
# One row per parsed entity: its text, type and character offset.
df_drugs = pd.concat(
    [drug_names, drug_types, drug_chars],
    axis=1,
    keys=['name', 'type', 'offset'],
).reset_index(drop=True)

In [415]:
# Preview the first 11 rows of the flattened entity table.
df_drugs.head(11)

Unnamed: 0,name,type,offset
0,Plenaxis,brand,45-52
1,Plenaxis,brand,66-73
2,Plenaxis,brand,0-7
3,Plenaxis,brand,29-36
4,testosterone,drug,83-94
5,Plenaxis,brand,76-83
6,ORENCIA,brand,61-67
7,MTX,drug,50-52
8,NSAIDs,group,55-60
9,corticosteroids,group,63-77


In [419]:
# Per-column summary (count / unique / top / freq) of the entity table.
df_drugs.describe()

Unnamed: 0,name,type,offset
count,12929,12929,12929
unique,2655,4,3957
top,warfarin,drug,0-9
freq,176,8197,267


In [452]:
# Ten most frequent 3-character name endings among rows whose type is 'drug'
# (other rows map to None, which value_counts() leaves out).
df_drugs.apply(
    lambda row: row['name'][-3:] if row['type'] == 'drug' else None,
    axis=1,
).value_counts()[:10]

ine    1919
ide     617
cin     396
ole     391
one     330
ate     311
rin     302
ium     291
vir     220
xin     220
dtype: int64

In [453]:
# Ten most frequent 4-character name endings among rows whose type is 'drug'
# (other rows map to None, which value_counts() leaves out).
df_drugs.apply(
    lambda row: row['name'][-4:] if row['type'] == 'drug' else None,
    axis=1,
).value_counts()[:10]

dine    439
zole    378
pine    309
arin    259
line    246
mide    237
mine    227
oxin    215
toin    214
avir    209
dtype: int64

In [454]:
# Ten most frequent 5-character name endings among rows whose type is 'drug'
# (other rows map to None, which value_counts() leaves out).
df_drugs.apply(
    lambda row: row['name'][-5:] if row['type'] == 'drug' else None,
    axis=1,
).value_counts()[:10]

azole    348
idine    311
farin    224
amine    219
navir    206
goxin    205
mycin    204
ytoin    201
thium    148
lline    143
dtype: int64

### Check unusual offset cases (offsets containing `;`, i.e. more than one character span)

In [316]:
# Entities whose offset string contains ';' — select them once, then show
# the size of the selection followed by the rows themselves.
multi_span = df_drugs[df_drugs.offset.str.contains(';')]
print(multi_span.shape)
multi_span

(33, 3)


Unnamed: 0,name,type,offset
45,diagnostic monoclonal antibodies,group,98-107;124-144
691,loop diuretics,group,150-153;187-195
692,potassium-sparing diuretics,group,156-172;187-195
1290,antiplatelet medication,group,81-92;112-121
1390,R(+) warfarin,drug,64-67;77-84
1510,loop diuretics,group,119-122;136-144
3519,loop diuretics,group,191-194;228-236
3520,potassium-sparing diuretics,group,197-213;228-236
4211,phenothiazines classes of antipsychotic agents,group,199-212;246-276
4212,thioxanthene classes of antipsychotic agents,group,215-226;246-276
