In [1]:
import os
from xml.dom import minidom
import pandas as pd
import numpy as np
from pymongo import MongoClient
import db_con_mongo

Data source: SemEval-2013 Task 9 (DDIExtraction 2013) corpus, https://www.cs.york.ac.uk/semeval-2013/task9/index.php%3Fid=data.html

In [2]:
# Parse every XML file found in the directories of `file_locs` and collect:
#   sents        - one {'id', 'sent'} record per sentence whose text is not blank
#   entity_pairs - one record per <pair> element: its `ddi` attribute plus the id
#                  and text of both referenced entities, stored with the
#                  alphabetically smaller name first so that (a, b) and (b, a)
#                  land in the same columns.
file_locs = ['/home/grace/workspace/bio_dataset/semeval_task9_train_pair/Train/MedLine',
            '/home/grace/workspace/bio_dataset/semeval_task9_train_pair/Train/DrugBank']

entity_pairs = []
sents = []
for file_loc in file_locs:
    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # of the resulting tables reproducible between runs.
    for file_name in sorted(os.listdir(file_loc)):
        doc = minidom.parse(os.path.join(file_loc, file_name))

        for sent in doc.getElementsByTagName('sentence'):
            sent_id = sent.attributes['id'].value
            sent_text = sent.attributes['text'].value

            if sent_text.strip():
                sents.append({'id': sent_id, 'sent': sent_text})

            # Map entity id -> entity text once per sentence instead of
            # rescanning the entity list for every pair.
            names_by_id = {
                entity.attributes['id'].value: entity.attributes['text'].value
                for entity in sent.getElementsByTagName('entity')
            }

            # Original note: pairs only exist when the sentence has more than
            # one entity (drug); a sentence without <pair> elements is skipped.
            for pair_info in sent.getElementsByTagName('pair'):
                ddi = pair_info.attributes['ddi'].value
                e1 = pair_info.attributes['e1'].value
                e2 = pair_info.attributes['e2'].value

                # Look both names up explicitly. Previously a missing entity
                # silently reused the name left over from the previous pair.
                if e1 not in names_by_id or e2 not in names_by_id:
                    raise ValueError(
                        'pair in sentence %s references an unknown entity id: %s / %s'
                        % (sent_id, e1, e2))
                e1_name = names_by_id[e1]
                e2_name = names_by_id[e2]

                # Canonical order: smaller name first (equal names keep e2 first,
                # as before).
                if e1_name < e2_name:
                    first, second = (e1, e1_name), (e2, e2_name)
                else:
                    first, second = (e2, e2_name), (e1, e1_name)

                entity_pairs.append({'ddi': ddi,
                                     'e1_id': first[0],
                                     'e1_name': first[1],
                                     'e2_id': second[0],
                                     'e2_name': second[1]})

In [3]:
# Tabulate the collected pair records and remove rows that are exact
# duplicates, printing the table shape before and after.
ents = pd.DataFrame(entity_pairs)
print(ents.shape)

ents = ents.drop_duplicates()
print(ents.shape)

(27792, 5)
(27792, 5)


In [4]:
# Preview the first rows of the pair table.
ents.head()

Unnamed: 0,ddi,e1_id,e1_name,e2_id,e2_name
0,False,DDI-MedLine.d112.s0.e1,digoxin,DDI-MedLine.d112.s0.e0,dofetillide
1,False,DDI-MedLine.d112.s1.e1,digoxin,DDI-MedLine.d112.s1.e0,dofetilide
2,False,DDI-MedLine.d112.s2.e1,digoxin,DDI-MedLine.d112.s2.e0,dofetilide
3,False,DDI-MedLine.d112.s3.e1,digoxin,DDI-MedLine.d112.s3.e0,dofetilide
4,False,DDI-MedLine.d127.s6.e1,digoxin,DDI-MedLine.d127.s6.e0,thiazide diuretics


In [5]:
# Convert the `ddi` attribute strings to booleans.
# Going through str/lower makes the cell idempotent: comparing an already
# boolean column with 'true' would turn every label False on a second run.
ents['ddi'] = ents['ddi'].astype(str).str.lower() == 'true'

In [6]:
# List the distinct values present in the ddi column.
ents.ddi.unique()

array([False,  True])

In [7]:
# ents.loc[ents.e2_id.str.find('DDI-DrugBank.d493.s0.e0')>=0]

In [8]:
# Shape of the subset of rows where ddi is False.
ents.loc[ents.ddi==False].shape

(23772, 5)

In [9]:
# Shape of the subset of rows where ddi is True.
ents.loc[ents.ddi==True].shape

(4020, 5)

In [10]:
# Rows whose (e1_name, e2_name) combination repeats one seen in an earlier row.
duplicated_ents = ents.loc[ents.duplicated(subset=['e1_name', 'e2_name'])]
print(duplicated_ents)

         ddi                     e1_id              e1_name  \
2      False    DDI-MedLine.d112.s2.e1              digoxin   
3      False    DDI-MedLine.d112.s3.e1              digoxin   
11     False     DDI-MedLine.d74.s5.e1              glucose   
12     False     DDI-MedLine.d74.s6.e0             Arginine   
13     False     DDI-MedLine.d74.s6.e0             Arginine   
14     False     DDI-MedLine.d74.s6.e1              glucose   
15     False    DDI-MedLine.d74.s11.e1              glucose   
18     False     DDI-MedLine.d38.s3.e1        d-amphetamine   
20     False     DDI-MedLine.d38.s8.e1        d-amphetamine   
25     False      DDI-MedLine.d8.s1.e1             KRM-1648   
27      True      DDI-MedLine.d8.s6.e1             KRM-1648   
33     False      DDI-MedLine.d8.s6.e1             KRM-1648   
34     False      DDI-MedLine.d8.s6.e3            rifabutin   
35      True      DDI-MedLine.d8.s6.e4            ofloxacin   
36      True      DDI-MedLine.d8.s6.e4            oflox

In [11]:
# Initialise True_Pair_Exists: True for rows labelled ddi == True, False otherwise.
ents['True_Pair_Exists'] = (ents.ddi == True)

In [12]:
# ents.loc[ents.ddi==True]
# ents.loc[ents.ddi==False]

In [13]:
# Not identifiable by id alone.
# Show every row in which this entity id appears on either side of the pair.
ents.loc[ents[['e1_id', 'e2_id']].eq('DDI-MedLine.d8.s6.e1').any(axis=1)]

Unnamed: 0,ddi,e1_id,e1_name,e2_id,e2_name,True_Pair_Exists
27,True,DDI-MedLine.d8.s6.e1,KRM-1648,DDI-MedLine.d8.s6.e0,ofloxacin,True
31,False,DDI-MedLine.d8.s6.e1,KRM-1648,DDI-MedLine.d8.s6.e2,rifampicin,False
32,False,DDI-MedLine.d8.s6.e1,KRM-1648,DDI-MedLine.d8.s6.e3,rifabutin,False
33,False,DDI-MedLine.d8.s6.e1,KRM-1648,DDI-MedLine.d8.s6.e4,ofloxacin,False


In [14]:
# Flag negative rows (ddi == False) whose drug-name pair is labelled as a true
# interaction somewhere else in the table, in either order.
# Names are looked up by column label, not by position, and membership is
# tested against a set in one pass instead of scanning the whole frame twice
# for every positive row.
true_rows = ents.loc[ents.ddi == True]
forward_pairs = set(zip(true_rows.e1_name, true_rows.e2_name))
true_pairs = forward_pairs | {(second, first) for first, second in forward_pairs}

has_true_pair = np.array(
    [pair in true_pairs for pair in zip(ents.e1_name, ents.e2_name)],
    dtype=bool)
ents.loc[has_true_pair & (ents.ddi == False).to_numpy(), 'True_Pair_Exists'] = True

In [15]:
# Preview the first rows of the pair table after the True_Pair_Exists update.
ents.head()

Unnamed: 0,ddi,e1_id,e1_name,e2_id,e2_name,True_Pair_Exists
0,False,DDI-MedLine.d112.s0.e1,digoxin,DDI-MedLine.d112.s0.e0,dofetillide,False
1,False,DDI-MedLine.d112.s1.e1,digoxin,DDI-MedLine.d112.s1.e0,dofetilide,True
2,False,DDI-MedLine.d112.s2.e1,digoxin,DDI-MedLine.d112.s2.e0,dofetilide,True
3,False,DDI-MedLine.d112.s3.e1,digoxin,DDI-MedLine.d112.s3.e0,dofetilide,True
4,False,DDI-MedLine.d127.s6.e1,digoxin,DDI-MedLine.d127.s6.e0,thiazide diuretics,False


In [16]:
# Shape of the subset of rows where True_Pair_Exists is False.
ents.loc[ents.True_Pair_Exists==False].shape

(21798, 6)

In [17]:
## Inspect the DrugBank data stored in MongoDB

In [18]:
# Obtain a client from the project-local db_con_mongo helper and print its repr.
client = db_con_mongo.get_connection()
print(client)

MongoClient(host=['morpheus.snu.ac.kr:27017'], document_class=dict, tz_aware=False, connect=True)


In [19]:
# Print the client's `ddi` database handle and bind it to a name.
# NOTE(review): `ddi` is also the name the XML-parsing cell uses for the pair
# attribute string, so this rebinds it to a different kind of object.
print(client.ddi)
ddi = client.ddi

Database(MongoClient(host=['morpheus.snu.ac.kr:27017'], document_class=dict, tz_aware=False, connect=True), 'ddi')


In [20]:
# Handle to the 'drugbank' collection of the ddi database.
drugbank = ddi.get_collection('drugbank')

In [21]:
# Count the DrugBank documents that list more than one structured interaction.
# Cursor.count() is deprecated (removed in PyMongo 4) and count_documents()
# does not accept $where, so the "length > 1" test is expressed as "the element
# at array index 1 exists".
# NOTE(review): assumes structured_drug_interactions is always an array - confirm.
total_docs = drugbank.count_documents(
    {'structured_drug_interactions.1': {'$exists': True}})
cursor = total_docs  # the count was previously bound to `cursor` as well
print(total_docs)

  """Entry point for launching an IPython kernel.


1914


In [22]:
# Collect every structured drug-drug interaction of the selected DrugBank
# documents as an order-independent record: lower-cased drug names with the
# alphabetically smaller one in `drug1`, plus the interaction's evidence level.
drug_interactions = []

# Documents fetched per server round trip.
limit_num = 100

# Iterate a single cursor instead of paging with skip()/limit(): without a sort
# the order is not guaranteed to be stable between separate queries, so pages
# could overlap or miss documents, and the $where filter ran once per page.
cursor = drugbank.find(
    {'$where': 'this.structured_drug_interactions.length > 1'},
    {'term_names': 1, 'title': 1,
     'structured_drug_interactions.evidence_level': 1,
     'structured_drug_interactions.affected_drug': 1,
     'structured_drug_interactions.subject_drug': 1,
     }).batch_size(limit_num)

for ret_l in cursor:
    for interaction in ret_l['structured_drug_interactions']:
        affected = interaction['affected_drug']['name'].lower()
        subject = interaction['subject_drug']['name'].lower()
        # Smaller name first, so (a, b) and (b, a) produce the same record.
        drug1, drug2 = sorted((affected, subject))
        drug_interactions.append({'drug1': drug1,
                                  'drug2': drug2,
                                  'evidence_level': interaction['evidence_level']})

In [23]:
# Number of interaction records collected.
len(drug_interactions)

1321468

In [24]:
# Tabulate the interaction records, one row per record.
df_drug_interactions = pd.DataFrame(drug_interactions)

In [25]:
# Keep only the first occurrence of each identical interaction record.
df_drug_interactions = df_drug_interactions.drop_duplicates()

In [26]:
# Shape of the interaction table.
df_drug_interactions.shape

(731194, 3)

In [None]:
# Flag every DrugBank interaction whose drug pair is also labelled as a true
# interaction (ddi == True) in the corpus pair table.
# The earlier version combined df_drug_interactions columns with an `ents.ddi`
# mask (a different frame and index), repeated the same condition twice
# instead of reversing it, and compared case-sensitive corpus names with the
# lower-cased DrugBank names.
true_rows = ents.loc[ents.ddi == True]

# Same canonical form as drug1/drug2: lower-cased, smaller name first.
corpus_true_pairs = {
    tuple(sorted((first.lower(), second.lower())))
    for first, second in zip(true_rows.e1_name, true_rows.e2_name)
}

# A plain boolean column (False where no match) rather than True/NaN.
df_drug_interactions['True_Pair_Exists'] = [
    pair in corpus_true_pairs
    for pair in zip(df_drug_interactions.drug1, df_drug_interactions.drug2)
]

In [None]:
# Write the sentence records to CSV twice: once with a header row, once without.
sent_table = pd.DataFrame(sents)
sent_table.to_csv('./ddi_sent.csv', sep=',', index=None)
sent_table.to_csv('./ddi_sent_wo_header.csv', sep=',', header=None, index=None)

In [None]:
# Read the exported CSV back and preview its first rows.
pd.read_csv('./ddi_sent.csv', sep=',').head()

In [None]:
# Build the sentence table and show the rows whose sentence text is missing.
df_sent = pd.DataFrame(sents)
df_sent[df_sent['sent'].isna()]

In [None]:
# Shape of the sentence table.
df_sent.shape