In [4]:
import urllib.request, urllib.error, urllib.parse
import json
import os
import pandas as pd
import time

In [5]:
pd.set_option('display.max_colwidth', 300)

In [16]:
basedir = '/scratch/ias41/ae_code/prev_reported_safety_associations'
with open('/scratch/ias41/ucc-fs-nethome/bioportal_api_key.txt', 'r') as f:
    API_KEY = f.read().strip('\n')

In [7]:
def get_json(url):
    opener = urllib.request.build_opener()
    opener.addheaders = [('Authorization', 'apikey token=' + API_KEY)]
    return json.loads(opener.open(url).read())

In [8]:
REST_URL = "http://data.bioontology.org"
def get_bioportal_annotations(text):
    """Return codes of MedDRA terms annotated by BioPortal Annotate Service.
    kwargs: text -- test to annotate"""
    
    annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(text) + '&ontologies=MEDDRA&longest_only=true')
    
    identifiers = set()
    for result in annotations:
        identifier = result['annotatedClass']['@id'].split('/')[-1]
        identifiers.add(identifier)
    
    return(identifiers)

In [9]:
# Open previously formatted adverse effects extracted from publications
formatted_effects = pd.read_excel(basedir + '/data/safety_original2formatted_effects.xls')

In [10]:
formatted_effects.head()

Unnamed: 0,Accession,ChEMBL target name,Original effect,Formatted effect,PubMed ID,Reference
0,P02708,Acetylcholine receptor protein alpha chain,Effects on muscular functions,Effects on muscular functions,PMID16243262,"Whitebread S, Hamon J, Bojanic D, Urban L. Keynote review: in vitro safety pharmacology profiling: an essential tool for successful drug development. Drug Discov Today. 2005 Nov 1;10(21):1421-33."
1,P02708,Acetylcholine receptor protein alpha chain,Palpitation,Palpitation,PMID16243262,"Whitebread S, Hamon J, Bojanic D, Urban L. Keynote review: in vitro safety pharmacology profiling: an essential tool for successful drug development. Drug Discov Today. 2005 Nov 1;10(21):1421-33."
2,P02708,Acetylcholine receptor protein alpha chain,Stimulates autonomic cardiovascular functions,Stimulates autonomic cardiovascular functions,PMID16243262,"Whitebread S, Hamon J, Bojanic D, Urban L. Keynote review: in vitro safety pharmacology profiling: an essential tool for successful drug development. Drug Discov Today. 2005 Nov 1;10(21):1421-33."
3,P02708,Acetylcholine receptor protein alpha chain,Stimulates gastrointestinal functions,Stimulates gastrointestinal functions,PMID16243262,"Whitebread S, Hamon J, Bojanic D, Urban L. Keynote review: in vitro safety pharmacology profiling: an essential tool for successful drug development. Drug Discov Today. 2005 Nov 1;10(21):1421-33."
4,P02708,Acetylcholine receptor protein alpha chain,bronchial secretion,Bronchial secretion,PMID16243262,"Whitebread S, Hamon J, Bojanic D, Urban L. Keynote review: in vitro safety pharmacology profiling: an essential tool for successful drug development. Drug Discov Today. 2005 Nov 1;10(21):1421-33."


In [11]:
# find all unique AEs occurring
all_aes = set(list(formatted_effects['Formatted effect']))

In [12]:
%%time
# Find annotations from BioPortal
ae_tuples = list()

for effect in all_aes:
    annotations = get_bioportal_annotations(effect)
    time.sleep(0.02)
    if len(annotations) < 1:
        ae_tuples.append((effect, 'Nothing found'))
        continue
    for annotation in annotations:
        ae_tuples.append((effect, annotation))

CPU times: user 1.02 s, sys: 157 ms, total: 1.17 s
Wall time: 5min 35s


In [13]:
ae_df = pd.DataFrame.from_records(ae_tuples)
ae_df.columns = ['Original formatted effect', 'MedDRA ID']
len(ae_df)

523

In [17]:
# Prepare file for submitting to MedDRA browser hierarchy analysis
# Save to file for hierarchy analysis in MedDRA to find LLT > PT
ae_df_mapped = ae_df.loc[(ae_df['MedDRA ID']!='Nothing found')&(~ae_df['MedDRA ID'].str.contains('T')),:]
ae_df_mapped.columns=['Row ID', 'Code']
ae_df_mapped['Term'] = ''
ae_df_mapped[['Row ID', 'Term', 'Code']].to_excel(basedir + '/data/formatted_effects_BioPortal_annotated.xls', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [15]:
# Number of terms with at least one annotation
len(ae_df_mapped['Row ID'].drop_duplicates())

309

In [14]:
# Do MedDRA hierarchy analysis in MedDRA Web-based browser (WBB) and manual inspection of the results