In [36]:
from rnaseq_lib.web import openfda_get_drugs_by_query
from rnaseq_lib.web import openfda_drug_label
from rnaseq_lib.web import _rget
from rnaseq_lib.utils import flatten
from rnaseq_lib.utils.expando import Expando
from rnaseq_lib.tissues import grep_cancer_terms
from rnaseq_lib.tissues import identify_tissue_from_str
from rnaseq_lib.tissues import validate_genes

import pandas as pd
import os
from collections import defaultdict

from progressbar import ProgressBar

# Table

1. Get list of drugs and targets from Cancerrxgene
2. Check for FDA listing
    1. Collect indications_of_usage, mechanism_of_action, openfda.brand_name, openfda.generic_name
3. Derive tissue from information

In [37]:
df = pd.read_excel('Cancerrxgene/screened-compounds.xlsx')
print df.shape; df.head()

(265, 5)


Unnamed: 0,Drug ID,Drug Name,Synonyms,Target,Target Pathway
0,1,Erlotinib,"Tarceva, RG-1415, CP-358774, OSI-774, Ro-50823...",EGFR,EGFR signaling
1,3,Rapamycin,"AY-22989, Sirolimus, WY-090217, Torisel, Rapamune",MTORC1,PI3K/MTOR signaling
2,5,Sunitinib,"Sutent, Sunitinib Malate, SU-11248","PDGFR, KIT, VEGFR, FLT3, RET, CSF1R",RTK signaling
3,6,PHA-665752,"PHA665752, PHA 665752",MET,RTK signaling
4,9,MG-132,"LLL cpd, MG 132, MG132","Proteasome, CAPN1",Protein stability and degradation


## Query OpenFDA
Query drug name in OpenFDA for additional information

In [38]:
df_path = 'Cancerrxgene/openfda.pandas'
try:
    df = pd.read_pickle(df_path)
except IOError:
    info = defaultdict(list)
    bar = ProgressBar()
    for i in bar(xrange(len(df))):
        row = df.iloc[i]

        # Collect all aliases for Drug
        aliases = row.Synonyms.split(', ') if type(row.Synonyms) == list else [str(row.Synonyms)]
        aliases = None if 'nan' in aliases[0].lower() else aliases
        drug = [str(row['Drug Name']).split()[0]]
        aliases =  drug + aliases if aliases else drug

        # Check if any aliases return OpenFDA results
        r = None
        for name in aliases:
            if not '-' in name:
                r = openfda_drug_label(str(name))
                if r: break

        # If no results
        features  = ['usage', 'mech_action', 'brand_name', 'generic_name']
        if not r:
            #print 'No query for: {}'.format(row['Drug Name'])
            for f in features:
                info[f].append(None)

        else:
            # Pull out result
            e = r.json(object_hook=Expando)['results'][0]

            for f, a in zip(features, ['indications_and_usage', 'mechanism_of_action', 
                             'openfda.brand_name', 'openfda.generic_name']):    
                try:
                    attribute = e[a] if not a.startswith('openfda') else e['openfda'][a.split('.')[1]]
                    info[f].append('. '.join(attribute).encode('utf-8'))
                except KeyError:
                    info[f].append(None)
    
    # Add values to DataFrame
    df['usage'] = info['usage']
    df['mech_action'] = info['mech_action']
    df['brand_name'] = info['brand_name']
    df['generic_name'] = info['generic_name']

    # Save
    df.to_pickle(df_path)

Drop rows whose generic or brand name does not match the drug name or synonyms

In [39]:
rows_to_drop = []
for i in xrange(len(df)):
    row = df.iloc[i]
    aliases = row.Synonyms.split(', ') if type(row.Synonyms) == list else [str(row.Synonyms)]
    drug = [str(row['Drug Name']).split()[0]]
    aliases =  drug + aliases if aliases else drug
    
    # If missing brand and generic name, drop
    if not row.generic_name and not row.brand_name:
        rows_to_drop.append(i)
        continue
    
    # If drug name or synonyms don't match, drop
    if not any([x for x in aliases if x.lower() in row.generic_name.lower()]):
        if not any([x for x in aliases if x.lower() in row.brand_name.lower()]):
            rows_to_drop.append(i)

In [40]:
df = df.drop(rows_to_drop)

Drop duplicate entries

In [41]:
df = df.drop_duplicates('Drug Name')

## Process and Expand Table

Replace rows that target multiple tissues with one row per tissue

In [42]:
records = []
for row in df.iterrows():
    entries = defaultdict(str)
    index, row = row
    for sentence in grep_cancer_terms(str(row['usage'])):
        for tissue in identify_tissue_from_str(sentence):
            entries[tissue] += sentence
    
    if entries:
        # Store new rows
        for entry in entries:
            new_row = row.copy()
            new_row['tissue'] = entry
            new_row['evidence'] = entries[entry]
            records.append(new_row)
    
    else:
        print 'No evidence found for any tissue found for: {}'.format(row['Drug Name'])

df = pd.DataFrame.from_records(records, columns=list(df.columns) + ['tissue', 'evidence'])

No evidence found for any tissue found for: CMK
No evidence found for any tissue found for: Pyrimethamine
No evidence found for any tissue found for: Bortezomib
No evidence found for any tissue found for: Bexarotene
No evidence found for any tissue found for: Ruxolitinib
No evidence found for any tissue found for: Belinostat
No evidence found for any tissue found for: Tretinoin
No evidence found for any tissue found for: Vorinostat
No evidence found for any tissue found for: Temozolomide


Drugs missing evidence were for either blood or bone cancer, or a non-cancer drug. 

### Fix Targets (genes)

Look for genes in **Target** and make one row per gene

In [49]:
records = []
for row in df.iterrows():
    index, row = row
    for gene in set(validate_genes(row.Target.split(', '))):
        new_row = row.copy()
        new_row['gene'] = gene.upper() if gene else gene
        records.append(new_row)

df = pd.DataFrame.from_records(records)

### Drop NA and Save

Drop all rows missing a valid "gene" target

In [52]:
df = df[~df.gene.isnull()]
print 'Number of drugs: {}'.format(len(df['Drug Name'].unique()))
print 'Number of unique gene targets: {}'.format(len(df.gene.unique()))
df.shape

Number of drugs: 30
Number of unique gene targets: 37


(108, 12)

In [53]:
df.to_csv('Cancerrxgene/cancerrxgene.processed.tsv', sep='\t')