# Create and Process Dataframe for MABs

Create a dataframe of Monoclonal Antibodies and their gene targets

In [119]:
from collections import defaultdict

import pandas as pd
import numpy as np

from progressbar import ProgressBar

from rnaseq_lib.web import openfda_drug_label
from rnaseq_lib.utils.expando import Expando

from rnaseq_lib.tissues import grep_cancer_terms
from rnaseq_lib.tissues import identify_tissue_from_str
from rnaseq_lib.tissues import validate_genes

## Download Table

In [112]:
# Location of the cached copy of the scraped table
raw_path = 'MAB-processing/raw.tsv'
try:
    # Use the cached copy when it exists
    df = pd.read_csv(raw_path, sep='\t', index_col=0)
except IOError:
    # No cache available: scrape the first table on the Wikipedia page
    url = 'https://en.wikipedia.org/wiki/List_of_therapeutic_monoclonal_antibodies'
    tables = pd.read_html(url, encoding='utf-8')
    df = tables[0]

    # The first scraped row holds the column names: promote it to the
    # header, then remove it from the data
    df.columns = df.iloc[0]
    df = df.drop(0, axis=0)

    # Store Target as UTF-8 encoded strings
    df.Target = df.Target.str.encode('utf-8')

    # Cache the table for later runs
    df.to_csv(raw_path, sep='\t')

In [113]:
# Preview the first rows of the table
df.head()

Unnamed: 0,Name,Trade name,Type,Source,Target,Use
1,3F8,,mab,mouse,GD2 ganglioside,neuroblastoma
2,8H9[1],,mab,mouse,B7-H3,"neuroblastoma, sarcoma, metastatic brain cancers"
3,Abagovomab[2],,mab,mouse,CA-125 (imitation),ovarian cancer
4,Abciximab,ReoPro,Fab,chimeric,CD41 (integrin alpha-IIb),platelet aggregation inhibitor
5,Abituzumab[3],,mab,humanized,CD51,cancer


## Query OpenFDA for Drug

In [114]:
# Location of the cached, OpenFDA-annotated table
df_path = 'MAB-processing/openfda.pandas'

# Columns added to the table, and the key of the OpenFDA label result each
# one is read from ('openfda.X' means key X of the nested 'openfda' object)
features = ['usage', 'mech_action', 'brand_name', 'generic_name']
label_fields = ['indications_and_usage', 'mechanism_of_action',
                'openfda.brand_name', 'openfda.generic_name']

try:
    # Reuse earlier query results when they were cached
    df = pd.read_pickle(df_path)
except IOError:
    info = defaultdict(list)
    bar = ProgressBar()
    for i in bar(xrange(len(df))):
        row = df.iloc[i]

        # Process drug name: strip parenthetical notes and citation markers
        name = row.Name.split(' (')[0].split('[')[0]

        # Add trade names as aliases. isinstance(..., basestring) accepts both
        # str and unicode values, so trade names are kept whether the table was
        # freshly scraped or re-read from the TSV cache; a missing trade name
        # (NaN) is not a string and contributes no alias.
        trade_name = row['Trade name']
        if isinstance(trade_name, basestring):
            aliases = [name] + trade_name.split(', ')
        else:
            aliases = [name]

        # Check if any aliases return OpenFDA results (hyphenated aliases are skipped)
        r = None
        for alias in aliases:
            if '-' not in alias:
                r = openfda_drug_label(str(alias))
                if r:
                    break

        # If no results record None
        if not r:
            for f in features:
                info[f].append(None)

        else:
            # Pull out the first result
            e = r.json(object_hook=Expando)['results'][0]

            for f, a in zip(features, label_fields):
                try:
                    attribute = e[a] if not a.startswith('openfda') else e['openfda'][a.split('.')[1]]
                    info[f].append('. '.join(attribute).encode('utf-8'))
                except KeyError:
                    # Field absent from this label
                    info[f].append(None)

    # Add values to DataFrame
    for f in features:
        df[f] = info[f]

    # Save
    df.to_pickle(df_path)

## Validate Match and Drop NA
Drop rows that have neither a generic nor a brand name from OpenFDA, and rows where neither of those names contains the drug name or one of its trade names

In [115]:
# Reset index so the labels collected below are unique row positions
df = df.reset_index(drop=True)
rows_to_drop = []
for i, row in df.iterrows():
    # Drug name without parenthetical notes or citation markers
    name = row.Name.split(' (')[0].split('[')[0]

    # Trade names count as aliases; accept str as well as unicode values,
    # a missing trade name (NaN) is not a string and adds nothing
    trade_name = row['Trade name']
    if isinstance(trade_name, basestring):
        aliases = [name] + trade_name.split(', ')
    else:
        aliases = [name]
    # Compare case-insensitively; an empty alias can never count as a match
    aliases = [x.lower() for x in aliases if x]

    # A missing name is treated as an empty string so that a row with only
    # one of the two names present can still be checked without an error
    generic = row.generic_name.lower() if isinstance(row.generic_name, basestring) else ''
    brand = row.brand_name.lower() if isinstance(row.brand_name, basestring) else ''

    # If missing brand and generic name, drop
    if not generic and not brand:
        rows_to_drop.append(i)
        continue

    # If drug name or synonyms don't match either name, drop
    if not any(x in generic or x in brand for x in aliases):
        rows_to_drop.append(i)

# Drop rows
df = df.drop(rows_to_drop)
df = df.reset_index(drop=True)

## Match Drug to Tissue
Replace rows that target multiple tissues with one row per tissue

In [116]:
records = []
for row in df.iterrows():
    entries = defaultdict(str)
    index, row = row
    for sentence in grep_cancer_terms(str(row['usage'])):
        for tissue in identify_tissue_from_str(sentence):
            entries[tissue] += sentence
    
    if entries:
        # Store new rows
        for entry in entries:
            new_row = row.copy()
            new_row['tissue'] = entry
            new_row['evidence'] = entries[entry]
            records.append(new_row)
    
    else:
        print 'No evidence found for any tissue found for: {}'.format(row['Name'])

df = pd.DataFrame.from_records(records, columns=list(df.columns) + ['tissue', 'evidence'])

No evidence found for any tissue found for: Abciximab
No evidence found for any tissue found for: Adalimumab
No evidence found for any tissue found for: Alirocumab[13]
No evidence found for any tissue found for: Atlizumab (= tocilizumab)
No evidence found for any tissue found for: Basiliximab
No evidence found for any tissue found for: Belimumab
No evidence found for any tissue found for: Bezlotoxumab[25]
No evidence found for any tissue found for: Brentuximab vedotin[33]
No evidence found for any tissue found for: Brodalumab[35]
No evidence found for any tissue found for: Canakinumab[36]
No evidence found for any tissue found for: Daclizumab
No evidence found for any tissue found for: Daratumumab[43]
No evidence found for any tissue found for: Dinutuximab[47]
No evidence found for any tissue found for: Dupilumab[52]
No evidence found for any tissue found for: Eculizumab[17]
No evidence found for any tissue found for: Elotuzumab
No evidence found for any tissue found for: Evolocumab[26

### Fix Targets (genes)

Look for valid genes in **Target** (only the part before the first `/` is checked) and make one row per validated gene

In [128]:
records = []
for index, row in df.iterrows():
    # Only the text before the first '/' in Target is passed to validate_genes
    first_target = row.Target.split('/')[0]
    validated = set(validate_genes([first_target]))

    # One copy of the row per distinct validated value; falsy values are
    # stored unchanged, everything else is upper-cased
    for gene in validated:
        new_row = row.copy()
        new_row['gene'] = gene.upper() if gene else gene
        records.append(new_row)

df = pd.DataFrame.from_records(records)

### Drop NA and Save

Drop all rows missing a valid "gene" target

In [131]:
df = df[~df.gene.isnull()]
print 'Number of drugs: {}'.format(len(df.Name.unique()))
print 'Number of unique gene targets: {}'.format(len(df.gene.unique()))
df.shape

Number of drugs: 21
Number of unique gene targets: 12


(40, 13)

In [132]:
# Write the processed table to disk as a tab-separated file
df.to_csv('MAB-processing/mab.processed.tsv', sep='\t')