# Combine Tables
Combine drug / gene tables

In [97]:
import os
from itertools import combinations

import pandas as pd

from rnaseq_lib.utils import mkdir_p, flatten
from rnaseq_lib.tissues.plots import gene_expression_boxplot

## Process Inputs

In [49]:
# Both processed tables are tab-separated; the first column is used as the row index.
tsv_options = dict(sep='\t', index_col=0)
mab = pd.read_csv('MAB-processing/mab.processed.tsv', **tsv_options)
crg = pd.read_csv('Cancerrxgene/cancerrxgene.processed.tsv', **tsv_options)

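Drop entries from `mab` whose `generic_name` also appears in `crg`, so that each drug is only represented once in the combined table.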

In [50]:
# Generic names present in both tables. Missing names (NaN) are excluded so that
# rows without a name are never treated as duplicates of each other.
duplicates = set(mab.generic_name.dropna()).intersection(crg.generic_name.dropna())

# Remove the shared drugs from `mab` with a single boolean mask. Filtering on the
# column values (rather than dropping by index label) cannot remove unrelated rows
# if the index happens to contain repeated labels, and avoids copying the frame
# once per duplicate.
mab = mab[~mab.generic_name.isin(list(duplicates))]

In [51]:
# Renumber the remaining rows of `mab` from 0; drop=True discards the old index
# instead of keeping it as a column.
mab = mab.reset_index(drop=True)

Combine the two tables into a single dataframe.

In [61]:
# Stack the two tables row-wise. `mab` was renumbered from 0 but `crg` still has
# the index it was loaded with, so the labels may overlap; ignore_index=True gives
# the combined frame a fresh 0..n-1 index so every row has a unique label.
df = pd.concat([mab, crg], axis=0, ignore_index=True)

Drop columns

In [62]:
# Columns removed from the combined table.
unused_columns = [
    'Drug ID',
    'Drug Name',
    'Name',
    'Synonyms',
    'Use',
    'Trade name',
    'Type',
]
df = df.drop(unused_columns, axis=1)
df.head()

Unnamed: 0,Source,Target,Target Pathway,brand_name,evidence,gene,generic_name,mech_action,tissue,usage
0,humanized,CD52,,Campath,1 INDICATIONS AND USAGE Campath is indicated a...,CD52,ALEMTUZUMAB,12.1 Mechanism of Action Campath binds to CD52...,Blood,1 INDICATIONS AND USAGE Campath is indicated a...
1,humanized,CD274,,TECENTRIQ,1) Metastatic non-small cell lung cancer who h...,CD274,ATEZOLIZUMAB,12.1 Mechanism of Action PD-L1 may be expresse...,Lung,1 INDICATIONS AND USAGE TECENTRIQ is a program...
2,humanized,CD274,,TECENTRIQ,1 INDICATIONS AND USAGE TECENTRIQ is a program...,CD274,ATEZOLIZUMAB,12.1 Mechanism of Action PD-L1 may be expresse...,Bladder,1 INDICATIONS AND USAGE TECENTRIQ is a program...
3,human,CD274,,BAVENCIO,1 INDICATIONS AND USAGE BAVENCIO is a programm...,CD274,AVELUMAB,12.1 Mechanism of Action PD-L1 may be expresse...,Skin-Head,1 INDICATIONS AND USAGE BAVENCIO is a programm...
4,human,CD274,,BAVENCIO,1) Patients with locally advanced or metastati...,CD274,AVELUMAB,12.1 Mechanism of Action PD-L1 may be expresse...,Bladder,1 INDICATIONS AND USAGE BAVENCIO is a programm...


In [63]:
# Create the output directory through mkdir_p, then write the combined table
# (row index included) as a tab-separated file.
combined_tsv = 'combined-table/combined.tsv'
mkdir_p(os.path.dirname(combined_tsv))
df.to_csv(combined_tsv, sep='\t')

## Get DESeq2 Adjusted P-Value and Log2 Fold Change

For each row, look up the target gene in the DESeq2 differential expression results (GTEx vs. TCGA-Tumor) for that row's tissue.

In [157]:
%%time
d2_results = os.path.join(os.path.dirname(os.getcwd()), 'DESeq2-Runs')

pval, l2fc, rank = [], [], []
for row in df.iterrows():
    # Get tissue and grab TSV
    p, fc, r = None, None, None
    i, row = row
    tsv = os.path.join(d2_results, row.tissue, row.tissue + '.tsv')
    if os.path.exists(tsv):
        with open(tsv, 'r') as f:
            for i, line in enumerate(f):
                line = line.split()
                if line[0] == row.gene:
                    fc = float(line[2])
                    p = float(line[-1].strip())
                    r = i
    # Store values
    pval.append(p)
    l2fc.append(fc)
    rank.append(r)  

# Add columns to dataframe
df['pval_adj'] = pval
df['l2fc'] = l2fc
df['pval_rank'] = rank

CPU times: user 59.6 s, sys: 12 ms, total: 59.6 s
Wall time: 59.6 s


Save

In [158]:
# Write the table, now including the pval_adj / l2fc / pval_rank columns, as a
# tab-separated file without the row index.
df.to_csv('combined-table/combined-deseq2.tsv', sep='\t', index=False)

## Gephi Output
Gephi is a graph visualization tool

In [143]:
# Create node information. Gephi requires an "Id" and "Label" header
# One node per distinct (label, type) pair, where type is 'tissue', 'gene' or 'drug'.
node_sources = (
    ('tissue', df.tissue),
    ('gene', df.gene),
    ('drug', df.generic_name),
)

# De-duplicate while keeping first-appearance order so that node Ids are the same
# on every run (iterating a set gives no ordering guarantee). Plain loops are used
# instead of `map(...) + map(...)`, which fails on Python 3 where map() returns
# an iterator rather than a list.
seen_nodes = set()
node_records = []
for node_type, labels in node_sources:
    for label in labels.tolist():
        record = (label, node_type)
        if record not in seen_nodes:
            seen_nodes.add(record)
            node_records.append(record)

nodes = pd.DataFrame.from_records(node_records, columns=('Label', 'Type'))
# The positional index doubles as the Gephi node Id
nodes.index.name = 'Id'

In [144]:
# Edges require a "Source" and "Target" label
# Map every (Label, Type) pair to its node Id once, instead of scanning the
# `nodes` frame several times per row. Keying on the type as well as the label
# keeps a tissue, gene and drug that share the same text from being confused.
node_ids = {
    (label, node_type): int(node_id)
    for node_id, label, node_type in zip(nodes.index, nodes.Label, nodes.Type)
}

# Each row contributes three edges of weight 1: tissue-gene, tissue-drug and
# drug-gene. A set is used so repeated combinations are only emitted once.
edges = set()
for tissue, gene, drug in zip(df.tissue, df.gene, df.generic_name):
    tissue_id = node_ids[(tissue, 'tissue')]
    gene_id = node_ids[(gene, 'gene')]
    drug_id = node_ids[(drug, 'drug')]
    edges.add((tissue_id, gene_id, 1))
    edges.add((tissue_id, drug_id, 1))
    edges.add((drug_id, gene_id, 1))

# Sorted so the edge file is written in the same order on every run
edges = pd.DataFrame.from_records(sorted(edges), columns=('Source', 'Target', 'Weight'))

Save files

In [145]:
# Nodes: the dataframe index is written as the "Id" column.
nodes.to_csv('combined-table/gephi-nodes.csv', index=True, index_label='Id')
# Edges: only the Source / Target / Weight columns are written, without the index.
edges.to_csv('combined-table/gephi-edges.csv', index=False)

Make Tissue -> Gene dict

In [81]:
# Map each tissue to the array of distinct genes listed for it.
tg = {tissue: genes.unique() for tissue, genes in df.groupby('tissue').gene}

In [152]:
# Distinct genes listed for the Lung tissue, joined with '","' as the separator.
lung_genes = df.loc[df.tissue == 'Lung', 'gene'].unique()
'","'.join(lung_genes)

'CD274","MMRN2","EGFR","PDCD1","KDR","MET","ALK","ROS1","TOP2A","ERBB2","MAP2K2","PPP4R3A'

In [154]:
# Rows whose log2 fold change is greater than 3.
df.loc[df['l2fc'] > 3]

Unnamed: 0,Source,Target,Target Pathway,brand_name,evidence,gene,generic_name,mech_action,tissue,usage,pval_adj,l2fc
57,,TOP2,DNA replication,Etoposide,Small Cell Lung Cancer Etoposide Injection US...,TOP2A,ETOPOSIDE,,Lung,INDICATIONS AND USAGE Etoposide Injection USP ...,0.0,3.914468
88,,"VEGFR, MET, RET, KIT, FLT1, FLT3, FLT4, TIE2,AXL","Other, kinases",COMETRIQ,1 INDICATIONS AND USAGE COMETRIQ is indicated ...,MET,CABOZANTINIB,12.1 Mechanism of Action In vitro biochemical ...,Thyroid,1 INDICATIONS AND USAGE COMETRIQ is indicated ...,0.0,3.537572
