In [2]:
import pandas as pd
import numpy as np

# Make TPM matrix

## Experiments list

In [3]:
# Tab-separated sample sheet; its "sample" column is used below to locate
# each sample's kallisto output directory.
experiments = pd.read_csv(
    "../data/intermediate/experiment_setup.txt",
    sep="\t",
).assign(
    # Path of the abundance table written for each sample.
    kallisto_path=lambda setup: (
        "../data/intermediate/kallisto_quant/" + setup["sample"] + "/abundance.tsv"
    )
)

def load_tpms(exp_path, exp_name):
    """Read the ``tpm`` column of one tab-separated abundance table.

    Parameters
    ----------
    exp_path
        Path of the tab-separated file; its first column becomes the index.
    exp_name
        Name given to the returned Series (passed to ``Series.rename``).

    Returns
    -------
    pandas.Series
        The ``tpm`` column as ``float64``, indexed by the file's first column
        and named ``exp_name``.
    """
    quant_table = pd.read_csv(exp_path, sep="\t", index_col=0)

    return quant_table["tpm"].rename(exp_name).astype(np.float64)

## Merge TPMs

In [4]:
# One TPM Series per experiment, each named after its sample.
per_sample_tpms = [
    load_tpms(quant_path, sample_name)
    for quant_path, sample_name in zip(experiments["kallisto_path"], experiments["sample"])
]

# Put the samples side by side: one column per sample, rows aligned on the
# index of the abundance tables.
merged_tpms = pd.concat(per_sample_tpms, axis=1)

merged_tpms.to_csv("../data/processed/tpm_matrix.txt", sep="\t")

# Transcripts to genes

In [29]:
# Transcript-to-gene annotation table. The formatted ids have the form
# "<hgnc_symbol>_<ensembl id>".
t2g = (
    pd.read_csv("../data/intermediate/sleuth_diff/ensembl_t2g.csv")
    .assign(
        format_transcript_id=lambda ann: ann["hgnc_symbol"] + "_" + ann["ensembl_transcript_id"],
        format_gene_id=lambda ann: ann["hgnc_symbol"] + "_" + ann["ensembl_gene_id"],
    )
    # Keep the first row seen for each transcript, then discard rows that
    # ended up without a formatted transcript id.
    .drop_duplicates(subset=["ensembl_transcript_id"], keep="first")
    .dropna(subset=["format_transcript_id"])
)

# Lookup tables used by the cells below:
#   transcript_map: transcript id -> formatted transcript id
#   gene_name_map:  transcript id -> hgnc symbol
#   gene_id_map:    gene id       -> formatted gene id
transcript_ids = t2g["ensembl_transcript_id"]
transcript_map = dict(zip(transcript_ids, t2g["format_transcript_id"]))
gene_name_map = dict(zip(transcript_ids, t2g["hgnc_symbol"]))
gene_id_map = dict(zip(t2g["ensembl_gene_id"], t2g["format_gene_id"]))

In [30]:
# Relabel each transcript with its formatted id; ids that are missing from
# transcript_map keep their original id behind an "unnamed_" prefix.
transcript_labels = merged_tpms.index.map(
    lambda transcript_id: transcript_map.get(transcript_id, "unnamed_" + transcript_id)
)

# Work on a deep copy so merged_tpms keeps its original index.
named_tpms = merged_tpms.copy(deep=True)
named_tpms.index = transcript_labels
# Transpose so the relabelled transcripts become the columns.
named_tpms = named_tpms.T

named_tpms.to_hdf(
    "../data/processed/tpm_matrix_named.hdf",
    key="named_tpms",
    mode="w",
)

In [32]:
# Gene id per transcript, restricted (inner join on the row index) to the
# transcripts that appear both in the TPM matrix and in t2g.
transcript_gene_ids = t2g.set_index("ensembl_transcript_id")["ensembl_gene_id"]
align_ensembl_genes = merged_tpms.align(transcript_gene_ids, axis=0, join="inner")[1]

# Sum the TPMs of all transcripts sharing a gene id, then transpose so the
# genes become the columns.
gene_tpms = merged_tpms.groupby(align_ensembl_genes).sum().T
# Strict lookup: a gene id missing from gene_id_map raises KeyError.
gene_tpms.columns = gene_tpms.columns.map(lambda gene_id: gene_id_map[gene_id])

gene_tpms.to_hdf(
    "../data/processed/tpm_matrix_genes.hdf",
    key="gene_tpms",
    mode="w",
)

# Sleuth outputs

In [38]:
# Load the four sleuth result tables; the second CSV column becomes the
# index and is used below as the transcript id.
rpl22_oe = pd.read_csv("../data/intermediate/sleuth_diff/rpl22_oe.csv", index_col=1)
rpl22l1_oe = pd.read_csv("../data/intermediate/sleuth_diff/rpl22l1_oe.csv", index_col=1)
sh704 = pd.read_csv("../data/intermediate/sleuth_diff/sh704.csv", index_col=1)
sh705 = pd.read_csv("../data/intermediate/sleuth_diff/sh705.csv", index_col=1)

# Add the formatted transcript id to every table; ids missing from
# transcript_map are kept behind an "unnamed_" prefix.
for sleuth_table in (rpl22_oe, rpl22l1_oe, sh704, sh705):
    sleuth_table["format_transcript_id"] = sleuth_table.index.map(
        lambda transcript_id: transcript_map.get(transcript_id, "unnamed_" + transcript_id)
    )

## Add control-treatment TPMs

In [39]:
# Sample columns for each comparison: the first three entries are the
# GFP / shLuc replicates, the last three are the replicates of the other
# condition in that comparison.
rpl22_oe_samples = (
    ["LNCaP_GFP_1", "LNCaP_GFP_2", "LNCaP_GFP_3"]
    + ["LNCaP_RPL22_1", "LNCaP_RPL22_2", "LNCaP_RPL22_3"]
)

rpl22l1_oe_samples = (
    ["CAL851_GFP_1", "CAL851_GFP_2", "CAL851_GFP_3"]
    + ["CAL851_RPL22L1_1", "CAL851_RPL22L1_2", "CAL851_RPL22L1_3"]
)

# Both sh704 and sh705 are paired with the same shLuc replicates.
lncap_shluc_samples = ["LNCaP_shLuc_1", "LNCaP_shLuc_2", "LNCaP_shLuc_3"]

sh704_samples = lncap_shluc_samples + ["LNCaP_sh704_1", "LNCaP_sh704_2", "LNCaP_sh704_3"]

sh705_samples = lncap_shluc_samples + ["LNCaP_sh705_1", "LNCaP_sh705_2", "LNCaP_sh705_3"]

# For each sleuth table, pull the TPM columns of the samples in its
# comparison, with rows selected by (and ordered like) the table's index.
rpl22_oe_tpms = merged_tpms.loc[rpl22_oe.index, rpl22_oe_samples]
rpl22l1_oe_tpms = merged_tpms.loc[rpl22l1_oe.index, rpl22l1_oe_samples]
sh704_tpms = merged_tpms.loc[sh704.index, sh704_samples]
sh705_tpms = merged_tpms.loc[sh705.index, sh705_samples]

# Append the sample TPMs as extra columns. Any sample columns left over from
# a previous run of this cell are dropped first, so re-running the cell
# replaces them instead of adding a second, duplicate set of columns.
rpl22_oe = pd.concat(
    [rpl22_oe.drop(columns=rpl22_oe_samples, errors="ignore"), rpl22_oe_tpms],
    axis=1,
)
rpl22l1_oe = pd.concat(
    [rpl22l1_oe.drop(columns=rpl22l1_oe_samples, errors="ignore"), rpl22l1_oe_tpms],
    axis=1,
)
sh704 = pd.concat(
    [sh704.drop(columns=sh704_samples, errors="ignore"), sh704_tpms],
    axis=1,
)
sh705 = pd.concat(
    [sh705.drop(columns=sh705_samples, errors="ignore"), sh705_tpms],
    axis=1,
)

## Add gene names

In [40]:
# Add the gene symbol of each transcript to every sleuth table; transcripts
# missing from gene_name_map get NaN.
for sleuth_table in (rpl22_oe, rpl22l1_oe, sh704, sh705):
    sleuth_table["gene_name"] = sleuth_table.index.map(
        lambda transcript_id: gene_name_map.get(transcript_id, np.nan)
    )

## Export to tab-separated text files

In [41]:
# Write each merged sleuth/TPM table as a tab-separated text file.
merged_exports = [
    (rpl22_oe, "../data/processed/kallisto_sleuth_merge/rpl22_oe.txt"),
    (rpl22l1_oe, "../data/processed/kallisto_sleuth_merge/rpl22l1_oe.txt"),
    (sh704, "../data/processed/kallisto_sleuth_merge/sh704.txt"),
    (sh705, "../data/processed/kallisto_sleuth_merge/sh705.txt"),
]

for sleuth_table, output_path in merged_exports:
    sleuth_table.to_csv(output_path, sep="\t")