In [1]:
import pandas as pd
import numpy as np

# Make TPM matrix

## Experiments list

In [2]:
# Read the tab-separated experiment table and add, for every row, the path of
# the kallisto abundance file built from that row's "sample" value.
experiments = pd.read_csv(
    "../data/intermediate/experiment_setup.txt",
    sep="\t",
).assign(
    kallisto_path=lambda setup: (
        "../data/intermediate/kallisto_quant/" + setup["sample"] + "/abundance.tsv"
    )
)

def load_tpms(exp_path, exp_name):
    """Read one tab-separated abundance table and return its ``tpm`` column.

    Parameters
    ----------
    exp_path : str or file-like
        Location of the abundance table; its first column becomes the index.
    exp_name : str
        Name assigned to the returned Series.

    Returns
    -------
    pandas.Series
        The ``tpm`` column cast to float64 and named ``exp_name``.
    """
    quant_table = pd.read_csv(exp_path, sep="\t", index_col=0)
    return quant_table["tpm"].rename(exp_name).astype(np.float64)

## Merge TPMs

In [3]:
# Pair every abundance file path with the sample name it belongs to.
exps = zip(experiments["kallisto_path"], experiments["sample"])

# One TPM Series per sample, each named after its sample.
per_sample_tpms = [load_tpms(quant_path, sample) for quant_path, sample in exps]

# Place the Series side by side: one row per index entry, one column per sample.
merged_tpms = pd.concat(per_sample_tpms, axis=1)

merged_tpms.to_csv("../data/processed/tpm_matrix.txt", sep="\t")

## Format for SUPPA

In [62]:
def _before_first_dot(transcript_id):
    """Return the part of ``transcript_id`` that precedes its first "."."""
    return transcript_id.partition(".")[0]


# Work on an independent copy so the merged matrix keeps its original index.
suppa_tpms = merged_tpms.copy()

# Keep only the text before the first "." in each ID
# (presumably this strips the transcript version suffix -- confirm).
suppa_tpms.index = suppa_tpms.index.map(_before_first_dot)

# index_label=False leaves the index column without a header field.
suppa_tpms.to_csv(
    "../data/processed/tpm_matrix_suppa.txt",
    sep="\t",
    index_label=False,
)

# Transcripts to genes

In [27]:
# The mapping file is read without a header row; the three columns are named here.
t2g = pd.read_csv(
    "../data/raw/kallisto_homo_sapiens/transcripts_to_genes.txt",
    sep="\t",
    names=["transcript_id", "gene_id", "gene_name"],
)

# Display labels of the form "<gene_name>_<id>" for transcripts and genes.
t2g = t2g.assign(
    format_transcript_id=lambda table: table["gene_name"] + "_" + table["transcript_id"],
    format_gene_id=lambda table: table["gene_name"] + "_" + table["gene_id"],
)

# Lookup dictionaries from raw ID to display label
# (when an ID occurs more than once, its last row wins).
transcript_map = t2g.set_index("transcript_id")["format_transcript_id"].to_dict()

gene_id_map = t2g.set_index("gene_id")["format_gene_id"].to_dict()

In [46]:
def _transcript_label(transcript_id):
    """Look up the display label of ``transcript_id``; unknown IDs get an "unnamed_" prefix."""
    return transcript_map.get(transcript_id, "unnamed_" + transcript_id)


# Relabel an independent copy of the matrix, then flip it so that samples
# become rows and labelled transcripts become columns.
named_tpms = merged_tpms.copy()
named_tpms.index = named_tpms.index.map(_transcript_label)
named_tpms = named_tpms.transpose()

named_tpms.to_hdf(
    "../data/processed/tpm_matrix_named.hdf",
    key="named_tpms",
    mode="w",
)

In [45]:
# Transcript -> gene_id lookup, inner-joined on the matrix index so that only
# transcripts present in both are kept. Only the aligned Series is needed.
transcript_gene_ids = t2g.set_index("transcript_id")["gene_id"]
align_ensembl_genes = merged_tpms.align(transcript_gene_ids, axis=0, join="inner")[1]

# Sum the TPMs of all transcripts that share a gene_id, then transpose so that
# samples are rows and genes are columns.
gene_tpms = merged_tpms.groupby(align_ensembl_genes).sum().transpose()

# Swap each gene_id for its display label; a missing ID raises KeyError.
gene_tpms.columns = gene_tpms.columns.map(gene_id_map.__getitem__)

gene_tpms.to_hdf(
    "../data/processed/tpm_matrix_genes.hdf",
    key="gene_tpms",
    mode="w",
)

# Sleuth outputs

In [39]:
def _read_sleuth_table(csv_path):
    """Load one sleuth CSV indexed by its second column and add display labels.

    The added ``format_transcript_id`` column holds the ``transcript_map`` label
    of each index entry, or "unnamed_" + the entry when it has no label.
    """
    table = pd.read_csv(csv_path, index_col=1)
    table["format_transcript_id"] = table.index.map(
        lambda transcript_id: transcript_map.get(transcript_id, "unnamed_" + transcript_id)
    )
    return table


rpl22_oe = _read_sleuth_table("../data/intermediate/kallisto_sleuth/RPL22_oe.csv")
rpl22l1_oe = _read_sleuth_table("../data/intermediate/kallisto_sleuth/RPL22L1_oe.csv")
sh704 = _read_sleuth_table("../data/intermediate/kallisto_sleuth/sh704.csv")
sh705 = _read_sleuth_table("../data/intermediate/kallisto_sleuth/sh705.csv")

In [None]:
# Scratch cell that was never executed (no execution count).
# It lists the 21 distinct sample names that the following cell groups into
# per-comparison lists. Each line is a bare expression statement, so running
# this cell assigns nothing; it only echoes the last string.
'LNCaP_GFP_1',
'LNCaP_GFP_2',
'LNCaP_GFP_3',
'LNCaP_RPL22_1',
'LNCaP_RPL22_2',
'LNCaP_RPL22_3',
'CAL851_GFP_1',
'CAL851_GFP_2',
'CAL851_GFP_3',
'CAL851_RPL22L1_1',
'CAL851_RPL22L1_2',
'CAL851_RPL22L1_3',
'LNCaP_shLuc_1',
'LNCaP_shLuc_2',
'LNCaP_shLuc_3',
'LNCaP_sh704_1',
'LNCaP_sh704_2',
'LNCaP_sh704_3',
'LNCaP_sh705_1',
'LNCaP_sh705_2',
'LNCaP_sh705_3'

In [51]:
# The shLuc replicates appear in both the sh704 and the sh705 sample sets.
shluc_samples = ['LNCaP_shLuc_1', 'LNCaP_shLuc_2', 'LNCaP_shLuc_3']

# Sample columns belonging to each sleuth comparison, in output column order.
rpl22_oe_samples = [
    'LNCaP_GFP_1', 'LNCaP_GFP_2', 'LNCaP_GFP_3',
    'LNCaP_RPL22_1', 'LNCaP_RPL22_2', 'LNCaP_RPL22_3',
]

rpl22l1_oe_samples = [
    'CAL851_GFP_1', 'CAL851_GFP_2', 'CAL851_GFP_3',
    'CAL851_RPL22L1_1', 'CAL851_RPL22L1_2', 'CAL851_RPL22L1_3',
]

sh704_samples = shluc_samples + ['LNCaP_sh704_1', 'LNCaP_sh704_2', 'LNCaP_sh704_3']

sh705_samples = shluc_samples + ['LNCaP_sh705_1', 'LNCaP_sh705_2', 'LNCaP_sh705_3']


def _tpms_for(sleuth_table, sample_names):
    """Slice ``merged_tpms`` to the rows in ``sleuth_table``'s index and the given sample columns."""
    return merged_tpms.loc[sleuth_table.index, sample_names]


rpl22_oe_tpms = _tpms_for(rpl22_oe, rpl22_oe_samples)
rpl22l1_oe_tpms = _tpms_for(rpl22l1_oe, rpl22l1_oe_samples)
sh704_tpms = _tpms_for(sh704, sh704_samples)
sh705_tpms = _tpms_for(sh705, sh705_samples)

In [53]:
def _attach_sample_tpms(results, sample_tpms):
    """Return ``results`` with the columns of ``sample_tpms`` appended on the right.

    Columns of ``results`` that already carry one of the sample names are
    dropped first. Because this cell reassigns each table from itself, a plain
    concat would append a second copy of the sample columns on every re-run;
    dropping first makes the cell idempotent. On the first run nothing is
    dropped and the result equals a plain column-wise concat.

    Parameters
    ----------
    results : pandas.DataFrame
        Sleuth result table.
    sample_tpms : pandas.DataFrame
        Per-sample TPM columns sharing the index of ``results``.

    Returns
    -------
    pandas.DataFrame
        ``results`` columns (minus stale sample columns) followed by ``sample_tpms``.
    """
    # errors="ignore": on the first run none of the sample columns exist yet.
    without_stale_tpms = results.drop(columns=sample_tpms.columns, errors="ignore")
    return pd.concat([without_stale_tpms, sample_tpms], axis=1)


rpl22_oe = _attach_sample_tpms(rpl22_oe, rpl22_oe_tpms)
rpl22l1_oe = _attach_sample_tpms(rpl22l1_oe, rpl22l1_oe_tpms)
sh704 = _attach_sample_tpms(sh704, sh704_tpms)
sh705 = _attach_sample_tpms(sh705, sh705_tpms)

In [43]:
# (table, destination file, HDF key) for each sleuth comparison.
sleuth_exports = [
    (rpl22_oe, "../data/intermediate/kallisto_sleuth/RPL22_oe.hdf", "rpl22_oe"),
    (rpl22l1_oe, "../data/intermediate/kallisto_sleuth/RPL22L1_oe.hdf", "rpl22l1_oe"),
    (sh704, "../data/intermediate/kallisto_sleuth/sh704.hdf", "sh704"),
    (sh705, "../data/intermediate/kallisto_sleuth/sh705.hdf", "sh705"),
]

# mode="w" overwrites any existing file at the destination.
for sleuth_table, hdf_path, hdf_key in sleuth_exports:
    sleuth_table.to_hdf(hdf_path, key=hdf_key, mode="w")