In [1]:
import pandas as pd
import numpy as np

# Make TPM matrix

## Experiments list

In [2]:
# Load the tab-separated experiment setup table and add, for every row, the
# location of that sample's kallisto abundance file: each sample's results sit
# in a directory named after its "sample" value.
experiments = pd.read_csv(
    "../data/intermediate/experiment_setup.txt",
    sep="\t",
).assign(
    kallisto_path=lambda setup: (
        "../data/intermediate/kallisto_quant/" + setup["sample"] + "/abundance.tsv"
    )
)

def load_tpms(exp_path, exp_name):
    """Read the "tpm" column of one tab-separated abundance table.

    Parameters
    ----------
    exp_path : str
        Path to the tab-separated abundance file. Its first column is used
        as the row index of the result.
    exp_name : str
        Name given to the returned Series.

    Returns
    -------
    pandas.Series
        The "tpm" values as float64, indexed by the file's first column and
        named ``exp_name``.
    """
    quant_table = pd.read_csv(exp_path, sep="\t", index_col=0)

    return quant_table["tpm"].rename(exp_name).astype(np.float64)

## Merge TPMs

In [3]:
# Pair every abundance file with the sample name that will label its column.
exps = zip(experiments["kallisto_path"], experiments["sample"])

# Load one TPM Series per sample and place them side by side, so the result
# has one column per sample and rows aligned on the abundance-file index.
merged_tpms = pd.concat(
    [load_tpms(quant_path, sample_name) for quant_path, sample_name in exps],
    axis=1,
)

merged_tpms.to_csv("../data/processed/tpm_matrix.txt", sep="\t")

## Format for SUPPA

In [62]:
# Work on an independent copy so the merged matrix keeps its original row labels.
suppa_tpms = merged_tpms.copy(deep=True)

# Keep only the part of each row label that precedes the first "."
# (e.g. a version suffix is dropped).
unversioned_ids = suppa_tpms.index.map(
    lambda transcript_id: transcript_id.split(".", 1)[0]
)
suppa_tpms.index = unversioned_ids

# index_label=False: the header row carries no field for the index column.
suppa_tpms.to_csv(
    "../data/processed/tpm_matrix_suppa.txt",
    sep="\t",
    index_label=False,
)

# Transcripts to genes

In [27]:
# Transcript -> gene lookup table. The file has no header row, so the three
# columns are named explicitly here.
t2g = pd.read_csv(
    "../data/raw/kallisto_homo_sapiens/transcripts_to_genes.txt",
    sep="\t",
    names=["transcript_id", "gene_id", "gene_name"],
)

# A blank gene_name field is parsed as NaN, and NaN + "_" + <id> is NaN, which
# would make the maps below send those IDs to NaN and lose the identifier
# in every downstream label. Fall back to the "unnamed" prefix instead, the
# same prefix used for transcripts that are absent from the mapping.
# The gene_name column itself is left untouched.
gene_label = t2g["gene_name"].fillna("unnamed")

# Human-readable labels of the form "<gene_name>_<id>".
t2g["format_transcript_id"] = gene_label + "_" + t2g["transcript_id"]
t2g["format_gene_id"] = gene_label + "_" + t2g["gene_id"]

# transcript_id -> "<gene_name>_<transcript_id>"
transcript_map = dict(zip(t2g["transcript_id"], t2g["format_transcript_id"]))

# gene_id -> "<gene_name>_<gene_id>"
gene_id_map = dict(zip(t2g["gene_id"], t2g["format_gene_id"]))

In [36]:
# Start from an independent copy so the merged matrix keeps its raw row labels.
named_tpms = merged_tpms.copy(deep=True)

# Replace each row label with its entry in transcript_map; labels that have no
# entry are kept but prefixed with "unnamed_".
readable_ids = named_tpms.index.map(
    lambda transcript_id: transcript_map.get(transcript_id, "unnamed_" + transcript_id)
)
named_tpms.index = readable_ids

# Flip the matrix so samples become rows and transcripts become columns.
named_tpms = named_tpms.transpose()

named_tpms.to_csv("../data/processed/tpm_matrix_named.txt", sep="\t")

In [37]:
# gene_id of every transcript, keyed by transcript_id.
transcript_gene_ids = t2g.set_index("transcript_id")["gene_id"]

# Inner-align on the row labels: only transcripts present in both the TPM
# matrix and the lookup keep a gene assignment. The aligned TPM frame itself
# is not needed, only the aligned gene IDs.
_, align_ensembl_genes = merged_tpms.align(
    transcript_gene_ids,
    join="inner",
    axis=0,
)

# Sum the TPMs of all transcripts that share a gene, then transpose so that
# samples are rows and genes are columns.
gene_tpms = (
    merged_tpms
    .groupby(align_ensembl_genes)
    .sum()
    .T
)

# Relabel each gene column through gene_id_map.
gene_tpms.columns = gene_tpms.columns.map(lambda gene_id: gene_id_map[gene_id])

gene_tpms.to_csv("../data/processed/tpm_matrix_genes.txt", sep="\t")