In [1]:
import pandas as pd
import numpy as np

# Make TPM matrix

## Experiments list

In [2]:
# Experiment setup table (tab-separated); must provide a "sample" column.
experiments = pd.read_csv("../data/intermediate/experiment_setup.txt", sep="\t")

# Each sample's kallisto abundance file lives in a directory named after the sample.
experiments["kallisto_path"] = (
    "../data/intermediate/kallisto_quant/"
    + experiments["sample"]
    + "/abundance.tsv"
)


def load_tpms(exp_path, exp_name):
    """Read the ``tpm`` column of one tab-separated abundance table.

    Parameters
    ----------
    exp_path : path to a tab-separated file whose first column is used as
        the index and which contains a ``tpm`` column.
    exp_name : name given to the returned Series.

    Returns
    -------
    pandas.Series of float64 TPM values, indexed by the file's first column
    and named ``exp_name``.
    """
    return (
        pd.read_csv(exp_path, sep="\t", index_col=0)["tpm"]
        .rename(exp_name)
        .astype(np.float64)
    )

## Merge TPMs

In [3]:
# Pair every abundance file with the sample name used as its column label.
exps = zip(experiments["kallisto_path"], experiments["sample"])

# One TPM Series per sample, joined side by side into a
# (transcript x sample) matrix.
transcript_tpms = pd.concat(
    [load_tpms(path, sample) for path, sample in exps],
    axis=1,
)

transcript_tpms.to_csv("../data/processed/transcript_tpms.txt", sep="\t")

# Transcripts to genes

In [41]:
# Transcript-to-gene annotation table; the code below relies on its
# "target_id" and "ext_gene" columns.
t2g = pd.read_csv("../data/intermediate/sleuth_diff/ensembl_t2g.csv")

# Lookup from target_id to ext_gene (a later duplicate target_id overwrites
# an earlier one).
transcript_gene_map = {
    target_id: ext_gene
    for target_id, ext_gene in zip(t2g["target_id"], t2g["ext_gene"])
}

## Sum transcripts by genes

In [43]:
# Series mapping each target_id to its ens_gene value.
transcript_to_ens_gene = t2g.set_index("target_id")["ens_gene"]

# Inner-join alignment: the resulting key only covers transcripts present in
# both the TPM matrix and the annotation table. The aligned TPM frame itself
# is not needed.
_, align_ensembl_genes = transcript_tpms.align(
    transcript_to_ens_gene, axis=0, join="inner")

# Gene-level TPM = sum of the TPMs of the transcripts sharing an ens_gene.
gene_tpms = transcript_tpms.groupby(align_ensembl_genes).sum()

gene_tpms.to_csv("../data/processed/gene_tpms.txt", sep="\t")

# Sleuth outputs

In [58]:
# Sample names per comparison, as [control replicates, treatment replicates].
# NOTE: this rebinds `experiments`, which the TPM-merging cells above use as
# the experiment-setup DataFrame; re-running those cells after this one
# requires reloading that table first.
experiments = {
    "rpl22_oe": [
        ['LNCaP_GFP_1', 'LNCaP_GFP_2', 'LNCaP_GFP_3'],
        ['LNCaP_RPL22_1', 'LNCaP_RPL22_2', 'LNCaP_RPL22_3'],
    ],
    "rpl22l1_oe": [
        ['CAL851_GFP_1', 'CAL851_GFP_2', 'CAL851_GFP_3'],
        ['CAL851_RPL22L1_1', 'CAL851_RPL22L1_2', 'CAL851_RPL22L1_3'],
    ],
    "sh704": [
        ['LNCaP_shLuc_1', 'LNCaP_shLuc_2', 'LNCaP_shLuc_3'],
        ['LNCaP_sh704_1', 'LNCaP_sh704_2', 'LNCaP_sh704_3'],
    ],
    "sh705": [
        ['LNCaP_shLuc_1', 'LNCaP_shLuc_2', 'LNCaP_shLuc_3'],
        ['LNCaP_sh705_1', 'LNCaP_sh705_2', 'LNCaP_sh705_3'],
    ],
}

# Comparison identifiers, in the dict's insertion order.
experiment_ids = list(experiments)

In [67]:
def compute_medians(sleuth_diff, experiment):
    """Add per-row control/treatment medians and their ratio, in place.

    Looks up the control (position 0) and treatment (position 1) sample
    columns for ``experiment`` in the module-level ``experiments`` mapping
    and writes three columns onto ``sleuth_diff``: ``control_median``,
    ``treatment_median`` and ``median_foldchange`` (treatment / control).

    Returns None; ``sleuth_diff`` is modified in place.
    """
    for column, position in (("control_median", 0), ("treatment_median", 1)):
        sample_columns = experiments[experiment][position]
        sleuth_diff[column] = sleuth_diff[sample_columns].median(axis=1)

    sleuth_diff["median_foldchange"] = (
        sleuth_diff["treatment_median"] / sleuth_diff["control_median"]
    )
    
def signed_p_rank(sleuth_diff):
    """Add a signed -log10 p-value column and rank the rows by it, in place.

    Requires ``pval`` and ``median_foldchange`` columns. Adds:

    * ``-log_pval``: ``-log10(pval)``, with infinities (``pval == 0``)
      replaced by 320.
    * ``treatment_increase``: ``+1`` where ``median_foldchange > 1``,
      otherwise ``-1`` (this includes rows whose fold change is NaN).
    * ``signed_pval``: ``-log_pval * treatment_increase``.

    The frame is then sorted ascending by ``signed_pval``. The sort is done
    in place because callers invoke this function for its side effects and
    discard the return value; rebinding the local name, as was done before,
    left the caller's frame unsorted.

    Returns the same (now sorted) DataFrame for convenience.
    """
    # log10(0) is -inf and triggers a divide-by-zero RuntimeWarning; that case
    # is expected and handled by the replacement below, so silence it here.
    with np.errstate(divide="ignore"):
        neg_log_pval = -np.log10(sleuth_diff["pval"])

    # Cap infinite scores at a large finite value so they remain sortable
    # (320 is presumably chosen to sit near the float64 underflow limit).
    sleuth_diff["-log_pval"] = neg_log_pval.replace(np.inf, 320)

    sleuth_diff["treatment_increase"] = -1 + 2*(sleuth_diff["median_foldchange"] > 1)

    sleuth_diff["signed_pval"] = sleuth_diff["-log_pval"] * sleuth_diff["treatment_increase"]

    sleuth_diff.sort_values(by="signed_pval", inplace=True)

    return sleuth_diff

In [68]:
def process_sleuth_genes(experiment):
    """Merge gene-level sleuth results with gene TPMs and write them out.

    Reads ``<experiment>_genes.csv`` from the sleuth output directory (second
    CSV column as index), appends the control and treatment TPM columns taken
    from the module-level ``gene_tpms``, adds the median and signed p-value
    columns via ``compute_medians`` and ``signed_p_rank``, drops rows missing
    ``pval`` or ``median_foldchange``, and writes a tab-separated table.

    ``experiment`` must be a key of the module-level ``experiments`` mapping.
    Returns None.
    """
    source_path = "../data/intermediate/sleuth_diff/"+experiment+"_genes.csv"
    target_path = "../data/processed/kallisto_sleuth_merge/"+experiment+"_genes.txt"

    sleuth_results = pd.read_csv(source_path, index_col=1)

    # Control columns first, then treatment columns.
    sample_columns = experiments[experiment][0] + experiments[experiment][1]
    sample_tpms = gene_tpms.loc[sleuth_results.index, sample_columns]

    merged = pd.concat([sleuth_results, sample_tpms], axis=1)

    # Both helpers add their columns to `merged` in place.
    compute_medians(merged, experiment)
    signed_p_rank(merged)

    complete_rows = merged.dropna(subset=["pval", "median_foldchange"], how="any")
    complete_rows.to_csv(target_path, sep="\t")

In [69]:
def process_sleuth_transcripts(experiment):
    """Merge transcript-level sleuth results with transcript TPMs and write them out.

    Reads ``<experiment>_transcripts.csv`` from the sleuth output directory
    (second CSV column as index), appends the control and treatment TPM
    columns taken from the module-level ``transcript_tpms``, adds the median
    and signed p-value columns via ``compute_medians`` and ``signed_p_rank``,
    drops rows missing ``pval`` or ``median_foldchange``, and writes a
    tab-separated table.

    ``experiment`` must be a key of the module-level ``experiments`` mapping.
    Returns None.
    """
    source_path = "../data/intermediate/sleuth_diff/"+experiment+"_transcripts.csv"
    target_path = "../data/processed/kallisto_sleuth_merge/"+experiment+"_transcripts.txt"

    sleuth_results = pd.read_csv(source_path, index_col=1)

    # Control columns first, then treatment columns.
    sample_columns = experiments[experiment][0] + experiments[experiment][1]
    sample_tpms = transcript_tpms.loc[sleuth_results.index, sample_columns]

    merged = pd.concat([sleuth_results, sample_tpms], axis=1)

    # Both helpers add their columns to `merged` in place.
    compute_medians(merged, experiment)
    signed_p_rank(merged)

    complete_rows = merged.dropna(subset=["pval", "median_foldchange"], how="any")
    complete_rows.to_csv(target_path, sep="\t")

In [70]:
# Write the merged tables for every comparison in `experiment_ids`:
# all gene-level tables first, then all transcript-level tables.
for process_level in (process_sleuth_genes, process_sleuth_transcripts):
    for experiment_id in experiment_ids:
        process_level(experiment_id)

  result = getattr(ufunc, method)(*inputs, **kwargs)
