In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# Experimental setup

In [4]:
# Load the tab-separated experiment design table; only its "sample" column
# is used in this cell.
experiments = pd.read_csv(
    "../data/intermediate/experiment_setup.txt",
    sep="\t",
)
samples = list(experiments["sample"])

# Seven consecutive groups of three samples each, taken by position
# (0-2, 3-5, ..., 18-20) from the order of the setup table.
sample_groups = [samples[start:start + 3] for start in range(0, 21, 3)]

# One label per group, in the same order as `sample_groups`.
sample_names = [
    "RPL22_oe_c",
    "RPL22_oe_t",
    "RPL22L1_oe_c",
    "RPL22L1_oe_t",
    "shluc",
    "sh704",
    "sh705",
]

# Transcripts to genes

In [56]:
# Transcript-to-gene table: tab-separated, read without a header row; the
# column names are supplied here.
t2g = pd.read_csv(
    "../data/raw/kallisto_homo_sapiens/transcripts_to_genes.txt",
    sep="\t",
    names=["transcript_id", "gene_id", "gene_name"],
)

# Combined "<gene_name>_<id>" labels for transcripts and genes.
gene_name_col = t2g["gene_name"]
t2g["format_transcript_id"] = gene_name_col + "_" + t2g["transcript_id"]
t2g["format_gene_id"] = gene_name_col + "_" + t2g["gene_id"]

# Gene ID truncated at its first "." (everything from the dot on is dropped).
t2g["gene_id_stable"] = t2g["gene_id"].str.split(".", n=1).str.get(0)

# Lookup from truncated gene ID to gene name; for repeated IDs the last row wins.
gene_name_map = {
    stable_id: gene_name
    for stable_id, gene_name in zip(t2g["gene_id_stable"], t2g["gene_name"])
}


# Partitions

## Partition exon psis

In [66]:
splice_types = ["A3", "A5", "AF", "AL", "MX", "RI", "SE"]

# One PSI table per event type, in the same order as `splice_types`.
psis = []
for splice_type in splice_types:
    psis.append(
        pd.read_csv("../data/intermediate/suppa_exon_psis/" + splice_type + ".psi",
                    sep="\t")
    )

# For every event type, write each sample group's columns to its own file.
for splice_type, splice_psis in zip(splice_types, psis):
    for sample_name, sample_group in zip(sample_names, sample_groups):
        grouped_path = ("../data/intermediate/suppa_exon_psis_grouped/"
                        + splice_type + "_" + sample_name + ".psi")
        sample_psis = splice_psis[sample_group]

        # Missing values are neither dropped nor filled; they are written
        # out as the literal text 'nan'. index_label=False omits the
        # index-name field from the header row.
        sample_psis.to_csv(grouped_path,
                           sep="\t",
                           na_rep='nan',
                           index_label=False)

# Stack the tables of all event types row-wise and persist the result.
exon_psis = pd.concat(psis)
exon_psis.to_hdf("../data/processed/suppa2_exon_psis.hdf",
                 key="exon_psis",
                 mode="w")

## Partition transcript psis

In [20]:
transcript_psis = pd.read_csv("../data/intermediate/suppa_isoforms_isoform.psi",
                              sep="\t")

# Write each sample group's transcript PSI columns to its own tab-separated
# file; missing values are written as 'NA'.
for sample_name, sample_group in zip(sample_names, sample_groups):
    group_path = "../data/processed/grouped_transcript_psis/" + sample_name + ".txt"
    sample_psis = transcript_psis[sample_group]
    sample_psis.to_csv(group_path, sep="\t", na_rep='NA', index_label=False)

# Persist the full (ungrouped) transcript PSI table.
transcript_psis.to_hdf("../data/processed/suppa2_transcript_psis.hdf",
                       key="transcript_psis",
                       mode="w")

## Partition TPMs

In [82]:
suppa_tpms = pd.read_csv("../data/processed/tpm_matrix_suppa.txt", sep="\t")

# Write each sample group's TPM columns to its own tab-separated file.
for sample_name, sample_group in zip(sample_names, sample_groups):
    tpm_path = "../data/processed/grouped_tpms/" + sample_name + ".txt"
    sample_tpms = suppa_tpms[sample_group]
    sample_tpms.to_csv(tpm_path, sep="\t", index_label=False)
        

# Differential exon splicing results

## Load and merge results 

In [10]:
# NOTE: this rebinds `experiments`, which the experiment-setup cell bound to a
# DataFrame; from this cell on it is a plain list of experiment labels.
experiments = ["RPL22_oe", "RPL22L1_oe", "sh704", "sh705"]
splice_types = ["A3", "A5", "AF", "AL", "MX", "RI", "SE"]


def merge_splices(exp,
                  diff_dir="../data/intermediate/suppa_diff_psis/",
                  suffix=".dpsi.temp.0",
                  types=None):
    """Load and stack the per-event-type difference tables of one experiment.

    For every event type, the tab-separated file ``<type>_<exp><suffix>`` is
    read from ``diff_dir`` with its first column as the index. The remaining
    columns are renamed to ``difference`` and ``pval``, and a ``type`` column
    holding the event type is added. The per-type tables are then
    concatenated row-wise in the order of ``types``.

    Parameters
    ----------
    exp : str
        Experiment label used in the file names (e.g. ``"RPL22_oe"``).
    diff_dir : str, optional
        Directory containing the input files. Defaults to the location the
        notebook has always read from.
    suffix : str, optional
        File-name suffix following ``<type>_<exp>``.
    types : iterable of str, optional
        Event types to load. When ``None`` (default) the module-level
        ``splice_types`` list is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``difference``, ``pval`` and ``type``, indexed by the first
        column of the input files.

    Raises
    ------
    ValueError
        If an input file does not have exactly two non-index columns, or if
        ``types`` is empty (nothing to concatenate).
    """
    import os

    if types is None:
        # Resolved at call time so the default follows the notebook's list.
        types = splice_types

    merged_splices = []

    for s in types:
        diffs = pd.read_csv(os.path.join(diff_dir, s + "_" + exp + suffix),
                            sep="\t", index_col=0)

        diffs.columns = ["difference", "pval"]
        diffs["type"] = s
        merged_splices.append(diffs)

    return pd.concat(merged_splices, axis=0)


# Merged difference table (all event types stacked) for each experiment,
# unpacked in the order the labels are listed.
rpl22_oe, rpl22l1_oe, sh704, sh705 = (
    merge_splices(label)
    for label in ("RPL22_oe", "RPL22L1_oe", "sh704", "sh705")
)

## Add control-treatment PSIs

In [25]:
# Sample (column) names belonging to each experiment. Presumably the first
# three are the control replicates and the last three the treated ones --
# confirm against the experiment setup table.
rpl22_oe_samples = ['LNCaP_GFP_1',
                    'LNCaP_GFP_2',
                    'LNCaP_GFP_3',
                    'LNCaP_RPL22_1',
                    'LNCaP_RPL22_2',
                    'LNCaP_RPL22_3']

rpl22l1_oe_samples = ['CAL851_GFP_1',
                      'CAL851_GFP_2',
                      'CAL851_GFP_3',
                      'CAL851_RPL22L1_1',
                      'CAL851_RPL22L1_2',
                      'CAL851_RPL22L1_3']

# Both knockdown experiments share the same three shLuc samples.
sh704_samples = ['LNCaP_shLuc_1',
                 'LNCaP_shLuc_2',
                 'LNCaP_shLuc_3',
                 'LNCaP_sh704_1',
                 'LNCaP_sh704_2',
                 'LNCaP_sh704_3']

sh705_samples = ['LNCaP_shLuc_1',
                 'LNCaP_shLuc_2',
                 'LNCaP_shLuc_3',
                 'LNCaP_sh705_1',
                 'LNCaP_sh705_2',
                 'LNCaP_sh705_3']

# Per-sample PSI values for exactly the events present in each difference
# table (rows selected by event ID, columns by sample name).
rpl22_oe_psis = exon_psis.loc[rpl22_oe.index, rpl22_oe_samples]
rpl22l1_oe_psis = exon_psis.loc[rpl22l1_oe.index, rpl22l1_oe_samples]
sh704_psis = exon_psis.loc[sh704.index, sh704_samples]
sh705_psis = exon_psis.loc[sh705.index, sh705_samples]

# Append the PSI columns to each difference table. Sample columns left over
# from a previous run of this cell are dropped first, so re-running the cell
# does not produce duplicated column names; on a first run nothing is dropped.
rpl22_oe = pd.concat(
    [rpl22_oe.drop(rpl22_oe_samples, axis=1, errors="ignore"), rpl22_oe_psis],
    axis=1)
rpl22l1_oe = pd.concat(
    [rpl22l1_oe.drop(rpl22l1_oe_samples, axis=1, errors="ignore"), rpl22l1_oe_psis],
    axis=1)
sh704 = pd.concat(
    [sh704.drop(sh704_samples, axis=1, errors="ignore"), sh704_psis],
    axis=1)
sh705 = pd.concat(
    [sh705.drop(sh705_samples, axis=1, errors="ignore"), sh705_psis],
    axis=1)

## Add gene names

In [57]:
def extract_gene_ids(x):
    """Return the list of gene IDs encoded at the start of an event ID.

    The part of ``x`` before the first ";" is taken and split on "_and_",
    so an event spanning several genes yields several IDs.
    """
    return x.split(";")[0].split("_and_")


def map_gene_names(x):
    """Translate an iterable of gene IDs into a ";"-joined string of gene names.

    Each ID is looked up in ``gene_name_map``. Because that map is keyed by
    gene IDs truncated at their first ".", an ID that is not found as-is is
    looked up again after the same truncation. IDs that are still unknown
    contribute an empty string, which keeps the positions aligned.
    """
    return ";".join(
        [gene_name_map.get(y, gene_name_map.get(y.split(".")[0], "")) for y in x]
    )


# Add a "gene_names" column to every difference table, derived from the
# event IDs in its index. The values are assigned positionally, so no
# temporary "genes" column has to be created and dropped again.
for diff_table in (rpl22_oe, rpl22l1_oe, sh704, sh705):
    diff_table["gene_names"] = [
        map_gene_names(extract_gene_ids(event_id))
        for event_id in diff_table.index
    ]

In [59]:
# (table, output file, HDF key) for every annotated difference table.
hdf_outputs = [
    (rpl22_oe, "../data/processed/exon_psi_diffs/rpl22_oe.hdf", "rpl22_oe"),
    (rpl22l1_oe, "../data/processed/exon_psi_diffs/rpl22l1_oe.hdf", "rpl22l1_oe"),
    (sh704, "../data/processed/exon_psi_diffs/sh704.hdf", "sh704"),
    (sh705, "../data/processed/exon_psi_diffs/sh705.hdf", "sh705"),
]

# mode="w" opens each file for writing from scratch, replacing an existing one.
for diff_table, hdf_path, hdf_key in hdf_outputs:
    diff_table.to_hdf(hdf_path, key=hdf_key, mode="w")