In [77]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from functools import reduce

# Experimental setup

In [69]:
# Sample layout of every experiment.
# Each entry is a two-element list: [control replicates, treatment replicates].
experiments = dict(
    rpl22_oe=[
        ['LNCaP_GFP_1', 'LNCaP_GFP_2', 'LNCaP_GFP_3'],
        ['LNCaP_RPL22_1', 'LNCaP_RPL22_2', 'LNCaP_RPL22_3'],
    ],
    rpl22l1_oe=[
        ['CAL851_GFP_1', 'CAL851_GFP_2', 'CAL851_GFP_3'],
        ['CAL851_RPL22L1_1', 'CAL851_RPL22L1_2', 'CAL851_RPL22L1_3'],
    ],
    # sh704 and sh705 list the same shLuc control samples.
    sh704=[
        ['LNCaP_shLuc_1', 'LNCaP_shLuc_2', 'LNCaP_shLuc_3'],
        ['LNCaP_sh704_1', 'LNCaP_sh704_2', 'LNCaP_sh704_3'],
    ],
    sh705=[
        ['LNCaP_shLuc_1', 'LNCaP_shLuc_2', 'LNCaP_shLuc_3'],
        ['LNCaP_sh705_1', 'LNCaP_sh705_2', 'LNCaP_sh705_3'],
    ],
)

# Experiment keys, in the same order as the entries of `experiments`.
experiment_ids = [
    "rpl22_oe",
    "rpl22l1_oe",
    "sh704",
    "sh705",
]

# Splice event type labels.
splice_types = [
    "A3SS",
    "A5SS",
    "MXE",
    "RI",
    "SE",
]

# Transcripts to genes

In [56]:
# Transcript-to-gene table. The file is read without a header row; the three
# column names are supplied here.
t2g = pd.read_csv(
    "../data/raw/kallisto_homo_sapiens/transcripts_to_genes.txt",
    names=["transcript_id", "gene_id", "gene_name"],
    sep="\t",
)

# Derived identifier columns:
#   format_transcript_id -> "<gene_name>_<transcript_id>"
#   format_gene_id       -> "<gene_name>_<gene_id>"
#   gene_id_stable       -> the part of gene_id before the first "."
#                           (presumably drops a version suffix -- confirm)
t2g = t2g.assign(
    format_transcript_id=lambda tbl: tbl["gene_name"] + "_" + tbl["transcript_id"],
    format_gene_id=lambda tbl: tbl["gene_name"] + "_" + tbl["gene_id"],
    gene_id_stable=lambda tbl: tbl["gene_id"].str.split(".").str[0],
)

# Lookup from gene_id_stable to gene_name; if an id repeats, the last row wins.
gene_name_map = dict(zip(t2g["gene_id_stable"], t2g["gene_name"]))


# Merge splice types

In [99]:
def concat_cols(df, cols, delim):
    """Join several columns of ``df`` element-wise into one string Series.

    Every column named in ``cols`` is cast to ``str``; the columns are then
    chained left to right with ``delim`` placed between consecutive values.
    Returns a Series holding one joined string per row of ``df``.
    """

    def join_pair(left, right):
        # Element-wise "<left><delim><right>".
        return left + delim + right

    as_text = (df[name].astype(str) for name in cols)
    return reduce(join_pair, as_text)


def _split_inc_levels(inc_levels, samples):
    """Split a comma-separated inclusion-level column into per-sample values.

    Parameters
    ----------
    inc_levels : pandas.Series
        One cell per event, holding values joined by "," (for example
        "0.1,NA,0.3"). Cells that pandas did not parse as text (a bare float,
        or a missing value) are accepted as well.
    samples : list of str
        Sample names; the i-th name receives the i-th comma-separated value.

    Returns
    -------
    dict
        Maps each sample name to a float32 Series aligned with ``inc_levels``.
        "NA" entries, and positions missing from a cell, become NaN.
    """
    # astype(str) first: calling the Python str method on each cell fails with
    # AttributeError whenever a cell is a float/NaN instead of a string.
    tokens = (
        inc_levels.astype(str)
        .str.replace("NA", "nan", regex=False)  # "NA" is not parseable as a float
        .str.split(",")
    )

    return {
        sample: tokens.str[position].astype(np.float32)
        for position, sample in enumerate(samples)
    }


def load_splices(experiment):
    """Load and merge the five per-splice-type result tables of one experiment.

    For each of A3SS, A5SS, MXE, RI and SE the file
    ``../data/intermediate/rmats_output/<experiment>/<type>.MATS.JC.txt`` is
    read. Every event is indexed by ``<geneSymbol>_<GeneID>_<exon coordinates>``
    and tagged with its splice type. The tables are stacked, the
    comma-separated ``IncLevel1`` / ``IncLevel2`` columns are expanded into one
    float32 column per control / treatment sample, and the rows are sorted by
    ``FDR``.

    Parameters
    ----------
    experiment : str
        Key into the module-level ``experiments`` dict; also the name of the
        sub-directory holding the result tables.

    Returns
    -------
    pandas.DataFrame
        Columns: ``splice_type``, ``GeneID``, ``geneSymbol``, ``PValue``,
        ``FDR``, ``gene_id``, then one column per control sample and one per
        treatment sample. Sorted by ascending ``FDR``.

    Raises
    ------
    KeyError
        If ``experiment`` is not a key of ``experiments``.
    """
    controls = experiments[experiment][0]
    treatments = experiments[experiment][1]

    # (splice type, coordinate columns joined to build that type's exon id).
    exon_coord_cols = (
        ("A3SS", ["longExonStart_0base", "longExonEnd", "shortES", "shortEE"]),
        ("A5SS", ["longExonStart_0base", "longExonEnd", "shortES", "shortEE"]),
        ("MXE", ["1stExonStart_0base", "1stExonEnd",
                 "2ndExonStart_0base", "2ndExonEnd"]),
        ("RI", ["riExonStart_0base", "riExonEnd"]),
        ("SE", ["exonStart_0base", "exonEnd"]),
    )

    merged_cols = ["splice_type", "GeneID", "geneSymbol",
                   "PValue", "FDR", "IncLevel1", "IncLevel2", "gene_id"]

    per_type = []
    for splice_type, coord_cols in exon_coord_cols:
        events = pd.read_csv("../data/intermediate/rmats_output/" + experiment +
                             "/" + splice_type + ".MATS.JC.txt",
                             sep="\t", index_col=0)

        events["gene_id"] = events["geneSymbol"] + "_" + events["GeneID"]
        events["exon_id"] = concat_cols(events, coord_cols, "_")
        events["exon_gene_id"] = events["gene_id"] + "_" + events["exon_id"]

        # Replace the file's own first-column index with the composite id.
        events = events.set_index("exon_gene_id")
        events["splice_type"] = splice_type

        per_type.append(events[merged_cols])

    merged_splices = pd.concat(per_type, axis=0)

    # IncLevel1 carries the control samples, IncLevel2 the treatment samples.
    for source_col, samples in (("IncLevel1", controls),
                                ("IncLevel2", treatments)):
        per_sample = _split_inc_levels(merged_splices[source_col], samples)
        for sample, values in per_sample.items():
            merged_splices[sample] = values

    merged_splices = merged_splices.drop(["IncLevel1", "IncLevel2"], axis=1)

    merged_splices = merged_splices.sort_values(by="FDR")

    return merged_splices


# Show the merged, FDR-sorted splice table for the "rpl22_oe" experiment.
load_splices("rpl22_oe")

Unnamed: 0_level_0,splice_type,GeneID,geneSymbol,PValue,FDR,gene_id,LNCaP_GFP_1,LNCaP_GFP_2,LNCaP_GFP_3,LNCaP_RPL22_1,LNCaP_RPL22_2,LNCaP_RPL22_3
exon_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
RPL22L1_ENSG00000163584_170585801_170585990_170585801_170585923,A3SS,ENSG00000163584,RPL22L1,0.000000e+00,0.000000e+00,RPL22L1_ENSG00000163584,0.001,0.001,0.001,0.049,0.093,0.045
ARL13B_ENSG00000169379_93715391_93715528_93715406_93715528,A3SS,ENSG00000169379,ARL13B,4.996004e-15,9.657275e-12,ARL13B_ENSG00000169379,1.000,1.000,1.000,0.000,0.000,0.091
MDM4_ENSG00000198625_204506586_204506625,SE,ENSG00000198625,MDM4,6.661338e-16,2.265788e-11,MDM4_ENSG00000198625,1.000,1.000,1.000,0.125,0.176,0.300
TBC1D7_ENSG00000145979_13328527_13328763_13328547_13328763,A5SS,ENSG00000145979,TBC1D7,1.889489e-12,4.774738e-09,TBC1D7_ENSG00000145979,0.000,0.000,0.000,1.000,0.600,1.000
MDM4_ENSG00000198625_204506557_204506625,SE,ENSG00000198625,MDM4,4.847234e-13,8.243690e-09,MDM4_ENSG00000198625,1.000,1.000,1.000,0.263,0.417,0.533
LSS_ENSG00000160285_47610989_47611149,SE,ENSG00000160285,LSS,7.474021e-13,8.474045e-09,LSS_ENSG00000160285,1.000,1.000,0.992,0.810,0.912,0.889
LSS_ENSG00000160285_47610979_47611149,SE,ENSG00000160285,LSS,1.049494e-12,8.924371e-09,LSS_ENSG00000160285,1.000,1.000,0.992,0.810,0.909,0.886
A1BG-AS1_ENSG00000268895_58864686_58864840,SE,ENSG00000268895,A1BG-AS1,2.153833e-12,1.321584e-08,A1BG-AS1_ENSG00000268895,1.000,1.000,1.000,0.000,0.250,0.250
AC010967.2_ENSG00000228033_52959879_52959915,SE,ENSG00000228033,AC010967.2,2.331246e-12,1.321584e-08,AC010967.2_ENSG00000228033,1.000,1.000,1.000,0.333,0.200,0.200
TTC21B_ENSG00000123607_166719453_166719575,SE,ENSG00000123607,TTC21B,9.217849e-12,3.919199e-08,TTC21B_ENSG00000123607,1.000,1.000,1.000,0.238,0.500,0.400
