In [77]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from functools import reduce

# Experimental setup

In [69]:
# Sample names per experiment. Every entry is a pair of replicate groups
# (three sample names each). NOTE(review): the first group looks like the
# control condition (GFP / shLuc) and the second like the perturbation --
# confirm against the rMATS run configuration.
experiments = {
    "rpl22_oe": [
        ["LNCaP_GFP_1", "LNCaP_GFP_2", "LNCaP_GFP_3"],
        ["LNCaP_RPL22_1", "LNCaP_RPL22_2", "LNCaP_RPL22_3"],
    ],
    "rpl22l1_oe": [
        ["CAL851_GFP_1", "CAL851_GFP_2", "CAL851_GFP_3"],
        ["CAL851_RPL22L1_1", "CAL851_RPL22L1_2", "CAL851_RPL22L1_3"],
    ],
    # sh704 and sh705 share the same LNCaP_shLuc replicates as their first group.
    "sh704": [
        ["LNCaP_shLuc_1", "LNCaP_shLuc_2", "LNCaP_shLuc_3"],
        ["LNCaP_sh704_1", "LNCaP_sh704_2", "LNCaP_sh704_3"],
    ],
    "sh705": [
        ["LNCaP_shLuc_1", "LNCaP_shLuc_2", "LNCaP_shLuc_3"],
        ["LNCaP_sh705_1", "LNCaP_sh705_2", "LNCaP_sh705_3"],
    ],
}

# Experiment identifiers, in the order they are declared above.
experiment_ids = list(experiments)

# Splice-event categories handled in this notebook.
splice_types = ["A3SS", "A5SS", "MXE", "RI", "SE"]

# Transcripts to genes

In [56]:
# Transcript-to-gene table: tab-separated, read without a header row (the
# three column names are supplied here). Three derived columns are appended:
#   format_transcript_id -> "<gene_name>_<transcript_id>"
#   format_gene_id       -> "<gene_name>_<gene_id>"
#   gene_id_stable       -> gene_id truncated at its first "."
t2g = pd.read_csv(
    "../data/raw/kallisto_homo_sapiens/transcripts_to_genes.txt",
    sep="\t",
    names=["transcript_id", "gene_id", "gene_name"],
).assign(
    format_transcript_id=lambda tbl: tbl["gene_name"] + "_" + tbl["transcript_id"],
    format_gene_id=lambda tbl: tbl["gene_name"] + "_" + tbl["gene_id"],
    gene_id_stable=lambda tbl: tbl["gene_id"].str.split(".").str[0],
)

# Lookup from stable gene id to gene name; if a stable id occurs on several
# rows, the last row's name is the one kept.
gene_name_map = {
    stable_id: name
    for stable_id, name in zip(t2g["gene_id_stable"], t2g["gene_name"])
}


# Merge splice types

In [84]:
def concat_cols(df, cols, delim):
    """Join several columns of ``df`` row-wise into a single string Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to combine.
    cols : sequence of str
        Column labels, joined left to right in the given order.
    delim : str
        Separator placed between consecutive column values.

    Returns
    -------
    pandas.Series
        One string per row: the ``str``-converted values of ``cols``
        separated by ``delim``.

    Raises
    ------
    TypeError
        If ``cols`` is empty (``reduce`` has nothing to fold).
    KeyError
        If a label in ``cols`` is not a column of ``df``.
    """
    def _glue(left, right):
        # Element-wise "<left><delim><right>".
        return left + delim + right

    # Cast every requested column to str up front so that numeric columns
    # can be concatenated with the delimiter.
    as_text = list(map(lambda label: df[label].astype(str), cols))

    return reduce(_glue, as_text)


# Directory that holds one rMATS output folder per experiment.
_RMATS_OUTPUT_DIR = "../data/intermediate/rmats_output/"

# Coordinate columns that are joined into ``exon_id`` for each splice type.
# The insertion order is also the order in which the tables are displayed.
_EXON_ID_COLUMNS = {
    "A3SS": ["longExonStart_0base", "longExonEnd", "shortES", "shortEE"],
    "A5SS": ["longExonStart_0base", "longExonEnd", "shortES", "shortEE"],
    "MXE": ["1stExonStart_0base", "1stExonEnd", "2ndExonStart_0base", "2ndExonEnd"],
    "RI": ["riExonStart_0base", "riExonEnd"],
    "SE": ["exonStart_0base", "exonEnd"],
}


def _load_splice_table(experiment, splice_type):
    """Read one ``<splice_type>.MATS.JC.txt`` table and index it by ``exon_gene_id``."""
    events = pd.read_csv(
        _RMATS_OUTPUT_DIR + experiment + "/" + splice_type + ".MATS.JC.txt",
        sep="\t",
        index_col=0,
    )

    # "<geneSymbol>_<GeneID>"
    events["gene_id"] = events["geneSymbol"] + "_" + events["GeneID"]
    # Splice-type specific coordinates joined with "_".
    events["exon_id"] = concat_cols(events, _EXON_ID_COLUMNS[splice_type], "_")
    # "<gene_id>_<exon_id>", used as the row index.
    # NOTE(review): this key is not guaranteed to be unique -- two SE events
    # with the same skipped exon but different flanking exons receive the
    # same exon_gene_id. Confirm whether downstream code needs a unique index.
    events["exon_gene_id"] = events["gene_id"] + "_" + events["exon_id"]

    return events.set_index("exon_gene_id")


def load_splices(experiment):
    """Load the five rMATS event tables of one experiment and preview them.

    For each splice type (A3SS, A5SS, MXE, RI, SE) the file
    ``../data/intermediate/rmats_output/<experiment>/<type>.MATS.JC.txt`` is
    read, ``gene_id``, ``exon_id`` and ``exon_gene_id`` columns are derived,
    and the table is re-indexed by ``exon_gene_id``. The first rows of every
    table are then shown with ``display``.

    Parameters
    ----------
    experiment : str
        Name of the experiment sub-directory under the rMATS output folder.

    Returns
    -------
    None
        The tables are only displayed, not returned.
    """
    # Build every table before displaying anything, so that a missing or
    # malformed file fails the call without leaving partial output behind.
    tables = {
        splice_type: _load_splice_table(experiment, splice_type)
        for splice_type in _EXON_ID_COLUMNS
    }

    for table in tables.values():
        display(table.head())


# Preview the head of each of the five splice-event tables for "rpl22_oe".
load_splices("rpl22_oe")

Unnamed: 0_level_0,GeneID,geneSymbol,chr,strand,longExonStart_0base,longExonEnd,shortES,shortEE,flankingES,flankingEE,...,IncFormLen,SkipFormLen,PValue,FDR,IncLevel1,IncLevel2,IncLevelDifference,gene_id,exon_id,exon_gene_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
115,ENSG00000148396,SEC16A,chr9,-,139341306,139341467,139341306,139341401,139341678,139341816,...,-2,-1,0.555465,1.0,"0.049,0.099,0.104","0.07,0.043,0.104",0.012,SEC16A_ENSG00000148396,139341306_139341467_139341306_139341401,SEC16A_ENSG00000148396_139341306_139341467_139...
116,ENSG00000100335,MIEF1,chr22,+,39905342,39905576,39905426,39905576,39900207,39900539,...,-2,-1,0.165161,1.0,"0.0,0.0,0.556","0.333,0.2,0.0",0.008,MIEF1_ENSG00000100335,39905342_39905576_39905426_39905576,MIEF1_ENSG00000100335_39905342_39905576_399054...
117,ENSG00000100335,MIEF1,chr22,+,39908236,39908499,39908322,39908499,39907853,39908031,...,-2,-1,0.694257,1.0,"0.366,0.6,0.429","0.273,0.617,0.333",0.057,MIEF1_ENSG00000100335,39908236_39908499_39908322_39908499,MIEF1_ENSG00000100335_39908236_39908499_399083...
119,ENSG00000160345,C9orf116,chr9,-,138387026,138387461,138387026,138387415,138390430,138390509,...,-2,-1,0.295294,1.0,"1.0,1.0,1.0","1.0,0.913,0.786",0.1,C9orf116_ENSG00000160345,138387026_138387461_138387026_138387415,C9orf116_ENSG00000160345_138387026_138387461_1...
121,ENSG00000111752,PHC1,chr12,+,9075228,9075384,9075252,9075384,9074196,9074346,...,-2,-1,1.0,1.0,"0.818,1.0,1.0","1.0,1.0,1.0",-0.061,PHC1_ENSG00000111752,9075228_9075384_9075252_9075384,PHC1_ENSG00000111752_9075228_9075384_9075252_9...


Unnamed: 0_level_0,GeneID,geneSymbol,chr,strand,longExonStart_0base,longExonEnd,shortES,shortEE,flankingES,flankingEE,...,IncFormLen,SkipFormLen,PValue,FDR,IncLevel1,IncLevel2,IncLevelDifference,gene_id,exon_id,exon_gene_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
83,ENSG00000160323,ADAMTS13,chr9,+,136320406,136320725,136320406,136320557,136321190,136321337,...,-2,-1,1.0,1.0,"1.0,NA,1.0","NA,0.2,1.0",0.4,ADAMTS13_ENSG00000160323,136320406_136320725_136320406_136320557,ADAMTS13_ENSG00000160323_136320406_136320725_1...
84,ENSG00000148303,RPL7A,chr9,+,136215072,136215428,136215072,136215101,136215776,136215897,...,-2,-1,1.0,1.0,"0.0,0.001,0.0","0.001,0.002,0.0",-0.001,RPL7A_ENSG00000148303,136215072_136215428_136215072_136215101,RPL7A_ENSG00000148303_136215072_136215428_1362...
89,ENSG00000126878,AIF1L,chr9,+,133993142,133993305,133993142,133993234,133995621,133998532,...,-2,-1,1.0,1.0,"0.902,1.0,1.0","0.953,1.0,1.0",-0.017,AIF1L_ENSG00000126878,133993142_133993305_133993142_133993234,AIF1L_ENSG00000126878_133993142_133993305_1339...
91,ENSG00000148335,NTMT1,chr9,+,132394928,132395144,132394928,132395080,132396332,132396585,...,-2,-1,0.2262,1.0,"0.839,0.829,0.798","0.929,0.944,0.788",-0.065,NTMT1_ENSG00000148335,132394928_132395144_132394928_132395080,NTMT1_ENSG00000148335_132394928_132395144_1323...
95,ENSG00000090621,PABPC4,chr1,-,40027356,40027459,40027399,40027459,40026487,40026794,...,-2,-1,0.436627,1.0,"0.984,0.98,0.969","0.986,0.976,0.961",0.003,PABPC4_ENSG00000090621,40027356_40027459_40027399_40027459,PABPC4_ENSG00000090621_40027356_40027459_40027...


Unnamed: 0_level_0,GeneID,geneSymbol,chr,strand,1stExonStart_0base,1stExonEnd,2ndExonStart_0base,2ndExonEnd,upstreamES,upstreamEE,...,IncFormLen,SkipFormLen,PValue,FDR,IncLevel1,IncLevel2,IncLevelDifference,gene_id,exon_id,exon_gene_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,ENSG00000220023,AL592183.1,chrGL000219.1,-,83212,83317,97293,97363,79936,80028,...,-2,-2,0.003052,0.212689,"0.571,0.625,0.688","0.333,0.263,0.423",0.288,AL592183.1_ENSG00000220023,83212_83317_97293_97363,AL592183.1_ENSG00000220023_83212_83317_97293_9...
16,ENSG00000148396,SEC16A,chr9,-,139339503,139339563,139340096,139340171,139338274,139338352,...,-2,-2,0.664048,1.0,"0.792,0.803,0.892","0.826,0.844,0.788",0.01,SEC16A_ENSG00000148396,139339503_139339563_139340096_139340171,SEC16A_ENSG00000148396_139339503_139339563_139...
17,ENSG00000148396,SEC16A,chr9,-,139347879,139347962,139350052,139350245,139342535,139342613,...,-2,-2,1.0,1.0,"0.991,0.974,0.99","1.0,1.0,1.0",-0.015,SEC16A_ENSG00000148396,139347879_139347962_139350052_139350245,SEC16A_ENSG00000148396_139347879_139347962_139...
18,ENSG00000148396,SEC16A,chr9,-,139348560,139348779,139350052,139350245,139347879,139347962,...,-2,-2,0.727593,1.0,"0.426,0.418,0.442","0.418,0.428,0.407",0.011,SEC16A_ENSG00000148396,139348560_139348779_139350052_139350245,SEC16A_ENSG00000148396_139348560_139348779_139...
19,ENSG00000160323,ADAMTS13,chr9,+,136290648,136290732,136291057,136291182,136289440,136289598,...,-2,-2,0.921919,1.0,"0.545,1.0,1.0","0.75,0.727,1.0",0.023,ADAMTS13_ENSG00000160323,136290648_136290732_136291057_136291182,ADAMTS13_ENSG00000160323_136290648_136290732_1...


Unnamed: 0_level_0,GeneID,geneSymbol,chr,strand,riExonStart_0base,riExonEnd,upstreamES,upstreamEE,downstreamES,downstreamEE,...,IncFormLen,SkipFormLen,PValue,FDR,IncLevel1,IncLevel2,IncLevelDifference,gene_id,exon_id,exon_gene_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
112,ENSG00000111752,PHC1,chr12,+,9075228,9075384,9075228,9075252,9075328,9075384,...,-2,-1,0.160052,1.0,"1.0,1.0,0.833","1.0,0.6,0.75",0.161,PHC1_ENSG00000111752,9075228_9075384,PHC1_ENSG00000111752_9075228_9075384
119,ENSG00000148303,RPL7A,chr9,+,136217094,136217582,136217094,136217174,136217451,136217582,...,-2,-1,1.0,1.0,"0.0,0.0,0.0","0.0,0.0,0.0",0.0,RPL7A_ENSG00000148303,136217094_136217582,RPL7A_ENSG00000148303_136217094_136217582
125,ENSG00000167110,GOLGA2,chr9,-,131024841,131025371,131024841,131024983,131025312,131025371,...,-2,-1,1.0,1.0,"0.0,0.0,0.0","0.013,0.0,0.0",-0.004,GOLGA2_ENSG00000167110,131024841_131025371,GOLGA2_ENSG00000167110_131024841_131025371
126,ENSG00000167106,FAM102A,chr9,-,130703159,130706963,130703159,130705516,130706893,130706963,...,-2,-1,1.0,1.0,"0.014,0.0,0.056","0.05,0.014,0.0",0.002,FAM102A_ENSG00000167106,130703159_130706963,FAM102A_ENSG00000167106_130703159_130706963
130,ENSG00000119403,PHF19,chr9,-,123627987,123628374,123627987,123628108,123628305,123628374,...,-2,-1,1.0,1.0,"0.0,0.0,0.077","NA,0.0,0.0",0.026,PHF19_ENSG00000119403,123627987_123628374,PHF19_ENSG00000119403_123627987_123628374


Unnamed: 0_level_0,GeneID,geneSymbol,chr,strand,exonStart_0base,exonEnd,upstreamES,upstreamEE,downstreamES,downstreamEE,...,IncFormLen,SkipFormLen,PValue,FDR,IncLevel1,IncLevel2,IncLevelDifference,gene_id,exon_id,exon_gene_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
130,ENSG00000220023,AL592183.1,chrGL000219.1,-,83212,83317,79936,80028,99415,99541,...,-2,-1,1.0,1.0,"1.0,1.0,1.0","1.0,0.875,1.0",0.042,AL592183.1_ENSG00000220023,83212_83317,AL592183.1_ENSG00000220023_83212_83317
131,ENSG00000220023,AL592183.1,chrGL000219.1,-,97293,97363,79936,80028,99415,99541,...,-2,-1,1.0,1.0,"1.0,1.0,1.0","1.0,0.714,1.0",0.095,AL592183.1_ENSG00000220023,97293_97363,AL592183.1_ENSG00000220023_97293_97363
415,ENSG00000148396,SEC16A,chr9,-,139339503,139339563,139338274,139338352,139340096,139340171,...,-2,-1,0.93009,1.0,"0.371,0.396,0.37","0.386,0.401,0.364",-0.005,SEC16A_ENSG00000148396,139339503_139339563,SEC16A_ENSG00000148396_139339503_139339563
416,ENSG00000148396,SEC16A,chr9,-,139339503,139339563,139338274,139338352,139341306,139341401,...,-2,-1,0.830009,1.0,"0.563,0.532,0.436","0.476,0.435,0.574",0.015,SEC16A_ENSG00000148396,139339503_139339563,SEC16A_ENSG00000148396_139339503_139339563
417,ENSG00000148396,SEC16A,chr9,-,139340096,139340171,139338274,139338352,139341306,139341401,...,-2,-1,0.541079,1.0,"0.83,0.823,0.865","0.812,0.806,0.833",0.022,SEC16A_ENSG00000148396,139340096_139340171,SEC16A_ENSG00000148396_139340096_139340171
