In [1]:
import numpy as np
import pandas as pd

from statsmodels.stats.multitest import multipletests

import gc

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import cancer_data
import many

import config
config.config_visuals()

import stackprinter
stackprinter.set_excepthook(style='lightbg')

# Load data

## Merged TCGA info

In [2]:
# Tab-separated TCGA annotation table; its first column becomes the index.
merged_tcga_info = pd.read_csv(
    "../data/supplementary/S2_merged-tcga-info.txt",
    sep="\t",
    index_col=0,
)

# Keep only the rows whose "MSI" entry compares equal to True.
tcga_msi = merged_tcga_info.loc[merged_tcga_info["MSI"].eq(True)]

## TCGA gene expression

In [3]:
tcga_genex = cancer_data.load("tcga_normalized_gene_expression")

# Split on the last two characters of each row label: labels ending in "11"
# go to `normal_genex`, every other row stays in `tcga_genex`.
normal_genex = tcga_genex.loc[tcga_genex.index.str[-2:] == "11"]
tcga_genex = tcga_genex.loc[tcga_genex.index.str[-2:] != "11"]

INFO:numexpr.utils:Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


## TCGA splicing

In [4]:
def preprocess_splicing(df):
    """Shorten sample labels, drop duplicated samples and drop normals.

    Row labels are truncated to their first 15 characters, only the first row
    for each truncated label is kept, and rows whose truncated label ends in
    "11" are removed.

    The frame passed in is not modified: relabelling happens on a shallow
    copy, so the caller's index keeps its original labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index holds sliceable (string) sample labels.

    Returns
    -------
    pandas.DataFrame
        New frame with truncated, de-duplicated, non-"11" row labels.
    """

    # A shallow copy shares the data but has its own axes, so assigning the
    # new index here does not relabel the caller's frame.
    df = df.copy(deep=False)
    df.index = df.index.map(lambda x: x[:15])
    df = df[~df.index.duplicated(keep="first")]

    # remove normals
    df = df[df.index.map(lambda x: x[-2:] != "11")]

    return df

In [17]:
# Mann-Whitney U comparison of the first 1000 columns of `tcga_se` against the
# non-missing "RPL22_k15fs_mutation" entries.
# The options are spelled out instead of expanding `**corr_kwargs`: that name
# is only assigned in cells further down, one of those assignments already
# contains "pbar" (which collides with the explicit `pbar=True` and raises a
# TypeError), and the other carries `method="spearman"`.
# NOTE(review): `tcga_se` is not defined in the visible part of this notebook;
# confirm it is loaded before this cell runs.
many.stats.mat_mwu_naive(
    tcga_se.iloc[:, :1000],
    merged_tcga_info[["RPL22_k15fs_mutation"]].dropna(),
    melt=True,
    effect="rank_biserial",
    pbar=True,
)

  0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0_level_0,Unnamed: 1_level_0,rank_biserial,pval,qval,pos_n,neg_n
a_col,b_col,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000116786.7_ES_1_16046228:16046415:16047823:16047883:16051811:16052040_16047823:16047883,RPL22_k15fs_mutation,-0.473730,21.047830,18.047830,139.0,7806.0
ENSG00000160072.15_ES_1_1424583:1424654:1425071:1425191:1425636:1425751_1425071:1425191,RPL22_k15fs_mutation,-0.366600,13.401711,10.702741,144.0,7293.0
ENSG00000133226.12_ES_1_24989174:24989295:24989673:24989715:24993305:24993336_24989673:24989715,RPL22_k15fs_mutation,-0.346297,11.851886,9.329008,142.0,7866.0
ENSG00000011021.17_ES_1_11888162:11888276:11888514:11888681:11889252:11889356_11888514:11888681,RPL22_k15fs_mutation,-0.519165,9.745338,7.347398,51.0,3651.0
ENSG00000157870.10_ES_1_2519837:2519892:2519968:2520064:2520395:2520479_2519968:2520064,RPL22_k15fs_mutation,0.433557,9.589216,7.288186,73.0,1413.0
...,...,...,...,...,...,...
ENSG00000197530.8_ES_1_1560665:1560808:1560907:1561033:1562029:1562134_1560907:1561033,RPL22_k15fs_mutation,0.000000,-0.000000,-0.000000,0.0,144.0
ENSG00000162493.12_ES_1_13910764:13910836:13922321:13922479:13933667:13933716_13922321:13922479,RPL22_k15fs_mutation,0.000000,-0.000000,-0.000000,0.0,146.0
ENSG00000058453.12_ES_1_17248444:17248573:17249157:17249293:17250819:17250974_17249157:17249293,RPL22_k15fs_mutation,0.000000,-0.000000,-0.000000,0.0,154.0
ENSG00000171608.11_ES_1_9748233:9748371:9751524:9751629:9770481:9770654_9751524:9751629,RPL22_k15fs_mutation,0.000000,-0.000000,-0.000000,0.0,168.0


In [12]:
# Options forwarded to the Mann-Whitney U helper below.
corr_kwargs = dict(melt=True, effect="rank_biserial", pbar=True)

# Compare every column of `tcga_a5ss` against the "RPL22_k15fs_mutation" column.
many.stats.mat_mwu_naive(
    tcga_a5ss, merged_tcga_info[["RPL22_k15fs_mutation"]], **corr_kwargs
)

  0%|          | 0/70536 [00:00<?, ?it/s]

Unnamed: 0_level_0,Unnamed: 1_level_0,rank_biserial,pval,qval,pos_n,neg_n
a_col,b_col,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000123737.8_A5_4_122737926:122737995:122734909:122735202:122734909:122737602_122735202:122737602,RPL22_k15fs_mutation,0.618615,48.935217,44.086806,148.0,8182.0
ENSG00000205629.7_A5_16_25143722:25143844:25139795:25139887:25139795:25139891_25139887:25139891,RPL22_k15fs_mutation,-0.293018,39.225577,34.678196,145.0,8115.0
ENSG00000160957.8_A5_8_145738449:145738521:145738799:145738864:145738600:145738864_145738600:145738799,RPL22_k15fs_mutation,0.644800,37.246804,32.875514,136.0,6043.0
ENSG00000168591.11_A5_17_42266389:42266447:42264639:42264799:42264639:42265377_42264799:42265377,RPL22_k15fs_mutation,-0.633798,36.408322,32.161971,137.0,7445.0
ENSG00000168591.11_A5_17_42266389:42266447:42264639:42264768:42264639:42265377_42264768:42265377,RPL22_k15fs_mutation,-0.623305,34.962153,30.812712,136.0,7002.0
...,...,...,...,...,...,...
ENSG00000172005.6_A5_2_95719125:95719737:95717436:95717686:95717436:95717777_95717686:95717777,RPL22_k15fs_mutation,0.000000,-0.000000,-0.000000,0.0,86.0
ENSG00000075043.13_A5_20_62039765:62039889:62044802:62044934:62044760:62044934_62044760:62044802,RPL22_k15fs_mutation,0.000000,-0.000000,-0.000000,0.0,680.0
ENSG00000134986.9_A5_5_111092832:111092876:111093641:111093739:111093628:111093739_111093628:111093641,RPL22_k15fs_mutation,0.000000,-0.000000,-0.000000,0.0,159.0
ENSG00000134986.9_A5_5_111092832:111092876:111093025:111093098:111093016:111093098_111093016:111093025,RPL22_k15fs_mutation,0.000000,-0.000000,-0.000000,0.0,188.0


In [62]:
# Dataset names handed to `cancer_data.load`, in the order they are processed.
tcga_splicing_sets = (
    ["tcga_se", "tcga_a3ss", "tcga_a5ss", "tcga_ir"]
    + ["tcga_mx_1", "tcga_mx_2", "tcga_mx_3", "tcga_mx_4", "tcga_mx_5"]
)

# Keyword arguments for many.stats.mat_mwu_naive.
mwu_kwargs = dict(melt=True, effect="rank_biserial", pbar=True)
# Keyword arguments for many.stats.mat_corr_naive.
corr_kwargs = dict(melt=True, method="spearman", pbar=True)


def splicing_vs_mutations(mut_series):
    """Run the Mann-Whitney U comparison of every splicing table against `mut_series`.

    Each dataset named in `tcga_splicing_sets` is loaded, passed through
    `preprocess_splicing`, and compared with `mut_series` using
    `many.stats.mat_mwu_naive` and the options in `mwu_kwargs`. Only one
    table is held at a time; it is released before the next one is loaded.

    Parameters
    ----------
    mut_series
        Second argument of `many.stats.mat_mwu_naive`
        (presumably per-sample mutation calls; verify against callers).

    Returns
    -------
    pandas.DataFrame
        The per-dataset results concatenated in dataset order.
    """

    per_set_results = []

    for set_name in tcga_splicing_sets:

        print(f"Loading {set_name}")

        events = preprocess_splicing(cancer_data.load(set_name))
        per_set_results.append(
            many.stats.mat_mwu_naive(events, mut_series, **mwu_kwargs)
        )

        # Release the table before the next dataset is read.
        del events
        gc.collect()

    return pd.concat(per_set_results)


def recompute_qval(df):
    """Recompute Benjamini-Hochberg q-values from the "pval" column.

    The "pval" column is read as -log10(p): it is converted back with
    ``10 ** (-pval)`` before correction. The adjusted values are written to
    "qval" on that same -log10 scale, so both columns stay comparable and a
    larger "qval" means a more significant result.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a "pval" column holding -log10 p-values.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` whose "qval" column holds -log10 of the
        Benjamini-Hochberg adjusted p-values. The input frame is not modified.
    """

    # Copy first: callers pass filtered frames, and assigning a column on
    # such a slice triggers pandas' SettingWithCopyWarning.
    df = df.copy()

    raw_pvals = np.power(10.0, -df["pval"].to_numpy(dtype=float))

    # Only the adjusted p-values (element 1) are used; `alpha` is kept as in
    # the original call.
    adjusted = multipletests(
        raw_pvals,
        alpha=0.01,
        method="fdr_bh",
    )[1]

    # Very large -log10(p) inputs underflow to p == 0, which maps to +inf here.
    with np.errstate(divide="ignore"):
        df["qval"] = -np.log10(adjusted)

    return df


def corr_splicing(series):
    """Correlate `series` with every event in every TCGA splicing table.

    Each dataset named in `tcga_splicing_sets` is loaded, passed through
    `preprocess_splicing`, and correlated with `series` using
    `many.stats.mat_corr_naive` and the options in `corr_kwargs`. The
    per-dataset results are concatenated, rows with "n" below 100 are
    dropped, "qval" is recomputed over the remaining rows with
    `recompute_qval`, and the table is sorted by "qval" in descending order.

    Parameters
    ----------
    series
        Second argument of `many.stats.mat_corr_naive`
        (presumably a per-sample pandas Series; verify against callers).

    Returns
    -------
    pandas.DataFrame
        Result table with the index reset and the "a_col", "b_col", "pval"
        and "qval" columns renamed to their export labels.
    """

    per_set_results = []

    for set_name in tcga_splicing_sets:

        print(f"Loading {set_name}")

        events = preprocess_splicing(cancer_data.load(set_name))
        per_set_results.append(
            many.stats.mat_corr_naive(events, series, **corr_kwargs)
        )

        # Release the table before the next dataset is read.
        del events
        gc.collect()

    combined = pd.concat(per_set_results)

    # Keep only rows with at least 100 observations.
    has_enough_samples = combined["n"] >= 100
    combined = recompute_qval(combined[has_enough_samples])
    combined = combined.sort_values(by="qval", ascending=False)

    export_labels = {
        "a_col": "first_splicing_event",
        "b_col": "second_splicing_event",
        "pval": "-log10(P value)",
        "qval": "-log10(Q value)",
    }

    return combined.reset_index().rename(columns=export_labels)

In [31]:
# Run the splicing correlation for two columns of `merged_tcga_info`,
# MDM4 first and RPL22L1 second.
tcga_mdm4_cosplicing, tcga_rpl22l1_cosplicing = (
    corr_splicing(merged_tcga_info[column])
    for column in ("MDM4_exon_6_inclusion", "RPL22L1_exon_3A_inclusion")
)

Loading tcga_se


  0%|          | 0/136145 [00:00<?, ?it/s]

Loading tcga_a3ss


  0%|          | 0/96354 [00:00<?, ?it/s]

Loading tcga_a5ss


  0%|          | 0/70536 [00:00<?, ?it/s]

Loading tcga_ir


  0%|          | 0/91592 [00:00<?, ?it/s]

Loading tcga_mx_1


  0%|          | 0/200000 [00:00<?, ?it/s]

Loading tcga_mx_2


  0%|          | 0/200000 [00:00<?, ?it/s]

Loading tcga_mx_3


  0%|          | 0/200000 [00:00<?, ?it/s]

Loading tcga_mx_4


  0%|          | 0/200000 [00:00<?, ?it/s]

Loading tcga_mx_5


  0%|          | 0/102293 [00:00<?, ?it/s]

In [65]:
# Write both co-splicing tables as tab-separated text files.
tcga_mdm4_cosplicing.to_csv(
    path_or_buf="../data/supplementary/S4-a_tcga-mdm4-cosplicing.txt",
    sep="\t",
)
tcga_rpl22l1_cosplicing.to_csv(
    path_or_buf="../data/supplementary/S4-b_tcga-rpl22l1-cosplicing.txt",
    sep="\t",
)