In [2]:
import pandas as pd
import numpy as np

from collections import Counter

from tqdm import tqdm_notebook as tqdm

from multiprocessing import Pool

# Batch-effect-normalized RNA-seq

In [4]:
# Read the tab-separated expression matrix, transpose it so the rows of the
# file become columns, and store the values as float32.
tcga_genex = (
    pd.read_csv(
        "raw/TCGA/EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena.gz",
        sep="\t",
        index_col=0,
    )
    .T
    .astype(np.float32)
)

# Append each column's position to its label, which makes every label unique.
tcga_genex.columns = [
    label + "_" + str(position)
    for position, label in enumerate(tcga_genex.columns)
]

tcga_genex.to_hdf(
    "processed/TCGA/TCGA_genex_norm.h5",
    key="tcga_genex",
    mode="w",
)

# Splicing (Kahles et al.)

In [12]:
from functools import reduce

def concat_cols(df, cols, delim):
    """Join several columns of ``df`` row by row into one string Series.

    Each column named in ``cols`` is converted to ``str`` and the results are
    concatenated left to right with ``delim`` between neighbouring values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to join.
    cols : sequence
        Column labels, in the order they should appear in the joined string.
    delim : str
        Separator inserted between consecutive column values.

    Returns
    -------
    pandas.Series
        One joined string per row of ``df``.
    """

    def _join(left, right):
        return left + delim + right

    as_text = (df[label].astype(str) for label in cols)

    return reduce(_join, as_text)

In [24]:
def _filter_splicing_chunk(chunk, min_std):
    """Index one chunk by splicing event and drop sparse or near-constant rows."""
    id_cols = [
        "gene_name",
        "event_type",
        "event_chr",
        "event_coordinates",
        "alt_region_coordinates",
    ]

    # Build a single identifier per event from its descriptive columns, then
    # discard those columns so that only the per-sample values remain.
    chunk["exon_id"] = concat_cols(chunk, id_cols, "_")
    chunk = chunk.drop(["event_id"] + id_cols, axis=1)
    chunk = chunk.set_index("exon_id")

    # Keep rows with non-missing values in at least a tenth of the columns.
    # ceil(n / 10) in integer arithmetic selects the same rows as the float
    # n / 10 (counts are whole numbers) while passing dropna an int.
    min_present = -(-len(chunk.columns) // 10)
    chunk = chunk.dropna(axis=0, thresh=min_present)

    # Drop rows whose values barely vary across columns.
    chunk = chunk[chunk.std(axis=1) > min_std]

    # Downcast only after the statistics above have been computed.
    return chunk.astype(np.float16)


def process_splicing(load_path, save_path, chunksize=1000, min_std=0.025):
    """Filter a tab-separated splicing table chunk by chunk and save it as HDF5.

    The input is read in chunks of ``chunksize`` rows. For every chunk the
    event description columns are merged into an ``exon_id`` index, rows with
    too many missing values or with a standard deviation of at most
    ``min_std`` are removed, and the remainder is stored as float16. The
    filtered chunks are concatenated, transposed, and written to
    ``save_path`` under the key ``"tcga_splicing"``.

    Parameters
    ----------
    load_path : str
        Path of the tab-separated input file. It must contain the columns
        ``event_id``, ``gene_name``, ``event_type``, ``event_chr``,
        ``event_coordinates`` and ``alt_region_coordinates``.
    save_path : str
        Path of the HDF5 file to write (opened with ``mode="w"``).
    chunksize : int, optional
        Number of input rows read per chunk. Defaults to 1000.
    min_std : float, optional
        Rows are kept only if their standard deviation exceeds this value.
        Defaults to 0.025.

    Returns
    -------
    None
        The result is written to ``save_path``.
    """
    chunk_iterator = pd.read_csv(load_path, sep="\t", chunksize=chunksize)

    kept = []

    # The context manager closes the progress bar even if a chunk fails.
    with tqdm() as pbar:
        for chunk in chunk_iterator:
            kept.append(_filter_splicing_chunk(chunk, min_std))
            pbar.update(1)

    # Every chunk is already float16, so no further cast is needed here.
    merged = pd.concat(kept, axis=0).T

    # Keep only the part of each row label that precedes the first "."
    merged.index = merged.index.map(lambda x: x.split(".")[0])

    merged.to_hdf(save_path, key="tcga_splicing", mode="w")

In [28]:
# One [input file, output file] pair per splicing event type.
path_pairs = [
    # alternative 3' splice sites
    [
        "raw/TCGA/merge_graphs_alt_3prime_C2.confirmed.txt.gz",
        "processed/TCGA/splicing_a3ss.h5",
    ],
    # alternative 5' splice sites
    [
        "raw/TCGA/merge_graphs_alt_5prime_C2.confirmed.txt.gz",
        "processed/TCGA/splicing_a5ss.h5",
    ],
    # intron retention
    [
        "raw/TCGA/merge_graphs_intron_retention_C2.confirmed.txt.gz",
        "processed/TCGA/splicing_ri.h5",
    ],
    # exon skipping
    [
        "raw/TCGA/merge_graphs_exon_skip_C2.confirmed.txt.gz",
        "processed/TCGA/splicing_se.h5",
    ],
]

In [31]:
# Process the splicing files in parallel worker processes; starmap unpacks
# each [load_path, save_path] pair into the arguments of process_splicing.
with Pool(processes=4) as pool:
    pool.starmap(func=process_splicing, iterable=path_pairs)

  self._target(*self._args, **self._kwargs)
  self._target(*self._args, **self._kwargs)
  self._target(*self._args, **self._kwargs)
  self._target(*self._args, **self._kwargs)
  self._target(*self._args, **self._kwargs)


In [3]:
# Reload the four processed splicing tables, one per event type.
tcga_a3ss, tcga_a5ss, tcga_ri, tcga_se = [
    pd.read_hdf(h5_path, key="tcga_splicing")
    for h5_path in (
        "processed/TCGA/splicing_a3ss.h5",
        "processed/TCGA/splicing_a5ss.h5",
        "processed/TCGA/splicing_ri.h5",
        "processed/TCGA/splicing_se.h5",
    )
]

In [4]:
# Place the four event-type tables side by side; the outer join keeps every
# row label that occurs in at least one of them.
tcga_splicing = pd.concat(
    objs=[tcga_a3ss, tcga_a5ss, tcga_ri, tcga_se],
    axis="columns",
    join="outer",
)

In [5]:
# Write the combined splicing table to a fresh HDF5 file.
tcga_splicing.to_hdf(
    path_or_buf="processed/TCGA/merged.h5",
    key="tcga_splicing",
    mode="w",
)