In [1]:
import pandas as pd
import scipy
import numpy as np
import scipy.sparse as sp
import scipy.io as spio

import os

import isolearn.io as isoio


In [2]:
# Read the CSV (first column used as the index) and the .mat file holding
# one usage matrix per cell line.
data = pd.read_csv(
    'unprocessed_data/Alt_5SS_Tag_to_Seq_Map.csv',
    sep=',',
    index_col=0,
)
c = spio.loadmat('unprocessed_data/Alt_5SS_Usage_All_Cells.mat')

# Convert each cell line's matrix to CSC format.
c_MCF7, c_CHO, c_HELA, c_HEK = (
    sp.csc_matrix(c[cell_line])
    for cell_line in ('MCF7', 'CHO', 'HELA', 'HEK')
)

In [3]:
#Sort data on counts

# Row-wise totals of every count matrix, flattened to 1-D arrays.
total_c_MCF7, total_c_CHO, total_c_HELA, total_c_HEK = [
    np.ravel(c_mat.sum(axis=-1))
    for c_mat in [c_MCF7, c_CHO, c_HELA, c_HEK]
]

# Mean total across the four cell lines; rows are ordered ascending by it.
avg_c = (total_c_HEK + total_c_HELA + total_c_CHO + total_c_MCF7) / 4.0
sort_index = np.argsort(avg_c)

# Apply the same row permutation to the dataframe and to every matrix so
# that they stay aligned row-for-row.
data = data.iloc[sort_index].copy().reset_index(drop=True)
c_MCF7, c_CHO, c_HELA, c_HEK = [
    c_mat[sort_index, :]
    for c_mat in [c_MCF7, c_CHO, c_HELA, c_HEK]
]


In [4]:
#Constant background sequence context
# Both flanks are written in lower case and upper-cased once here.
up_background = 'acggcaacatcctggggcacaagctggagtacaactacaacagccacaacgtctatatcatggccgacaagcagaagaacggcatcaaagtgaacttcaagatccgccacaacatcgagg'.upper()
dn_background = 'acagagtttccttatttgtctctgttgccggcttatatggacaagcatatcacagccatttatcggagcgcctccgtacacgctattatcggacgcctcgcgagatcaatacgtatacca'.upper()

# Report the length of each flank.
for label, seq in (
    ('len(up_background) = ', up_background),
    ('len(dn_background) = ', dn_background),
):
    print(label + str(len(seq)))


len(up_background) = 120
len(dn_background) = 120


In [5]:
#Extend sequences and count matrices

# Padded sequence = upstream background + first 101 characters of 'Seq'
# + downstream background.
data['padded_sequence'] = up_background + data['Seq'].str.slice(0,101) + dn_background

# Build each padded count matrix with the matching column layout:
#   [ empty block, len(up_background) wide | first 101 count columns |
#     empty block, len(dn_background) wide | column 303 of the original ]
# NOTE(review): column 303 is carried over as a single trailing column;
# presumably it is the final bin of the raw matrix — confirm against the
# data description.
# The trailing column is taken with a width-1 sparse slice (303:304) so it
# stays sparse and 2-D; the previous dense round-trip through np.array was
# unnecessary.
padded_c_MCF7, padded_c_CHO, padded_c_HELA, padded_c_HEK = [
    sp.csr_matrix(
        sp.hstack([
            sp.csc_matrix((c_mat.shape[0], len(up_background))),
            c_mat[:, :101],
            sp.csc_matrix((c_mat.shape[0], len(dn_background))),
            c_mat[:, 303:304]
        ])
    )
    for c_mat in [c_MCF7, c_CHO, c_HELA, c_HEK]
]

print('padded_c_MCF7.shape = ' + str(padded_c_MCF7.shape))
print('padded_c_CHO.shape = ' + str(padded_c_CHO.shape))
print('padded_c_HELA.shape = ' + str(padded_c_HELA.shape))
print('padded_c_HEK.shape = ' + str(padded_c_HEK.shape))


padded_c_MCF7.shape = (265137, 342)
padded_c_CHO.shape = (265137, 342)
padded_c_HELA.shape = (265137, 342)
padded_c_HEK.shape = (265137, 342)


In [6]:
#Get joined min dataset

# Keep a row only when its padded count total is positive in every one of
# the four cell lines.
min_keep_index = np.logical_and.reduce([
    np.ravel(padded_c.sum(axis=-1)) > 0
    for padded_c in [padded_c_HEK, padded_c_HELA, padded_c_MCF7, padded_c_CHO]
])

#MIN data
# Apply the same boolean row mask to the dataframe and to each matrix.
data_min_filtered = data.iloc[min_keep_index].copy().reset_index(drop=True)

c_hek_min_filtered, c_hela_min_filtered, c_mcf7_min_filtered, c_cho_min_filtered = [
    padded_c[min_keep_index, :]
    for padded_c in [padded_c_HEK, padded_c_HELA, padded_c_MCF7, padded_c_CHO]
]

print('len(data_min_filtered) = ' + str(len(data_min_filtered)))

for label, filtered in [
    ('c_hek_min_filtered.shape = ', c_hek_min_filtered),
    ('c_hela_min_filtered.shape = ', c_hela_min_filtered),
    ('c_mcf7_min_filtered.shape = ', c_mcf7_min_filtered),
    ('c_cho_min_filtered.shape = ', c_cho_min_filtered),
]:
    print(label + str(filtered.shape))


len(data_min_filtered) = 264647
c_hek_min_filtered.shape = (264647, 342)
c_hela_min_filtered.shape = (264647, 342)
c_mcf7_min_filtered.shape = (264647, 342)
c_cho_min_filtered.shape = (264647, 342)


In [7]:
#Dump final dataset

# Only the padded sequence column is exported with the dataframe.
data_min_filtered = data_min_filtered[['padded_sequence']]

splicing_5ss_dict = {
    'df' : data_min_filtered.reset_index(drop=True),
    'hek_count' : c_hek_min_filtered,
    'hela_count' : c_hela_min_filtered,
    'mcf7_count' : c_mcf7_min_filtered,
    'cho_count' : c_cho_min_filtered,
}

# Make sure the target directory exists so the cell also runs on a fresh
# checkout; this is a no-op when the directory is already there.
dump_prefix = 'processed_data/splicing_5ss_data/splicing_5ss_data'
os.makedirs(os.path.dirname(dump_prefix), exist_ok=True)

isoio.dump(splicing_5ss_dict, dump_prefix)


In [8]:
#Dump small sample dataset

# The sample is the last 10000 rows of the filtered dataframe and of each
# filtered count matrix (same row slice for all, so they stay aligned).
splicing_5ss_dict = {
    'df' : data_min_filtered.iloc[-10000:].copy().reset_index(drop=True),
    'hek_count' : c_hek_min_filtered[-10000:, :],
    'hela_count' : c_hela_min_filtered[-10000:, :],
    'mcf7_count' : c_mcf7_min_filtered[-10000:, :],
    'cho_count' : c_cho_min_filtered[-10000:, :],
}

# Make sure the target directory exists so the cell also runs on a fresh
# checkout; this is a no-op when the directory is already there.
sample_dump_prefix = 'processed_data/splicing_5ss_data_sample/splicing_5ss_data_sample'
os.makedirs(os.path.dirname(sample_dump_prefix), exist_ok=True)

isoio.dump(splicing_5ss_dict, sample_dump_prefix)
