In [1]:
import numpy as np
import pandas as pd
import os

from cell_metric import compute_cell_metric
from model_functions import psix_score
from tpm_to_mrna import tpm2mrna
import anndata
# from anndata import AnnData
from smartseq_tools import *


class Psix:
    """Thin wrapper that keeps splicing tables on an AnnData object.

    The wrapped object is available as ``self.adata``. ``process_smartseq``
    stores its results in ``self.adata.uns`` under the keys ``'psi'`` and
    ``'mrna_per_event'``.
    """

    def __init__(
        self,
        adata = None,
        reads_file = ''
    ):
        """Create the wrapper, optionally loading a reads table from disk.

        Parameters
        ----------
        adata : optional
            Object to wrap. When omitted (``None``), a fresh empty
            ``anndata.AnnData()`` is created for this instance.
        reads_file : str, optional
            Path to a tab-delimited table read with ``anndata.read_csv``.
            When given, the loaded object replaces ``adata``. The default
            empty string means "do not load anything".

        Raises
        ------
        FileNotFoundError
            If ``reads_file`` is non-empty but does not point to a file.
        """
        # Build the default here rather than in the signature: a default
        # written as ``anndata.AnnData()`` is evaluated once, so every
        # instance would share (and mutate) the same object.
        if adata is None:
            adata = anndata.AnnData()
        self.adata = adata

        if reads_file:
            # Fail loudly on a bad path instead of silently keeping `adata`.
            if not os.path.isfile(reads_file):
                raise FileNotFoundError(
                    'reads_file does not exist or is not a file: {}'.format(reads_file)
                )
            self.adata = anndata.read_csv(reads_file, delimiter='\t', first_column_names=True)

    def process_smartseq(
        self,
        exon_sj_file,
        constitutive_sj_file,
        tpm_file,
        minJR = 5,
        minCell=20,
        drop_duplicates = False,
        min_psi = 0.05,
        min_observed = 0.1
    ):
        """Build the PSI and mRNA-per-event tables and store them in ``adata.uns``.

        Parameters
        ----------
        exon_sj_file
            Passed to ``get_psi_table`` together with ``minJR``, ``minCell``
            and ``drop_duplicates``.
        constitutive_sj_file
            Passed to ``get_mrna_per_event``.
        tpm_file
            Passed to ``tpm2mrna``.
        minJR, minCell, drop_duplicates
            Forwarded unchanged to ``get_psi_table``.
            NOTE(review): presumably minimum junction reads / minimum cells;
            confirm against ``get_psi_table``.
        min_psi : float
            A row is kept only if its mean PSI lies within
            ``[min_psi, 1 - min_psi]``.
        min_observed : float
            A row is kept only if its fraction of missing values is below
            ``1 - min_observed``.

        Side effects
        ------------
        Sets ``self.adata.uns['psi']`` and ``self.adata.uns['mrna_per_event']``
        to the transposed filtered tables. Prints progress messages.
        """
        print('Obtaining psi tables...')

        # NOTE(review): get_psi_table / get_mrna_per_event are assumed to come
        # from the star import of smartseq_tools; confirm.
        psi, reads = get_psi_table(exon_sj_file, minJR, minCell, drop_duplicates)

        # Rows whose mean PSI is not too close to 0 or 1.
        alt_exons = psi.index[np.abs(0.5 - psi.mean(axis=1)) <= (0.5-min_psi)]
        # Rows observed (non-NaN) in a large enough fraction of columns.
        obs_exons = psi.index[psi.isna().mean(axis=1) < 1-min_observed]
        # Explicit set intersection: `Index & Index` no longer means
        # intersection in current pandas.
        selected_exons = alt_exons.intersection(obs_exons)

        psi = psi.loc[selected_exons]
        reads = reads.loc[selected_exons]

        print('Reading TPM and transforming to mRNA counts...')

        mrna = tpm2mrna(tpm_file)
        mrna_per_event = get_mrna_per_event(mrna, psi, reads, constitutive_sj_file)

        # Stored transposed relative to the tables computed above.
        self.adata.uns['psi'] = psi.T
        self.adata.uns['mrna_per_event'] = mrna_per_event.T

        print('Successfully processed smart-seq data')



In [96]:
# Load the tab-separated input tables; the first column of each file becomes the index.
se_exons = pd.read_table(
    '~/data_sc_regulation/data_autocorrelation/tiklova_neurogenesis/skipped_exons_psi.tab',
    index_col=0,
)
mrna_event = pd.read_table(
    '~/data_sc_regulation/data_autocorrelation/tiklova_neurogenesis/mrna_per_event.tab',
    index_col=0,
)

star_counts = pd.read_table(
    '~/data_sc_regulation/tiklova_extended/star_counts.tab.gz',
    index_col=0,
)
sj_counts = pd.read_table(
    '../data/prueba_tiklova_SJ/SE_counts.tab',
    index_col=0,
)
constitutive_introns = pd.read_table(
    '../data/prueba_tiklova_SJ/constitutive_introns.tab',
    index_col=0,
)
tpm = pd.read_table(
    '~/data_sc_regulation/tiklova_extended/rsem_gene_tpm.tab.gz',
    index_col=0,
)

In [None]:
# constitutive_introns = 

In [98]:
# Rows to drop from the gene tables: 'EGFP' plus every index entry that starts with 'ERCC-'.
discard = ['EGFP'] + [
    gene for gene in star_counts.index if gene.startswith('ERCC-')
]
good_genes = [gene for gene in star_counts.index if gene not in discard]

# Restrict every table to the columns of se_exons;
# the gene-level tables are also restricted to the retained genes.
star_counts = star_counts.loc[good_genes, se_exons.columns]
tpm = tpm.loc[good_genes, se_exons.columns]
sj_counts = sj_counts[se_exons.columns]
constitutive_introns = constitutive_introns[se_exons.columns]

In [106]:
# Write the filtered tables as tab-separated files (index and header are written by default).
star_counts.to_csv(
    '../data/mouse_brain_development/star_counts.tab.gz', sep='\t')
constitutive_introns.to_csv(
    '../data/mouse_brain_development/constitutive_introns.gz', sep='\t')
tpm.to_csv(
    '../data/mouse_brain_development/tpm.tab.gz', sep='\t')
sj_counts.to_csv(
    '../data/mouse_brain_development/sj_counts.tab.gz', sep='\t')

In [2]:
# Build the Psix wrapper from the filtered STAR count table.
psix_object = Psix(
    reads_file='../data/mouse_brain_development/star_counts.tab.gz',
)

In [3]:
# Compute the PSI / mRNA-per-event tables with minJR and minCell both set to 1.
psix_object.process_smartseq(
    exon_sj_file='../data/mouse_brain_development/sj_counts.tab.gz',
    constitutive_sj_file='../data/mouse_brain_development/constitutive_introns.gz',
    tpm_file='../data/mouse_brain_development/tpm.tab.gz',
    minJR=1,
    minCell=1,
)

Obtaining psi tables...
Reading TPM and transforming to mRNA counts...


100%|██████████| 1197/1197 [00:51<00:00, 23.17it/s]


           SRR7408400  SRR7408401  SRR7408404  SRR7408413  SRR7408414  \
Mrpl15_1     0.154801         NaN    0.296402    0.297519    0.901730   
Mrpl15_2     0.000000         NaN    0.148201    0.297519    0.601153   
Tcea1_1      0.000000    0.561390    0.225917    0.110829    0.153591   
Tcea1_2      0.000000    0.000000    0.000000    0.000000    0.000000   
Tcea1_3      0.000000    0.000000    0.000000    0.000000    0.000000   
...               ...         ...         ...         ...         ...   
Gpm6b_2      0.262257    0.233860    0.233186    0.231614    0.632273   
Gpm6b_3      0.262257    0.350790    0.466372    0.154409    0.379364   
Gpm6b_4      0.000000    0.701580    0.874447    0.617637    0.969486   
Trappc2_1         NaN         NaN    0.000000    0.000000    0.000000   
Tmsb4x_1     0.154006    0.183185    0.266676    0.254099    0.569664   

           SRR7408418  SRR7408422  SRR7408424  SRR7408426  SRR7408427  ...  \
Mrpl15_1          NaN    0.475576    0.165175

In [4]:
# Display the PSI table that process_smartseq stored in adata.uns (it is stored transposed, as `psi.T`).
psix_object.adata.uns['psi']

Unnamed: 0,AI314180_4,AI314180_5,AW554918_2,Aaed1_1,Aak1_2,Aak1_5,Aamdc_10,Aamdc_9,Aamdc_nmdSE_2,Aars2_2,...,Zufsp_1,Zyx_1,Zyx_2,Zzz3_10,Zzz3_11,Zzz3_12,l7Rn6_1,l7Rn6_3,l7Rn6_4,l7Rn6_nmdSE_1
SRR7408400,,,1.0,,,,0.000000,1.000000,0.0,,...,,,,,,,1.0,0.0,1.0,0.333333
SRR7408401,0.333333,,,,,,,,,,...,,1.0,,,,,0.2,0.0,0.0,
SRR7408404,,,,,,,1.000000,,,,...,,,0.0,,1.0,1.0,1.0,,,0.000000
SRR7408413,,,,,,,,,,,...,,,,,,,1.0,,,0.000000
SRR7408414,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR7410092,,,,,1.0,1.0,1.000000,1.000000,0.0,,...,,0.0,1.0,,,,,,,
SRR7410093,,,,,,,1.000000,1.000000,,,...,0.0,0.0,1.0,,,,1.0,1.0,,0.000000
SRR7410094,,0.0,,1.0,0.0,,1.000000,1.000000,0.0,,...,,,,,1.0,,1.0,0.0,,0.000000
SRR7410096,,,,0.0,,,0.882353,0.833333,0.0,1.0,...,,,,,,,1.0,0.0,,


In [9]:
# Display the stored PSI column for a single event, 'Mapt_3'.
psix_object.adata.uns['psi']['Mapt_3']

SRR7408400    0.000000
SRR7408401    0.000000
SRR7408404    0.000000
SRR7408413    0.000000
SRR7408414    0.000000
                ...   
SRR7410092    0.081081
SRR7410093    0.000000
SRR7410094    0.259259
SRR7410096    0.600000
SRR7410097    0.368421
Name: Mapt_3, Length: 1197, dtype: float64

In [None]:
# NOTE(review): scratch cell, never executed (`In [None]`). It looks like a pasted method
# body: it assigns to `self.adata` but is not inside a `def`, and every line after the
# first is indented, so it would raise an IndentationError if run as written.
# NOTE(review): this reads `constitutive_sj_file` with sep=','; the constitutive-intron
# table written earlier in this notebook is tab-separated. Confirm which file is meant.
constitutive_sj = pd.read_csv(constitutive_sj_file, sep=',', index_col=0)
        # Rows of `mrna` whose median across columns is at least 1.
        obs_mrna = mrna.index[mrna.median(axis=1) >= 1]
        # Junction ids whose prefix (text before the first '_') is one of those rows.
        obs_junctions = [x for x in constitutive_sj.index if x.split('_')[0] in obs_mrna]
        
        # One mrna row per retained junction, re-labelled with the junction id.
        mrna_per_junction = mrna.loc[[x.split('_')[0] for x in obs_junctions]]
        mrna_per_junction.index = obs_junctions
        
        # Junction values divided by mrna values; infinities from division by zero become NaN.
        reads_per_junction = (constitutive_sj.loc[obs_junctions] / mrna_per_junction).replace([np.inf, -np.inf], np.nan)
        # Column-wise mean of that ratio.
        SJ_mean = reads_per_junction.mean()
        # NOTE(review): `Index & Index` is used here as a set intersection; newer pandas
        # no longer treats `&` that way. Confirm the pandas version before reusing this.
        int_exons = psi.index[np.abs(0.5 - psi.mean(axis=1)) <= 0.45] & psi.index[psi.isna().mean(axis=1) < 0.9]
        psi = psi.loc[int_exons]
        mrna_events = (reads.loc[int_exons]/(SJ_mean * (1+psi)))
        
        self.adata.uns['psi'] = psi.T
        self.adata.uns['mrna_per_event'] = mrna_events.T

In [60]:
# NOTE(review): this cell relies on names that no visible cell defines
# (constitutive_sj_file, tiklova_mrna, tiklova_norm, tiklova_PSI, tiklova_reads);
# it only runs on a kernel where they already exist.

# Junction-level table, read as comma-separated with the first column as index.
tiklova_ci = pd.read_csv(constitutive_sj_file, sep=',', index_col=0)

# Restrict mrna to the columns of tiklova_norm, then keep rows with median >= 1.
tiklova_mrna = tiklova_mrna[tiklova_norm.columns]
tiklova_mrna_expressed = tiklova_mrna.loc[tiklova_mrna.median(axis=1) >= 1]

# Junction ids whose prefix (text before the first '_') is an expressed row,
# and the matching mrna rows re-labelled by junction id.
xlist = [x for x in tiklova_ci.index if x.split('_')[0] in tiklova_mrna_expressed.index]
tiklova_mrna_per_junction = tiklova_mrna_expressed.loc[[x.split('_')[0] for x in xlist]]
tiklova_mrna_per_junction.index = xlist

# Junction values divided by mrna values; infinities from division by zero become NaN.
tiklova_ratio = (
    tiklova_ci.loc[tiklova_mrna_per_junction.index] / tiklova_mrna_per_junction
).replace([np.inf, -np.inf], np.nan)
tiklova_SJ = pd.DataFrame()
tiklova_SJ['SJ_mean'] = tiklova_ratio[tiklova_mrna.columns].mean()
tiklova_SJ['SJ_median'] = tiklova_ratio[tiklova_mrna.columns].median()
# tiklova_SJ.to_csv()
# tiklova_ratio.to_csv()

# Names to drop, selected by prefix ('mt-', 'Gm<digit>', 'Mir') or
# suffix ('Rik', '-ps', or '-ps' followed by exactly one character).
discard = [x for x in tiklova_mrna.index if ((x[:3] in ['mt-', 'Gm0', 'Gm1', 'Gm2', 'Gm3', 'Gm4', 'Gm5',
                                'Gm6', 'Gm7', 'Gm8', 'Gm9', 'Mir']) or (x[-3:] in ['Rik', '-ps']) or (x [-4:-1] == '-ps'))]

# Set lookups keep the two membership filters linear instead of quadratic.
_discard_lookup = set(discard)
good_genes = [x for x in tiklova_mrna.index if x not in _discard_lookup]
_good_gene_lookup = set(good_genes)
good_exons = [x for x in tiklova_PSI.index if x.split('_')[0] in _good_gene_lookup]

tiklova_PSI = tiklova_PSI.loc[good_exons]

# Explicit set intersection: `Index & Index` no longer means intersection in current pandas.
# NOTE(review): tiklova_exons is not used by the rest of this cell.
tiklova_exons = tiklova_PSI.index[np.abs(0.5 - tiklova_PSI.mean(axis=1)) <= 0.45].intersection(
    tiklova_PSI.index[tiklova_PSI.isna().mean(axis=1) < 0.5]
)

# Reads divided by SJ_mean * (1 + PSI), keeping only the columns of tiklova_mrna
# whose column sum is below 10**5.5.
tiklova_mrna_events = (
    tiklova_reads.loc[tiklova_PSI.index] / (tiklova_SJ.SJ_mean * (1+tiklova_PSI))
)[tiklova_mrna.columns[tiklova_mrna.sum() < 10**5.5]]
#

True

In [37]:
from cell_metric import compute_cell_metric


In [85]:
# Sanity check: how does os.path.isfile treat an empty path string?
os.path.isfile('')

False

In [1]:
import numpy as np
import pandas as pd
import anndata

In [7]:
# Toy tables with 20 rows ('cell_1'..'cell_20') and 10 columns ('exon_1'..'exon_10'):
# `data` is all zeros, `data_psi` is all ones.
data = pd.DataFrame(
    np.zeros((20, 10)),
    index=['cell_{}'.format(i) for i in range(1, 21)],
    columns=['exon_{}'.format(i) for i in range(1, 11)],
)

data_psi = pd.DataFrame(
    np.ones((20, 10)),
    index=['cell_{}'.format(i) for i in range(1, 21)],
    columns=['exon_{}'.format(i) for i in range(1, 11)],
)

In [38]:
# Toy 20 x 20 table filled with 0.5, labelled 'cell_1'..'cell_20' on both axes.
cell_metric = pd.DataFrame(
    np.full((20, 20), 0.5),
    index=['cell_{}'.format(i) for i in range(1, 21)],
    columns=['cell_{}'.format(i) for i in range(1, 21)],
)

In [25]:
# Wrap the toy all-ones table in an AnnData object.
adata = anndata.AnnData(data_psi)#.varm

In [84]:
# Select the rows of adata.obs whose row-wise median over the 'mrna' layer is >= 0.
# NOTE(review): no visible cell assigns adata.layers['mrna']; this depends on kernel
# state created by a cell that is not shown here.
adata.obs[np.median(adata.layers['mrna'], axis=1)>=0]

cell_1
cell_2
cell_3
cell_4
cell_5
cell_6
cell_7
cell_8
cell_9
cell_10
cell_11


In [54]:
# Attempt to store the 20 x 20 cell metric as a layer. This raises the ValueError shown
# in the cell output: layer values must match the parent's shape (20, 10).
adata.layers['metric'] = cell_metric

ValueError: Value passed for key 'metric' is of incorrect shape. Values of layers must match dimensions (0, 1) of parent. Value had shape (20, 20) while it should have had (20, 10).

In [56]:
# Inspect the type of the object stored under adata.uns['cell_metric'].
# NOTE(review): per the execution counts, this ran after the cell that assigns
# adata.uns['cell_metric'], even though it appears earlier in the notebook.
type(adata.uns['cell_metric'])

pandas.core.frame.DataFrame

In [41]:
# Keep the 20 x 20 cell metric in adata.uns rather than adata.layers
# (the layers assignment in this notebook raised a shape ValueError).
adata.uns['cell_metric'] = cell_metric

In [51]:
# Check key membership on adata.uns using a key that no visible cell stores.
'cell_metricdsd' in adata.uns

False