In [1]:
import warnings
warnings.resetwarnings()

import magic
import pandas as pd
import numpy as np
import random

from tqdm import tqdm

from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_squared_error as mse

from rpy2.robjects import r, pandas2ri
from rpy2.robjects.vectors import StrVector

# Turn on rpy2's pandas conversion layer so pandas objects can be handed to
# the embedded R session (later cells pass a DataFrame through `r.assign`).
pandas2ri.activate()

# Load the ALRA R script into the embedded R session. The later R snippet
# calls `normalize_data`, `choose_k` and `alra`, which are presumably defined
# by this script -- the path is relative to the notebook's working directory.
r('''
    source('../repos/ALRA/alra.R')
''')



In [2]:
def get_data_for_i(i, data_dir='../../data/cell_simulation_10000'):
    """Load the ground-truth matrix and the ``i``-th dropout matrix.

    Reads ``data.csv.gz`` (ground truth) and ``drp_{i}0.csv.gz`` (dropout
    version) from ``data_dir``, aligns their labels, drops every column whose
    dropout values have no positive sign-sum, log-transforms both matrices
    with ``log(x + 1)`` and builds the list of held-out cells.

    Parameters
    ----------
    i : int
        Dropout level. Selects the file ``drp_{i}0.csv.gz`` and sets the
        held-out fraction to ``i / 10`` of all matrix cells.
    data_dir : str, optional
        Directory holding the CSV files. Defaults to the location the
        notebook originally hard-coded.

    Returns
    -------
    df : pandas.DataFrame
        ``log(x + 1)`` of the dropout matrix, restricted to kept columns.
    mask : list of tuple of int
        ``(row, column)`` pairs of held-out cells, taken from a shuffle with
        the fixed seed 42 so the same cells are selected on every call.
        Pairs whose column was dropped are removed.
    original : pandas.DataFrame
        ``log(x + 1)`` of the ground-truth matrix, same labels as ``df``.

    Notes
    -----
    The ``(row, column)`` pairs are *positions* in the unfiltered matrix,
    while the dropped columns are identified by their integer *labels*.
    The two only agree when the labels are 0..n-1 in order -- NOTE(review):
    confirm this holds for the CSV files in use.
    """
    original_ = pd.read_csv('{}/data.csv.gz'.format(data_dir), index_col=0)
    df_ = pd.read_csv('{}/drp_{}0.csv.gz'.format(data_dir, i), index_col=0)

    # Labels are forced to int on the dropout frame and then copied onto the
    # ground truth, so both frames are addressed with identical labels.
    df_.index = [int(label) for label in df_.index]
    df_.columns = [int(label) for label in df_.columns]
    original_.columns = df_.columns
    original_.index = df_.index

    # Deterministic shuffle of every (row, col) position; the first i/10 of
    # them form the held-out set. The seed and the slicing expression are kept
    # exactly as before so the selected cells do not change.
    cells = list(np.ndindex(original_.shape))
    random.Random(42).shuffle(cells)
    mask = cells[:int(len(cells) / 10 * i)]

    # Keep a column only if the sum of the signs of its dropout values is
    # positive. The axis is spelled out instead of relying on how
    # np.sum(DataFrame) dispatches.
    thr = np.sign(df_).sum(axis=0) > 0
    keep = thr.to_numpy()
    original_ = original_.loc[:, keep]
    df_ = df_.loc[:, keep]

    original = np.log(original_ + 1)
    df = np.log(df_ + 1)

    # Drop held-out cells that point at a removed column. A set makes the
    # membership test O(1) per cell instead of scanning a list each time.
    removed = {int(label) for label in thr.index[~keep]}
    mask = [cell for cell in mask if cell[1] not in removed]

    return df, mask, original

In [3]:
# Per-run metrics, keyed by the 0-based loop index.
mses = {}
corrs = {}
method = 'ALRA'


def _values_at(frame, cells):
    """Return ``frame`` values at the ``(row_label, col_label)`` pairs in ``cells``.

    Vectorised equivalent of ``[frame.loc[c] for c in cells]``: labels are
    translated to positions once and the values gathered in a single indexing
    operation, instead of one ``.loc`` call per cell.

    Raises
    ------
    KeyError
        If any pair refers to a label missing from ``frame`` (a plain ``.loc``
        lookup would fail on the same input).
    """
    cells = np.asarray(cells, dtype=int).reshape(-1, 2)
    row_pos = frame.index.get_indexer(cells[:, 0])
    col_pos = frame.columns.get_indexer(cells[:, 1])
    # get_indexer reports unknown labels as -1, which would otherwise silently
    # select the last row/column.
    if (row_pos < 0).any() or (col_pos < 0).any():
        raise KeyError('mask references labels missing from the frame')
    return frame.to_numpy()[row_pos, col_pos]


for i in range(9):
    print(i)
    # Dropout files are numbered 1..9, hence i + 1.
    df, mask, original = get_data_for_i(i + 1)

    # NOTE(review): `df` is already log(x + 1)-transformed by get_data_for_i;
    # if ALRA's `normalize_data` also log-normalises, the data is transformed
    # twice -- confirm against alra.R.
    r.assign("data", df)
    # `[[3]]` takes the third element of the list returned by `alra` --
    # presumably the final imputed matrix; confirm against alra.R.
    pred = r('''
        train = as.matrix(data)
        train_norm <- normalize_data(train)
        k_choice <- choose_k(train_norm)
        alra(train_norm,k=k_choice$k)[[3]]
    ''')

    pred = pd.DataFrame(pred, columns=df.columns, index=df.index)
    # The file name carries the 0-based loop index, not the dropout level i + 1.
    pred.to_csv('/export/scratch/inoue019/cell_result/{}_{}.csv.gz'.format(method, i), compression='gzip')

    # Score only the held-out cells.
    origin = _values_at(original, mask)
    predict = _values_at(pred, mask)

    mses[i] = mse(origin, predict)
    corrs[i] = np.corrcoef(origin, predict)[0][1]

0
Read matrix with 10000 cells and 5000 genes
Getting nonzeros
Randomized SVD
Find the 0.001000 quantile of each gene
Sweep
Scaling all except for 0 columns
0.00% of the values became negative in the scaling process and were set to zero
The matrix went from 30.25% nonzero to 95.84% nonzero
1
Read matrix with 10000 cells and 5000 genes
Getting nonzeros
Randomized SVD
Find the 0.001000 quantile of each gene
Sweep
Scaling all except for 0 columns
0.00% of the values became negative in the scaling process and were set to zero
The matrix went from 26.89% nonzero to 95.04% nonzero
2
Read matrix with 10000 cells and 5000 genes
Getting nonzeros
Randomized SVD
Find the 0.001000 quantile of each gene
Sweep
Scaling all except for 0 columns
0.00% of the values became negative in the scaling process and were set to zero
The matrix went from 23.53% nonzero to 95.35% nonzero
3
Read matrix with 10000 cells and 5000 genes
Getting nonzeros
Randomized SVD
Find the 0.001000 quantile of each gene
Sweep
Sca

In [4]:
# One-row table of the mean squared errors, one column per run.
pd.DataFrame(list(mses.values())).T

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,2.22017,2.459477,2.811447,3.215953,3.771986,4.492851,5.662814,7.558511,11.823321


In [5]:
# One-row table of the correlation coefficients, one column per run.
pd.DataFrame(list(corrs.values())).T

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.425433,0.407914,0.404208,0.379303,0.381774,0.367112,0.355403,0.321125,0.23394
