In [1]:
import warnings
warnings.resetwarnings()

import magic
import pandas as pd
import numpy as np
import random

from tqdm import tqdm

from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import roc_auc_score

from rpy2.robjects import r, pandas2ri
from rpy2.robjects.vectors import StrVector

# Switch on rpy2's pandas conversion so that pandas objects can be passed to
# R through `r` / `r.assign` in later cells.
pandas2ri.activate()

# Source the ALRA R script into the embedded R session. The path is relative
# to the notebook's working directory. Later cells call `normalize_data`,
# `choose_k` and `alra` from R; presumably this script is what defines them.
r('''
    source('../repos/ALRA/alra.R')
''')



In [2]:
def get_data_for_i(i, data_dir='../../data/mid_simulation'):
    """Load the ground-truth matrix and the dropout matrix for level ``i``.

    Reads ``data.csv.gz`` (ground truth) and ``drp_{i}0.csv.gz`` (the same
    matrix after dropout) from ``data_dir``, relabels both with the integer
    labels of the dropout file, discards every column whose dropout counts sum
    to zero, and applies ``log(x + 1)`` to both matrices.

    Parameters
    ----------
    i : int
        Dropout level. Selects the file ``drp_{i}0.csv.gz`` and the fraction
        ``i / 10`` of all matrix cells that goes into the evaluation mask.
    data_dir : str, optional
        Directory holding the two CSV files. Defaults to the location the
        notebook has always used.

    Returns
    -------
    df : pandas.DataFrame
        ``log(x + 1)`` of the dropout matrix, surviving columns only.
    mask : list of tuple(int, int)
        ``(row, column)`` pairs taken from a seeded shuffle of all cells of the
        unfiltered matrix, minus the pairs that fall in a discarded column.
    original : pandas.DataFrame
        ``log(x + 1)`` of the ground-truth matrix, same rows/columns as ``df``.

    Notes
    -----
    The mask pairs are positions in the unfiltered matrix, while the discarded
    columns are identified by label. The two only agree when the integer
    labels coincide with positions (0..n-1) -- presumably true for the
    simulation files; verify if other data is used.
    """
    original_ = pd.read_csv('{}/data.csv.gz'.format(data_dir), index_col=0)
    df_ = pd.read_csv('{}/drp_{}0.csv.gz'.format(data_dir, i), index_col=0)
    df_.index = [int(label) for label in df_.index]
    df_.columns = [int(label) for label in df_.columns]

    # Force identical labels on both matrices so they can be filtered and
    # indexed interchangeably.
    original_.columns = df_.columns
    original_.index = df_.index

    # Seeded shuffle of every (row, col) cell; the first i/10 of them form the
    # mask. The arithmetic is kept exactly as before so the selection does not
    # change.
    cells = list(np.ndindex(original_.shape))
    random.Random(42).shuffle(cells)
    mask = cells[:int(len(cells)/10 * i)]

    # Keep a column only if it still has a positive count after dropout.
    # The axis is explicit: reducing a DataFrame without an axis is deprecated.
    thr = np.sign(df_).sum(axis=0) > 0
    keep = thr.to_numpy()
    original_ = original_.loc[:, keep]
    df_ = df_.loc[:, keep]

    original = np.log(original_+1)
    df = np.log(df_+1)

    # A set gives O(1) membership tests; the mask can hold millions of cells.
    removed = {int(label) for label in thr.index[~keep]}
    mask = [cell for cell in mask if cell[1] not in removed]

    return df, mask, original

In [3]:
# Evaluation results, one entry per dropout level, keyed by the loop counter i.
mses = {}      # MSE over every masked entry
corrs = {}     # Pearson correlation over every masked entry
mses_ = {}     # MSE over masked entries whose true value is non-zero
corrs_ = {}    # correlation over masked entries whose true value is non-zero
mses__ = {}    # MSE over masked entries whose true value is zero
corrs__ = {}   # nothing is stored here by this cell; kept so the name still exists
aucs = {}      # ROC AUC for separating zero from non-zero truth by predicted value
method = 'ALRA'

for i in range(9):
    print(i)
    df, mask, original = get_data_for_i(i+1)

    # Hand the dropout matrix to R and run ALRA; the third element of the list
    # returned by alra() is used as the imputed matrix.
    # NOTE(review): `df` is already log(x+1)-transformed by get_data_for_i.
    # Confirm that normalize_data() is not applying a second log transform.
    r.assign("data", df)
    pred = r('''
        train = as.matrix(data)
        train_norm <- normalize_data(train)
        k_choice <- choose_k(train_norm)
        alra(train_norm,k=k_choice$k)[[3]]
    ''')

    pred = pd.DataFrame(pred, columns=df.columns, index=df.index)
    pred.to_csv('/export/scratch/inoue019/cell_result/{}_{}.csv.gz'.format(method, i), compression='gzip')

    # Gather truth and prediction at the masked cells in one vectorised step.
    # The mask pairs are resolved as labels (as the previous per-cell `.loc`
    # lookup did), then translated to positions for NumPy fancy indexing.
    mask_arr = np.asarray(mask, dtype=int).reshape(-1, 2)
    row_pos = original.index.get_indexer(mask_arr[:, 0])
    col_pos = original.columns.get_indexer(mask_arr[:, 1])
    if (row_pos < 0).any() or (col_pos < 0).any():
        # get_indexer marks unknown labels with -1; fail loudly instead of
        # silently reading the last row/column.
        raise KeyError('mask refers to labels that are not in the filtered matrix')
    origin = original.to_numpy()[row_pos, col_pos]
    predict = pred.to_numpy()[row_pos, col_pos]

    nonzero = origin != 0
    is_zero = origin == 0

    mses[i] = mse(origin, predict)
    corrs[i] = np.corrcoef(origin, predict)[0][1]
    mses_[i] = mse(origin[nonzero], predict[nonzero])
    corrs_[i] = np.corrcoef(origin[nonzero], predict[nonzero])[0][1]
    mses__[i] = mse(origin[is_zero], predict[is_zero])

    # ROC AUC depends only on the ordering of the scores, so the predictions
    # are passed directly instead of being converted to ranks first. This also
    # avoids overwriting `df` with a scratch DataFrame.
    aucs[i] = roc_auc_score(np.sign(origin), predict)

0
Read matrix with 5000 cells and 5000 genes
Getting nonzeros
Randomized SVD
Find the 0.001000 quantile of each gene
Sweep
Scaling all except for 0 columns
0.00% of the values became negative in the scaling process and were set to zero
The matrix went from 30.37% nonzero to 95.64% nonzero
1
Read matrix with 5000 cells and 5000 genes
Getting nonzeros
Randomized SVD
Find the 0.001000 quantile of each gene
Sweep
Scaling all except for 0 columns
0.00% of the values became negative in the scaling process and were set to zero
The matrix went from 27.00% nonzero to 94.74% nonzero
2
Read matrix with 5000 cells and 5000 genes
Getting nonzeros
Randomized SVD
Find the 0.001000 quantile of each gene
Sweep
Scaling all except for 0 columns
0.00% of the values became negative in the scaling process and were set to zero
The matrix went from 23.63% nonzero to 94.74% nonzero
3
Read matrix with 5000 cells and 5000 genes
Getting nonzeros
Randomized SVD
Find the 0.001000 quantile of each gene
Sweep
Scaling

In [4]:
# Summary table: one row per metric, one column per dropout level.
# Each row label is paired with the result dict that feeds it.
summary_rows = {
    'mse': mses,
    'mse (nonzero)': mses_,
    'mse (zero)': mses__,
    'corr': corrs,
    'corrs (nonzero)': corrs_,
    'auc': aucs,
}
pd.DataFrame(
    [list(per_level.values()) for per_level in summary_rows.values()],
    index=list(summary_rows),
)

Unnamed: 0,0,1,2,3,4,5,6,7,8
mse,2.195449,2.430112,2.765812,3.123756,3.717219,4.420699,5.592657,7.232758,11.629781
mse (nonzero),0.947728,1.100159,1.313934,1.576124,1.976151,2.513339,3.344558,4.756801,7.919156
mse (zero),2.832387,3.108335,3.505507,3.912095,4.603558,5.391877,6.737721,8.494411,13.52055
corr,0.42436,0.407857,0.401071,0.374134,0.382231,0.361156,0.350254,0.239245,0.214446
corrs (nonzero),0.45332,0.436991,0.430042,0.391507,0.416185,0.389055,0.394695,0.151468,0.168995
auc,0.736905,0.734456,0.734049,0.729053,0.732546,0.728985,0.721566,0.692081,0.66786
