In [1]:
import warnings
warnings.resetwarnings()

import magic
import pandas as pd
import numpy as np
import random

from tqdm import tqdm

from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_squared_error as mse

from rpy2.robjects import r, pandas2ri
from rpy2.robjects.vectors import StrVector

# Enable rpy2's pandas conversion so pandas objects can be handed to the
# embedded R session (see the r.assign(...) call in the evaluation loop).
pandas2ri.activate()

# Source the ALRA R script into the embedded R session. The path is relative
# to the notebook's working directory. Presumably this script defines
# normalize_data, choose_k and alra, which the evaluation loop calls -- confirm
# against the ALRA repository checkout.
r('''
    source('../../../imputation2/notebooks/repos/ALRA/alra.R')
''')



In [2]:
def get_data_for_i(i, data_dir='../../data/1k_cell'):
    """Load the reference matrix and the i-th dropout matrix, log-transformed.

    Reads ``<data_dir>/data.csv`` (reference) and ``<data_dir>/drp_<i>0.csv``
    (presumably the matrix with i*10 % of entries dropped -- confirm against
    the script that generated these files). The reference frame is relabelled
    with the integer row/column labels of the dropout frame.

    Columns of the dropout frame whose sign-sum is not positive (for
    non-negative data: all-zero columns) are removed from both frames, and
    both frames are transformed with ``log(x + 1)``.

    Parameters
    ----------
    i : int
        Dropout level; selects the file name and the fraction ``i / 10`` of
        shuffled cells returned in the mask.
    data_dir : str or path-like, optional
        Directory holding the CSV files. Defaults to the location the
        notebook originally hard-coded.

    Returns
    -------
    df : pandas.DataFrame
        ``log(x + 1)`` of the dropout matrix, restricted to kept columns.
    mask : list of tuple of int
        ``(row, column)`` cells: the first ``int(n_cells / 10 * i)`` cells of
        a seed-42 shuffle of all cell positions, minus cells in removed
        columns. Positions are compared with the integer column *labels*, so
        this assumes labels equal positions (0..n-1) -- TODO confirm.
    original : pandas.DataFrame
        ``log(x + 1)`` of the reference matrix, restricted to kept columns.
    """
    original_ = pd.read_csv('{}/data.csv'.format(data_dir), index_col=0)
    df_ = pd.read_csv('{}/drp_{}0.csv'.format(data_dir, i), index_col=0)
    df_.index = [int(label) for label in df_.index]
    df_.columns = [int(label) for label in df_.columns]

    # Give the reference frame the same integer labels as the dropout frame.
    original_.columns = df_.columns
    original_.index = df_.index

    # Deterministic shuffle of every (row, col) position; the selection rule
    # is kept exactly as before so the mask stays reproducible.
    cells = list(np.ndindex(original_.shape))
    random.Random(42).shuffle(cells)
    mask = cells[:int(len(cells)/10 * i)]

    # Explicit axis=0: np.sum(DataFrame) relies on a pandas reduction default
    # that is deprecated and slated to return a scalar instead of a Series.
    keep = (np.sign(df_).sum(axis=0) > 0).to_numpy()
    # A set makes the per-cell membership test below O(1) instead of O(#cols).
    dropped_cols = {int(label) for label in df_.columns[~keep]}

    original_ = original_.loc[:, keep]
    df_ = df_.loc[:, keep]

    original = np.log(original_ + 1)
    df = np.log(df_ + 1)

    mask = [cell for cell in mask if cell[1] not in dropped_cols]

    return df, mask, original

In [7]:
def get_cos_sim(vector1, vector2):
    """Return the cosine similarity of two vectors.

    Computed as ``dot(vector1, vector2) / (||vector1|| * ||vector2||)`` using
    numpy's default norm. There is no guard against zero-norm inputs.
    """
    # Denominator: product of the two vector magnitudes (norms).
    magnitude_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)

    # Numerator: inner product of the two vectors.
    inner = np.dot(vector1, vector2)

    return inner / magnitude_product

In [8]:
def get_l1norm(vector1, vector2):
    """Return the L1 distance: the sum of absolute element-wise differences."""
    difference = vector1 - vector2
    return np.abs(difference).sum()

In [None]:
# Run ALRA on each of the nine dropout levels and score the imputed values at
# the masked cells. Every result dict is keyed by the zero-based loop index i
# (dropout level i + 1).
mses = {}
corrs = {}
coss = {}
l1 = {}

method = 'ALRA'


def _values_at(frame, cells):
    """Return frame values at (row label, column label) pairs as a 1-D array.

    Vectorised equivalent of ``np.array([frame.loc[c] for c in cells])`` for
    frames with unique integer labels; avoids one scalar ``.loc`` call per
    masked cell.
    """
    cells = np.asarray(cells, dtype=np.int64).reshape(-1, 2)
    row_pos = frame.index.get_indexer(cells[:, 0])
    col_pos = frame.columns.get_indexer(cells[:, 1])
    # get_indexer marks unknown labels with -1, which would silently select
    # the last row/column; fail loudly like .loc would.
    if (row_pos < 0).any() or (col_pos < 0).any():
        raise KeyError('mask contains labels that are missing from the frame')
    return frame.to_numpy()[row_pos, col_pos]


for i in range(9):
    print(i)
    df, mask, original = get_data_for_i(i+1)

    # NOTE(review): get_data_for_i already returns log(x + 1) values. If
    # ALRA's normalize_data also log-normalises its input, the data is
    # transformed twice and `pred` is on a different scale than `original`
    # -- confirm against alra.R.
    r.assign("data", df)
    pred = r('''
        train = as.matrix(data)
        train_norm <- normalize_data(train)
        k_choice <- choose_k(train_norm)
        alra(train_norm,k=k_choice$k)[[3]]
    ''')

    # Re-attach the labels of the input frame to the matrix returned by R.
    pred = pd.DataFrame(pred, columns=df.columns, index=df.index)

    # Reference and imputed values at the masked cells, in mask order.
    origin = _values_at(original, mask)
    predict = _values_at(pred, mask)

    mses[i] = mse(origin, predict)
    corrs[i] = np.corrcoef(origin, predict)[0][1]
    coss[i] = get_cos_sim(origin, predict)
    l1[i] = get_l1norm(origin, predict)

0
Read matrix with 1000 cells and 12345 genes
Getting nonzeros
Randomized SVD
Find the 0.001000 quantile of each gene
Sweep
Scaling all except for 179 columns
0.00% of the values became negative in the scaling process and were set to zero
The matrix went from 19.95% nonzero to 64.97% nonzero
1
Read matrix with 1000 cells and 12333 genes
Getting nonzeros
Randomized SVD
Find the 0.001000 quantile of each gene
Sweep
Scaling all except for 199 columns
0.00% of the values became negative in the scaling process and were set to zero
The matrix went from 17.76% nonzero to 66.93% nonzero
2
Read matrix with 1000 cells and 12308 genes
Getting nonzeros
Randomized SVD
Find the 0.001000 quantile of each gene
Sweep
Scaling all except for 218 columns
0.00% of the values became negative in the scaling process and were set to zero
The matrix went from 15.57% nonzero to 59.33% nonzero
3
Read matrix with 1000 cells and 12279 genes
Getting nonzeros
Randomized SVD
Find the 0.001000 quantile of each gene
Swe

In [None]:
# Mean squared error per completed loop iteration, shown as a single-row table.
pd.DataFrame(list(mses.values())).transpose()

In [None]:
# Correlation coefficient per completed loop iteration, shown as a single-row table.
pd.DataFrame(list(corrs.values())).transpose()

In [None]:
# Cosine similarity per completed loop iteration, shown as a single-row table.
pd.DataFrame(list(coss.values())).transpose()

In [None]:
# L1 distance per completed loop iteration, shown as a single-row table.
pd.DataFrame(list(l1.values())).transpose()