In [1]:
import warnings
warnings.resetwarnings()

import scprep
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import random

import os

import magic

from tqdm import tqdm
from sklearn.metrics import mean_squared_error as mse

In [2]:
def get_data_for_i(i, data_dir='../../data/1k_cell'):
    """Load the reference matrix and the ``drp_{i}0.csv`` matrix, and build a mask.

    Parameters
    ----------
    i : int
        Level selector. It picks the file ``drp_{i}0.csv`` and sets the mask
        size to ``int(n_entries / 10 * i)`` entries.
    data_dir : str, optional
        Directory holding ``data.csv`` and the ``drp_*.csv`` files. Defaults
        to the path that was previously hard-coded.

    Returns
    -------
    df : pandas.DataFrame
        ``log(x + 1)`` of the ``drp_{i}0.csv`` matrix, restricted to the
        columns whose sign-sum in that matrix is positive.
    mask : list of tuple(int, int)
        ``(row, column)`` entries taken from a seeded shuffle of every cell of
        the unfiltered matrix, minus the entries whose column was dropped.
    original : pandas.DataFrame
        ``log(x + 1)`` of ``data.csv``, restricted to the same columns and
        relabelled with the integer labels of ``df``.
    """
    original_ = pd.read_csv(os.path.join(data_dir, 'data.csv'), index_col=0)
    df_ = pd.read_csv(os.path.join(data_dir, 'drp_{}0.csv'.format(i)), index_col=0)
    df_.index = [int(label) for label in df_.index]
    df_.columns = [int(label) for label in df_.columns]

    # Both frames share the integer labels of the drp_* matrix.
    original_.columns = df_.columns
    original_.index = df_.index

    # Deterministic shuffle of every (row, col) position; the fixed seed keeps
    # the mask reproducible across runs and across values of ``i``.
    cells = list(np.ndindex(original_.shape))
    random.Random(42).shuffle(cells)
    mask = cells[:int(len(cells) / 10 * i)]

    # Keep only columns with a positive sign-sum in the drp_* matrix.
    # ``axis=0`` is explicit because reducing a DataFrame through ``np.sum``
    # relies on an implicit axis that newer pandas versions deprecate.
    keep = np.sign(df_).sum(axis=0) > 0
    original_ = original_.loc[:, keep.values]
    df_ = df_.loc[:, keep.values]

    original = np.log(original_ + 1)
    df = np.log(df_ + 1)

    # NOTE(review): mask entries are positions from ``np.ndindex`` while the
    # dropped columns are identified by label; the two only agree when the
    # labels are 0..n-1 -- confirm against the CSV files.
    # A set gives O(1) membership tests; the mask can hold millions of entries.
    removed = {int(label) for label in keep.index[~keep.values]}
    mask = [cell for cell in mask if cell[1] not in removed]

    return df, mask, original

In [3]:
def get_cos_sim(vector1, vector2):
    """Return the cosine similarity between ``vector1`` and ``vector2``.

    The value is the dot product of the inputs divided by the product of
    their norms (``np.linalg.norm`` with default arguments).
    """
    # Product of the vector magnitudes (norms).
    magnitude_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)

    # Cosine similarity: the dot product scaled by the magnitudes.
    return np.dot(vector1, vector2) / magnitude_product

In [4]:
def get_l1norm(vector1, vector2):
    """Return the L1 distance: the sum of absolute element-wise differences.

    The result is a plain sum, not a mean, so it grows with the number of
    elements compared.
    """
    absolute_gaps = np.abs(vector1 - vector2)
    return np.sum(absolute_gaps)

In [5]:
# Per-level metrics, keyed by the loop index ``i`` (data is loaded for level i + 1).
mses = {}
corrs = {}
coss = {}
l1 = {}

method = 'MAGIC'


def _values_at(frame, cells):
    """Return ``frame`` values at ``(row label, column label)`` pairs as a 1-D array.

    Equivalent to ``np.array([frame.loc[cell] for cell in cells])`` for a frame
    with unique labels, but resolves all labels in one vectorised step instead
    of doing one scalar ``.loc`` lookup per cell.

    Raises
    ------
    KeyError
        If any requested label is absent from the frame.
    """
    pairs = np.asarray(cells).reshape(-1, 2)
    row_pos = frame.index.get_indexer(pairs[:, 0])
    col_pos = frame.columns.get_indexer(pairs[:, 1])
    # ``get_indexer`` reports a missing label as -1; fail loudly, as ``.loc``
    # would, rather than silently wrapping around to the last row/column.
    if (row_pos < 0).any() or (col_pos < 0).any():
        raise KeyError('mask contains labels that are missing from the frame')
    return frame.values[row_pos, col_pos]


for i in range(9):
    print(i)
    df, mask, original = get_data_for_i(i + 1)

    magic_op = magic.MAGIC()
    pred = magic_op.fit_transform(df)

    # Compare reference and imputed values on the masked entries only.
    origin = _values_at(original, mask)
    predict = _values_at(pred, mask)

    mses[i] = mse(origin, predict)
    corrs[i] = np.corrcoef(origin, predict)[0][1]
    coss[i] = get_cos_sim(origin, predict)
    l1[i] = get_l1norm(origin, predict)

0
Calculating MAGIC...
  Running MAGIC on 1000 cells and 12345 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):


    Calculated PCA in 3.46 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.15 seconds.
    Calculating affinities...
    Calculated affinities in 0.11 seconds.
  Calculated graph and diffusion operator in 3.75 seconds.
  Running MAGIC with `solver='exact'` on 12345-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...
  Calculated imputation in 0.14 seconds.
Calculated MAGIC in 3.93 seconds.


  with _logger.task("imputation"):


1
Calculating MAGIC...
  Running MAGIC on 1000 cells and 12333 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):


    Calculated PCA in 3.77 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.15 seconds.
    Calculating affinities...
    Calculated affinities in 0.11 seconds.
  Calculated graph and diffusion operator in 4.06 seconds.
  Running MAGIC with `solver='exact'` on 12333-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...
  Calculated imputation in 0.14 seconds.
Calculated MAGIC in 4.23 seconds.


  with _logger.task("imputation"):


2
Calculating MAGIC...
  Running MAGIC on 1000 cells and 12308 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):


    Calculated PCA in 2.39 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.16 seconds.
    Calculating affinities...
    Calculated affinities in 0.18 seconds.
  Calculated graph and diffusion operator in 2.80 seconds.
  Running MAGIC with `solver='exact'` on 12308-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...
  Calculated imputation in 0.15 seconds.
Calculated MAGIC in 3.02 seconds.


  with _logger.task("imputation"):


3
Calculating MAGIC...
  Running MAGIC on 1000 cells and 12279 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):


    Calculated PCA in 7.21 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.15 seconds.
    Calculating affinities...
    Calculated affinities in 0.11 seconds.
  Calculated graph and diffusion operator in 7.50 seconds.
  Running MAGIC with `solver='exact'` on 12279-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...
  Calculated imputation in 0.13 seconds.
Calculated MAGIC in 7.66 seconds.


  with _logger.task("imputation"):


4
Calculating MAGIC...
  Running MAGIC on 1000 cells and 12237 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):


    Calculated PCA in 3.72 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.18 seconds.
    Calculating affinities...
    Calculated affinities in 0.12 seconds.
  Calculated graph and diffusion operator in 4.04 seconds.
  Running MAGIC with `solver='exact'` on 12237-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...
  Calculated imputation in 0.13 seconds.
Calculated MAGIC in 4.21 seconds.


  with _logger.task("imputation"):


5
Calculating MAGIC...
  Running MAGIC on 1000 cells and 12182 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):


    Calculated PCA in 3.53 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.15 seconds.
    Calculating affinities...
    Calculated affinities in 0.11 seconds.
  Calculated graph and diffusion operator in 3.83 seconds.
  Running MAGIC with `solver='exact'` on 12182-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...
  Calculated imputation in 0.16 seconds.
Calculated MAGIC in 4.03 seconds.


  with _logger.task("imputation"):


6
Calculating MAGIC...
  Running MAGIC on 1000 cells and 12122 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):


    Calculated PCA in 7.32 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.13 seconds.
    Calculating affinities...
    Calculated affinities in 0.11 seconds.
  Calculated graph and diffusion operator in 7.59 seconds.
  Running MAGIC with `solver='exact'` on 12122-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...
  Calculated imputation in 0.14 seconds.
Calculated MAGIC in 7.78 seconds.


  with _logger.task("imputation"):


7
Calculating MAGIC...
  Running MAGIC on 1000 cells and 12029 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):


    Calculated PCA in 5.10 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.16 seconds.
    Calculating affinities...
    Calculated affinities in 0.13 seconds.
  Calculated graph and diffusion operator in 5.42 seconds.
  Running MAGIC with `solver='exact'` on 12029-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...
  Calculated imputation in 0.15 seconds.
Calculated MAGIC in 5.62 seconds.


  with _logger.task("imputation"):


8
Calculating MAGIC...
  Running MAGIC on 1000 cells and 11882 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):


    Calculated PCA in 7.34 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.15 seconds.
    Calculating affinities...
    Calculated affinities in 0.11 seconds.
  Calculated graph and diffusion operator in 7.64 seconds.
  Running MAGIC with `solver='exact'` on 11882-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...
  Calculated imputation in 0.14 seconds.
Calculated MAGIC in 7.81 seconds.


  with _logger.task("imputation"):


In [6]:
# Mean squared error per loop index, laid out as a single row.
pd.DataFrame(mses.values()).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.16496,0.176621,0.193429,0.214303,0.240517,0.271402,0.305915,0.34198,0.378075


In [7]:
# Correlation coefficient per loop index, laid out as a single row.
pd.DataFrame(corrs.values()).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.712891,0.707462,0.700366,0.693561,0.683786,0.670573,0.651113,0.619891,0.567223


In [8]:
# Cosine similarity per loop index, laid out as a single row.
pd.DataFrame(coss.values()).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.769276,0.765136,0.759867,0.754814,0.747572,0.737827,0.72344,0.700967,0.664535


In [9]:
# L1 distance per loop index, laid out as a single row. This is a sum over the
# masked entries, so it also grows with the mask size.
pd.DataFrame(l1.values()).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,296724.895712,596270.853041,903775.051863,1221059.0,1552907.0,1899296.0,2252073.0,2592266.0,2905910.0
