In [1]:
import scanpy as sc
import scipy as sci
import numpy as np
import anndata
import logging
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
from sklearn.manifold import SpectralEmbedding
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import kneighbors_graph
import time

In [4]:
b = np.array([[8,1,7], [4,3,9], [5,2,6]])
print(np.apply_along_axis(sorted, 1, b))
b[1, :]

[[1 7 8]
 [3 4 9]
 [2 5 6]]


array([4, 3, 9])

In [5]:
logging.basicConfig(level=logging.INFO)

In [6]:
preprocessed_results = '../../test_data/inputs/10x/PBMC/3k/pre-processed/pbmc3k_preprocessed.h5ad'
adata = anndata.read_h5ad(preprocessed_results)
frame = adata.to_df()
frame.shape

(2496, 10499)

In [5]:
%load_ext memory_profiler

In [20]:
%memit
start = time.time()
embedding = MDS(n_components=100, n_jobs=4)
frame_dr = embedding.fit_transform(frame)
logging.info(frame_dr.shape)
end = time.time()
runtime = end - start
msg = "The runtime for MDS took {} seconds to complete".format(runtime)
logging.info(msg)

peak memory: 443.01 MiB, increment: -0.15 MiB


KeyboardInterrupt: 

In [6]:
%memit
start = time.time()
embedding = PCA(n_components=100)
frame_dr = embedding.fit_transform(frame)
frame_dr.shape
end = time.time()
runtime = end - start
msg = "The runtime for PCA took {} seconds to complete".format(runtime)
logging.info(msg)




peak memory: 320.13 MiB, increment: -0.34 MiB


INFO:root:The runtime for PCA took 1.654876947402954 seconds to complete


In [7]:
preprocessed_results = '/Users/lding/Documents/MICA/Datasets/filtered_gene_bc_matrices/hg19/pbmc33k_preprocessed.h5ad'
adata = anndata.read_h5ad(preprocessed_results)
frame = adata.to_df()
frame.shape


(30716, 9886)

In [9]:
%load_ext memory_profiler


In [None]:
%memit
start = time.time()
embedding = MDS(n_components=100, n_jobs=4)
frame_dr = embedding.fit_transform(frame)
logging.info(frame_dr.shape)
end = time.time()
runtime = end - start
msg = "The runtime for MDS took {} seconds to complete".format(runtime)
logging.info(msg)


peak memory: 1600.33 MiB, increment: -0.11 MiB


In [10]:
%memit
start = time.time()
embedding = PCA(n_components=100)
frame_dr = embedding.fit_transform(frame)
print(frame_dr.shape)
end = time.time()
runtime = end - start
msg = "The runtime for PCA took {} seconds to complete".format(runtime)
logging.info(msg)


peak memory: 313.68 MiB, increment: -0.27 MiB
(2496, 100)


INFO:root:The runtime for PCA took 1.7927229404449463 seconds to complete


In [None]:
%memit
start = time.time()
embedding = SpectralEmbedding(n_components=100)
frame_dr = embedding.fit_transform(frame)
frame_dr.shape
end = time.time()
runtime = end - start
msg = "The runtime for PCA took {} seconds to complete".format(runtime)
logging.info(msg)


peak memory: 1554.87 MiB, increment: -0.06 MiB


In [7]:
print(frame_dr.shape)
frame_dr


(2496, 100)


array([[-21.32925   ,  -5.310842  ,  -0.5425027 , ...,  -1.5629748 ,
          4.2057347 ,  -3.0832164 ],
       [ -9.444432  , -51.72805   ,  -9.201903  , ...,  -7.9191704 ,
        -21.303808  ,   6.881933  ],
       [ 64.72338   ,  -0.9341317 ,  -2.048853  , ..., -12.259157  ,
         -5.5743666 ,   0.33352754],
       ...,
       [-17.608286  ,  37.80875   , -20.959614  , ...,  -0.23672865,
        -10.469886  ,   0.6343327 ],
       [-18.737597  ,  55.075397  ,   0.6195772 , ...,   0.4294223 ,
         -8.523884  ,  -2.6892688 ],
       [-25.820639  ,  -1.7180945 , -11.768476  , ...,   8.850921  ,
        -12.339821  ,   4.1392007 ]], dtype=float32)

In [48]:
def calc_mi(arr1, arr2, bins, m):
    """ Calculates mutual information in between two cells, considering their gene expression levels

    This function is called by calc_distance_mat. It takes gene expression data from single cells,
    and compares them using standard calculation for mutual information. It builds a 2d histogram,
    which is used to calculate P(arr1, arr2)

    Args:
        arr1 (pandas series): gene expression data for a given cell in matrix_1
        arr2 (pandas series):
        bins           (int):
        m              (int):
    """
    fq = np.histogram2d(arr1, arr2, bins=(bins, bins))[0] / float(m)
    # sm = np.sum(fq * float(m), axis=1)
    # tm = np.sum(fq * float(m), axis=0)
    # sm = np.asmatrix(sm / float(sm.sum()))
    # tm = np.asmatrix(tm / float(tm.sum()))
    # sm_tm = np.matmul(np.transpose(sm), tm)
    # div = np.divide(fq, sm_tm, where=sm_tm != 0, out=np.zeros_like(fq))
    # ent = np.log(div, where=div != 0, out=np.zeros_like(div))
    # agg = np.multiply(fq, ent, out=np.zeros_like(fq), where=fq != 0)
    # return agg.sum()
    return fq


In [12]:
num_bins = int((frame_dr.shape[0]) ** (1 / 3.0))
num_genes = frame_dr.shape[1]
print(num_bins)
print(num_genes)
metric_params = {"bins": num_bins, "m": num_genes}
type(frame_dr)

13
100


numpy.ndarray

In [40]:
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree', leaf_size=5, metric=calc_mi, metric_params=metric_params)
nbrs.fit(frame_dr)
nbrs.get_params()

{'algorithm': 'ball_tree',
 'leaf_size': 5,
 'metric': <function __main__.calc_mi(arr1, arr2, bins, m)>,
 'metric_params': {'bins': 13, 'm': 100},
 'n_jobs': None,
 'n_neighbors': 2,
 'p': 2,
 'radius': 1.0}

In [38]:
%timeit
G = nbrs.kneighbors_graph(frame_dr)
print(G)


KeyboardInterrupt: 

In [53]:
%%timeit
kn_graph = kneighbors_graph(frame_dr, n_neighbors=10, include_self=False).toarray()
print(kn_graph)


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ...

In [63]:
from sklearn.neighbors import DistanceMetric
dist = DistanceMetric.get_metric('euclidean')

In [64]:
dist.pairwise(frame_dr[0], frame_dr[1])


ValueError: Buffer has wrong number of dimensions (expected 2, got 1)

In [65]:
from sklearn.metrics import mutual_info_score
def calc_MI(x, y, bins):
    c_xy = np.histogram2d(x, y, bins)[0]
    mi = mutual_info_score(None, None, contingency=c_xy)
    return mi

In [10]:
%timeit calc_MI(frame_dr[0], frame_dr[1], num_bins)

NameError: name 'calc_MI' is not defined

In [19]:
%timeit calc_mi(frame_dr[0], frame_dr[1], num_bins, num_genes)

373 µs ± 5.61 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [42]:
%timeit np.linalg.norm(frame_dr[0]-frame_dr[1])


6.9 µs ± 444 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [114]:
his1 = np.histogram2d(frame_dr[0], frame_dr[0], range=[[frame_dr[0].min(), frame_dr[0].max()], [frame_dr[1].min(), frame_dr[1].max()]], bins=(num_bins, num_bins))
sum(sum(his1[0]))

100.0

In [30]:
import fast_histogram

In [31]:
his2 = fast_histogram.histogram2d(frame_dr[0], frame_dr[0], range=[[frame_dr[0].min(), frame_dr[0].max()+1e-9], [frame_dr[1].min(), frame_dr[1].max()+1e-9]], bins=(num_bins, num_bins))
his2.shape
sum(sum(his2))


100.0

In [45]:
def calc_mi_f(arr1, arr2, bins, m):
    """ Calculates mutual information in between two cells, considering their gene expression levels

    This function is called by calc_distance_mat. It takes gene expression data from single cells,
    and compares them using standard calculation for mutual information. It builds a 2d histogram,
    which is used to calculate P(arr1, arr2)

    Args:
        arr1 (pandas series): gene expression data for a given cell in matrix_1
        arr2 (pandas series):
        bins           (int):
        m              (int):
    """
    fq = fast_histogram.histogram2d(arr1, arr2, range=[[arr1.min(), arr1.max()+1e-9], [arr2.min(), arr2.max()+1e-9]],
                                    bins=(bins, bins)) / float(m)
    # sm = np.sum(fq * float(m), axis=1)
    # tm = np.sum(fq * float(m), axis=0)
    # sm = np.asmatrix(sm / float(sm.sum()))
    # tm = np.asmatrix(tm / float(tm.sum()))
    # sm_tm = np.matmul(np.transpose(sm), tm)
    # div = np.divide(fq, sm_tm, where=sm_tm != 0, out=np.zeros_like(fq))
    # ent = np.log(div, where=div != 0, out=np.zeros_like(div))
    # agg = np.multiply(fq, ent, out=np.zeros_like(fq), where=fq != 0)
    # return agg.sum()
    return fq

In [32]:
frame_dr[0]
num_bins
num_genes
frame_dr[1]

array([ -9.444456  , -51.72808   ,  -9.201859  ,  -4.813866  ,
         6.104797  ,  17.683878  ,   9.576095  ,  -7.449539  ,
       -10.030239  ,   7.4715824 ,  -9.092354  ,   3.9567683 ,
       -13.239668  ,   8.51443   , -15.592079  ,   3.5520077 ,
         6.5806293 ,  -2.5439947 ,  -3.5155458 ,  -4.724738  ,
         8.452022  ,   9.136798  , -10.916585  ,  10.847441  ,
       -10.141193  ,   6.484892  ,  11.182839  ,  -4.431957  ,
         5.9527683 , -12.526245  , -10.973021  ,  13.206249  ,
         5.3229218 , -10.177122  ,   4.1274843 ,  -3.8336153 ,
        -1.1042932 ,  -9.003187  ,  -0.1419278 ,  -1.0794963 ,
        -5.488525  ,  -9.350327  ,  -1.6100521 ,  13.424952  ,
         3.4475007 ,   2.9993792 ,   6.061851  ,   1.6283923 ,
         7.427852  ,  -4.9590573 ,  11.682909  ,   8.554527  ,
        -4.5747147 ,  13.450885  ,  10.298287  ,  -5.6879015 ,
         4.4091744 ,  -3.1478784 ,   8.960452  ,  -7.5510283 ,
         2.2874665 ,   6.543751  ,   5.173377  ,  -1.75

In [51]:
%timeit calc_mi_f(frame_dr[0], frame_dr[1], num_bins, num_genes)

36 µs ± 683 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [50]:
%timeit calc_mi(frame_dr[0], frame_dr[1], num_bins, num_genes)


246 µs ± 19.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [129]:
for i in range(0, 2496):
    for j in range(0, 100):
        m1 = calc_mi(frame_dr[i], frame_dr[j], num_bins, num_genes)
        m2 = calc_mi_f(frame_dr[i], frame_dr[j], num_bins, num_genes)
        if m1 != m2:
            print(m1)
            print(m2)
            break
frame_dr.shape



(2496, 100)

In [133]:
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree', leaf_size=5)
nbrs.fit(frame_dr)
nbrs.get_params()

{'algorithm': 'ball_tree',
 'leaf_size': 5,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 2,
 'p': 2,
 'radius': 1.0}

In [136]:
%%timeit
G = nbrs.kneighbors_graph(frame_dr)
# print(G)



836 ms ± 15.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [138]:
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree', leaf_size=5, metric=calc_mi_f, metric_params=metric_params)
nbrs.fit(frame_dr)
nbrs.get_params()

{'algorithm': 'ball_tree',
 'leaf_size': 5,
 'metric': <function __main__.calc_mi_f(arr1, arr2, bins, m)>,
 'metric_params': {'bins': 13, 'm': 100},
 'n_jobs': None,
 'n_neighbors': 2,
 'p': 2,
 'radius': 1.0}

In [140]:
%%timeit
G = nbrs.kneighbors_graph(frame_dr)

16min 44s ± 17.5 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
