In [17]:
import scanpy as sc
import scipy as sci
import numpy as np
import anndata
import time
from sklearn.decomposition import PCA
import fast_histogram
import logging
logging.basicConfig(level=logging.INFO)


In [2]:
# Path (relative to this notebook) of the pre-processed PBMC 3k AnnData file.
preprocessed_results = '../../test_data/inputs/10x/PBMC/3k/pre-processed/pbmc3k_preprocessed.h5ad'
# Read the .h5ad file into an AnnData object, then convert it to a pandas DataFrame.
adata = anndata.read_h5ad(preprocessed_results)
frame = adata.to_df()
# Last expression of the cell: its value is shown as the cell output.
# NOTE(review): rows are presumably cells and columns genes -- confirm against the input file.
frame.shape


(2496, 10499)

In [10]:
%memit
start = time.time()
embedding = PCA(n_components=100)
frame_dr = embedding.fit_transform(frame)
frame_dr.shape
end = time.time()
runtime = end - start
msg = "The runtime for PCA took {} seconds to complete".format(runtime)
logging.info(msg)



peak memory: 349.61 MiB, increment: -0.25 MiB


INFO:root:The runtime for PCA took 1.1483678817749023 seconds to complete


In [11]:
# Load the memory_profiler IPython extension; it must be loaded before any cell
# that uses the %memit magic is executed.
%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [76]:
def calc_mi_f(arr1, arr2, bins, m):
    """Estimate the mutual information between two value vectors from a 2-D histogram.

    The joint distribution P(arr1, arr2) is approximated by binning the paired values into a
    ``bins`` x ``bins`` grid. The marginals are the row and column sums of that grid,
    renormalised so that each sums to one.

    Args:
        arr1 (numpy.ndarray): 1-D values for the first cell; must provide ``min()`` and ``max()``.
        arr2 (numpy.ndarray): 1-D values for the second cell, paired element-wise with ``arr1``.
        bins           (int): number of bins along each axis of the 2-D histogram.
        m              (int): divisor that turns bin counts into frequencies. The joint
                              frequencies only sum to one when ``m`` equals the number of
                              paired values.

    Returns:
        float: sum over non-empty bins of P(x, y) * ln(P(x, y) / (P(x) * P(y))), in nats.
    """
    # The upper edge is widened slightly so that the maximum value is counted
    # (presumably because fast_histogram treats the upper bound as exclusive).
    fq = fast_histogram.histogram2d(
        arr1, arr2,
        range=[[arr1.min(), arr1.max() + 1e-9], [arr2.min(), arr2.max() + 1e-9]],
        bins=(bins, bins),
    ) / float(m)
    # Row and column sums give the (unnormalised) marginals of arr1 and arr2.
    sm = fq.sum(axis=1)
    tm = fq.sum(axis=0)
    # Product of the normalised marginals: sm_tm[i, j] = P(x_i) * P(y_j).
    sm_tm = np.outer(sm / sm.sum(), tm / tm.sum())
    # The masked ufunc calls leave 0 wherever the operation would be undefined
    # (division by zero, log of zero), so empty bins contribute nothing to the sum.
    div = np.divide(fq, sm_tm, where=sm_tm != 0, out=np.zeros_like(fq))
    ent = np.log(div, where=div != 0, out=np.zeros_like(div))
    agg = np.multiply(fq, ent, out=np.zeros_like(fq), where=fq != 0)
    return agg.sum()

In [15]:
# Number of histogram bins: cube root of the number of rows of frame_dr, truncated to an int.
num_bins = int((frame_dr.shape[0]) ** (1 / 3.0))
# Number of columns of frame_dr. After the PCA step these are principal components
# rather than genes, despite the variable name.
num_genes = frame_dr.shape[1]

In [77]:
# Time one mutual-information evaluation between the first two rows of frame_dr.
%timeit calc_mi_f(frame_dr[0], frame_dr[1], num_bins, num_genes)

152 µs ± 5.86 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [40]:
# 1-D histogram of the first row of frame_dr, using the same bin count and widened upper
# edge as the 2-D histograms; dividing by num_genes converts counts to frequencies.
arr = frame_dr[0]
fast_histogram.histogram1d(arr, bins=num_bins, range=[arr.min(), arr.max()+1e-9]) / num_genes


array([0.02, 0.  , 0.03, 0.04, 0.14, 0.14, 0.18, 0.18, 0.14, 0.09, 0.01,
       0.  , 0.03])

In [73]:
# Precompute one 1-D bin-frequency vector per row of frame_dr so that pairwise
# computations can look the marginals up instead of re-deriving them.
num_cells = frame_dr.shape[0]
# Every row of `marginals` is overwritten in the loop below, so np.empty is sufficient.
marginals = np.empty((num_cells, num_bins))
for index, cell in enumerate(frame_dr):
    # Same binning as the 2-D histograms: num_bins bins over [min, max + 1e-9].
    ht1d = fast_histogram.histogram1d(cell, bins=num_bins, range=[cell.min(), cell.max()+1e-9]) / num_genes
    marginals[index] = ht1d
# Show the marginals of the first two rows.
print(marginals[0])
print(marginals[1])
# Cell output: the first marginal displayed as a column matrix.
np.transpose(np.asmatrix(marginals[0]))

[0.02 0.   0.03 0.04 0.14 0.14 0.18 0.18 0.14 0.09 0.01 0.   0.03]
[0.01 0.   0.   0.   0.   0.   0.05 0.13 0.17 0.19 0.23 0.16 0.06]


matrix([[0.02],
        [0.  ],
        [0.03],
        [0.04],
        [0.14],
        [0.14],
        [0.18],
        [0.18],
        [0.14],
        [0.09],
        [0.01],
        [0.  ],
        [0.03]])

In [78]:
def calc_mi_f2(arr1, arr2, marginals, index1, index2, bins, m):
    """Estimate mutual information between two value vectors using precomputed marginals.

    The joint distribution P(arr1, arr2) is approximated with a ``bins`` x ``bins`` 2-D
    histogram. The marginal distributions are read from ``marginals`` rather than derived
    from the joint histogram.

    Args:
        arr1  (numpy.ndarray): 1-D values for the first cell; must provide ``min()`` and ``max()``.
        arr2  (numpy.ndarray): 1-D values for the second cell, paired element-wise with ``arr1``.
        marginals (numpy.ndarray): 2-D array whose rows are 1-D bin-frequency vectors of length
                                   ``bins``. They are expected to have been built with the same
                                   bin count and value range as used here.
        index1          (int): row of ``marginals`` holding the marginal of ``arr1``.
        index2          (int): row of ``marginals`` holding the marginal of ``arr2``.
        bins            (int): number of bins along each axis of the 2-D histogram.
        m               (int): divisor that turns bin counts into frequencies.

    Returns:
        float: sum over non-empty bins of P(x, y) * ln(P(x, y) / (P(x) * P(y))), in nats.
    """
    # The upper edge is widened slightly so that the maximum value is counted
    # (presumably because fast_histogram treats the upper bound as exclusive).
    fq = fast_histogram.histogram2d(
        arr1, arr2,
        range=[[arr1.min(), arr1.max() + 1e-9], [arr2.min(), arr2.max() + 1e-9]],
        bins=(bins, bins),
    ) / float(m)
    # Product of the two marginals: sm_tm[i, j] = P(x_i) * P(y_j).
    sm_tm = np.outer(marginals[index1], marginals[index2])
    # The masked ufunc calls leave 0 wherever the operation would be undefined
    # (division by zero, log of zero), so empty bins contribute nothing to the sum.
    div = np.divide(fq, sm_tm, where=sm_tm != 0, out=np.zeros_like(fq))
    ent = np.log(div, where=div != 0, out=np.zeros_like(div))
    agg = np.multiply(fq, ent, out=np.zeros_like(fq), where=fq != 0)
    return agg.sum()


In [80]:
# Spot-check calc_mi_f2 on the first two rows of frame_dr using the precomputed
# marginals (the %timeit benchmark is disabled here).
calc_mi_f2(frame_dr[0], frame_dr[1], marginals, 0, 1, num_bins, num_genes)

0.2893862952844467

In [None]:
def calc_marginals(frame_dr, num_bins, num_genes):
    """Compute a 1-D bin-frequency vector for every row of ``frame_dr``.

    Each row is binned into ``num_bins`` bins spanning that row's own value range, and the
    counts are divided by ``num_genes``.

    Args:
        frame_dr (numpy.ndarray): 2-D array with one row per cell.
        num_bins           (int): number of histogram bins per row.
        num_genes          (int): divisor that turns bin counts into frequencies; each row sums
                                  to one only when this equals the number of columns.

    Returns:
        numpy.ndarray: array of shape ``(frame_dr.shape[0], num_bins)``. It has zero rows when
        ``frame_dr`` has none.
    """
    num_cells = frame_dr.shape[0]
    # Every row is overwritten in the loop below, so np.empty is sufficient.
    marginals = np.empty((num_cells, num_bins))
    for index, cell in enumerate(frame_dr):
        # The upper edge is widened slightly so that the row maximum is counted
        # (presumably because fast_histogram treats the upper bound as exclusive).
        marginals[index] = fast_histogram.histogram1d(
            cell, bins=num_bins, range=[cell.min(), cell.max() + 1e-9]
        ) / num_genes
    return marginals

In [98]:
def calc_norm_mi_marginal(arr1, arr2, marginals, index1, index2, bins, m):
    """Compute a normalized mutual-information distance using precomputed marginals.

    The joint distribution P(arr1, arr2) is approximated with a ``bins`` x ``bins`` 2-D
    histogram and the marginals are read from ``marginals``. With joint entropy H and mutual
    information I (both in nats), the returned distance is ``(H - I) / H``.

    Args:
        arr1  (numpy.ndarray): 1-D values for the first cell; must provide ``min()`` and ``max()``.
        arr2  (numpy.ndarray): 1-D values for the second cell, paired element-wise with ``arr1``.
        marginals (numpy.ndarray): 2-D array whose rows are 1-D bin-frequency vectors of length
                                   ``bins``. They are expected to have been built with the same
                                   bin count and value range as used here.
        index1          (int): row of ``marginals`` holding the marginal of ``arr1``.
        index2          (int): row of ``marginals`` holding the marginal of ``arr2``.
        bins            (int): number of bins along each axis of the 2-D histogram.
        m               (int): divisor that turns bin counts into frequencies.

    Returns:
        float: ``(H - I) / H``, or ``0.0`` when the joint entropy is zero (all pairs fall into
        a single bin), where the ratio would otherwise be 0/0.
    """
    # The upper edge is widened slightly so that the maximum value is counted
    # (presumably because fast_histogram treats the upper bound as exclusive).
    fq = fast_histogram.histogram2d(
        arr1, arr2,
        range=[[arr1.min(), arr1.max() + 1e-9], [arr2.min(), arr2.max() + 1e-9]],
        bins=(bins, bins),
    ) / float(m)
    # Product of the two marginals: sm_tm[i, j] = P(x_i) * P(y_j).
    sm_tm = np.outer(marginals[index1], marginals[index2])
    # The masked ufunc calls leave 0 wherever the operation would be undefined
    # (division by zero, log of zero), so empty bins contribute nothing to the sums.
    div = np.divide(fq, sm_tm, where=sm_tm != 0, out=np.zeros_like(fq))
    ent = np.log(div, where=div != 0, out=np.zeros_like(div))
    agg = np.multiply(fq, ent, out=np.zeros_like(fq), where=fq != 0)
    # Joint entropy H = -sum P(x, y) * ln P(x, y); log_fq is 0 in empty bins.
    log_fq = np.log(fq, where=fq != 0, out=np.zeros_like(fq))
    joint_ent = -np.multiply(fq, log_fq, out=np.zeros_like(fq), where=fq != 0).sum()
    if joint_ent == 0:
        # Degenerate histogram: both vectors collapse into one bin, so treat them as identical.
        return 0.0
    return (joint_ent - agg.sum()) / joint_ent

In [99]:
# Spot-check the normalized MI distance between rows 0 and 1 of frame_dr using the
# precomputed marginals (the %timeit benchmark is disabled here).
calc_norm_mi_marginal(frame_dr[0], frame_dr[1], marginals, 0, 1, num_bins, num_genes)

0.9220349966842364

In [100]:
# Same spot-check for rows 0 and 2 of frame_dr.
calc_norm_mi_marginal(frame_dr[0], frame_dr[2], marginals, 0, 2, num_bins, num_genes)



0.9143521520120568

In [102]:
def calc_norm_mi(arr1, arr2, bins, m):
    """Compute a normalized mutual-information distance between two value vectors.

    The joint distribution P(arr1, arr2) is approximated with a ``bins`` x ``bins`` 2-D
    histogram; the marginals are its renormalised row and column sums. With joint entropy H
    and mutual information I (both in nats), the returned distance is ``(H - I) / H``.

    Args:
        arr1 (numpy.ndarray): 1-D values for the first cell; must provide ``min()`` and ``max()``.
        arr2 (numpy.ndarray): 1-D values for the second cell, paired element-wise with ``arr1``.
        bins           (int): number of bins along each axis of the 2-D histogram.
        m              (int): divisor that turns bin counts into frequencies. The joint
                              frequencies only sum to one when ``m`` equals the number of
                              paired values.

    Returns:
        float: ``(H - I) / H``, or ``0.0`` when the joint entropy is zero (all pairs fall into
        a single bin), where the ratio would otherwise be 0/0.
    """
    # The upper edge is widened slightly so that the maximum value is counted
    # (presumably because fast_histogram treats the upper bound as exclusive).
    fq = fast_histogram.histogram2d(
        arr1, arr2,
        range=[[arr1.min(), arr1.max() + 1e-9], [arr2.min(), arr2.max() + 1e-9]],
        bins=(bins, bins),
    ) / float(m)
    # Row and column sums give the (unnormalised) marginals of arr1 and arr2.
    sm = fq.sum(axis=1)
    tm = fq.sum(axis=0)
    # Product of the normalised marginals: sm_tm[i, j] = P(x_i) * P(y_j).
    sm_tm = np.outer(sm / sm.sum(), tm / tm.sum())
    # The masked ufunc calls leave 0 wherever the operation would be undefined
    # (division by zero, log of zero), so empty bins contribute nothing to the sums.
    div = np.divide(fq, sm_tm, where=sm_tm != 0, out=np.zeros_like(fq))
    ent = np.log(div, where=div != 0, out=np.zeros_like(div))
    agg = np.multiply(fq, ent, out=np.zeros_like(fq), where=fq != 0)
    # Joint entropy H = -sum P(x, y) * ln P(x, y); log_fq is 0 in empty bins.
    log_fq = np.log(fq, where=fq != 0, out=np.zeros_like(fq))
    joint_ent = -np.multiply(fq, log_fq, out=np.zeros_like(fq), where=fq != 0).sum()
    if joint_ent == 0:
        # Degenerate histogram: both vectors collapse into one bin, so treat them as identical.
        return 0.0
    return (joint_ent - agg.sum()) / joint_ent

In [104]:
# Time one normalized-MI distance evaluation between the first two rows of frame_dr.
%timeit calc_norm_mi(frame_dr[0], frame_dr[1], num_bins, num_genes)

171 µs ± 1.4 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [109]:
# DistanceMetric now lives in sklearn.metrics; the sklearn.neighbors path is a deprecated
# alias, so fall back to it only on older scikit-learn releases.
try:
    from sklearn.metrics import DistanceMetric
except ImportError:
    from sklearn.neighbors import DistanceMetric
# Euclidean baseline to compare against the mutual-information timings.
dist = DistanceMetric.get_metric('euclidean')
# The same two rows of frame_dr that the MI benchmarks use.
X = [frame_dr[0], frame_dr[1]]

In [111]:
# Time the pairwise Euclidean distance computation on the two rows held in X.
%timeit dist.pairwise(X)

9.29 µs ± 335 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
