In [1]:
import scanpy as sc
import scipy as sci
import numpy as np
import anndata
import time
from sklearn.decomposition import PCA
import fast_histogram
import logging
logging.basicConfig(level=logging.INFO)
import pandas as pd


In [2]:
# Load the preprocessed PBMC 3k AnnData file and convert it to a dataframe.
# The path is relative, so it resolves against the notebook's working directory.
preprocessed_results = '../../test_data/inputs/10x/PBMC/3k/pre-processed/pbmc3k_preprocessed.h5ad'
adata = anndata.read_h5ad(preprocessed_results)
frame = adata.to_df()
# Last expression of the cell: displays the dataframe's (rows, columns).
frame.shape

(2496, 10499)

In [None]:
# Dump the dataframe to CSV.
# NOTE(review): hard-coded absolute, user-specific path; this cell only works on the author's machine.
# NOTE(review): the file name says "pmbc3k" while the input is "pbmc3k" - possibly a typo, confirm.
frame.to_csv('/Users/lding/Documents/MICA/kgraph/pmbc3k.csv')

In [None]:
%memit
start = time.time()
embedding = PCA(n_components=100)
frame_dr = embedding.fit_transform(frame)
frame_dr.shape
end = time.time()
runtime = end - start
msg = "The runtime for PCA took {} seconds to complete".format(runtime)
logging.info(msg)



In [None]:
# Register the memory_profiler IPython extension.
# NOTE(review): %memit is already used in an earlier cell, so on a fresh kernel this
# has to run before that cell.
%load_ext memory_profiler

In [None]:
def calc_mi_f(arr1, arr2, bins, m):
    """Bin-based mutual information estimate between two paired 1-D value arrays.

    A ``bins`` x ``bins`` 2-D histogram over both arrays, divided by ``m``, is used as the
    joint distribution. The two marginals are re-derived from the row and column totals of
    that histogram, and the result is sum(p_xy * log(p_xy / (p_x * p_y))) over non-empty cells.

    Args:
        arr1 (array-like): values of the first variable; must provide .min() and .max()
        arr2 (array-like): values of the second variable, paired element-wise with arr1
        bins         (int): number of histogram bins along each axis
        m            (int): divisor that turns histogram counts into frequencies

    Returns:
        the summed mutual-information terms (natural logarithm)
    """
    # The upper edge is padded by 1e-9, presumably so the maximum value still falls inside the range.
    span1 = [arr1.min(), arr1.max()+1e-9]
    span2 = [arr2.min(), arr2.max()+1e-9]
    joint = fast_histogram.histogram2d(arr1, arr2, range=[span1, span2], bins=(bins, bins)) / float(m)

    # Row / column totals, each normalised by its own sum.
    scale = float(m)
    row_tot = np.sum(joint * scale, axis=1)
    col_tot = np.sum(joint * scale, axis=0)
    p_row = row_tot / float(row_tot.sum())
    p_col = col_tot / float(col_tot.sum())

    # Product of the marginals for every (row bin, column bin) pair.
    indep = p_row[:, np.newaxis] * p_col[np.newaxis, :]

    # Masked ufunc calls leave 0 wherever the operand is 0, avoiding 0/0 and log(0).
    ratio = np.zeros_like(joint)
    np.divide(joint, indep, out=ratio, where=indep != 0)
    log_ratio = np.zeros_like(ratio)
    np.log(ratio, out=log_ratio, where=ratio != 0)
    terms = np.zeros_like(joint)
    np.multiply(joint, log_ratio, out=terms, where=joint != 0)
    return terms.sum()

In [None]:
# Number of histogram bins: cube root of the number of rows of frame_dr, truncated to an int.
num_bins = int((frame_dr.shape[0]) ** (1 / 3.0))
# Named num_genes, but frame_dr is the PCA output, so this is its number of columns.
num_genes = frame_dr.shape[1]

In [None]:
# Time one mutual-information evaluation between the first two rows of frame_dr.
%timeit calc_mi_f(frame_dr[0], frame_dr[1], num_bins, num_genes)

In [None]:
# 1-D histogram of the first row of frame_dr, divided by num_genes (displayed as the cell output).
arr = frame_dr[0]
fast_histogram.histogram1d(arr, bins=num_bins, range=[arr.min(), arr.max()+1e-9]) / num_genes


In [None]:
# Build one 1-D histogram (num_bins bins over [min, max + 1e-9]) per row of frame_dr,
# divided by num_genes, and store it as the matching row of `marginals`.
num_cells = frame_dr.shape[0]
# np.empty leaves the array uninitialised; every row is overwritten in the loop below.
marginals = np.empty((num_cells, num_bins))
for index, cell in enumerate(frame_dr):
    ht1d = fast_histogram.histogram1d(cell, bins=num_bins, range=[cell.min(), cell.max()+1e-9]) / num_genes
    marginals[index] = ht1d
print(marginals[0])
print(marginals[1])
# Displayed as the cell output: the first marginal as a column matrix.
np.transpose(np.asmatrix(marginals[0]))

In [None]:
def calc_mi_f2(arr1, arr2, marginals, index1, index2, bins, m):
    """Bin-based mutual information estimate that looks up precomputed marginals.

    The joint distribution is a ``bins`` x ``bins`` 2-D histogram of the two arrays divided
    by ``m``. Instead of deriving the marginals from that histogram, rows ``index1`` and
    ``index2`` of ``marginals`` are used. The result is
    sum(p_xy * log(p_xy / (p_x * p_y))) over the non-empty cells.

    Args:
        arr1 (array-like): values of the first variable; must provide .min() and .max()
        arr2 (array-like): values of the second variable, paired element-wise with arr1
        marginals (ndarray): one marginal distribution per row
        index1       (int): row of marginals belonging to arr1
        index2       (int): row of marginals belonging to arr2
        bins         (int): number of histogram bins along each axis
        m            (int): divisor that turns histogram counts into frequencies

    Returns:
        the summed mutual-information terms (natural logarithm)
    """
    # The upper edge is padded by 1e-9, presumably so the maximum value still falls inside the range.
    span1 = [arr1.min(), arr1.max()+1e-9]
    span2 = [arr2.min(), arr2.max()+1e-9]
    joint = fast_histogram.histogram2d(arr1, arr2, range=[span1, span2], bins=(bins, bins)) / float(m)

    # Product of the two stored marginals for every (row bin, column bin) pair.
    p_row = np.asarray(marginals[index1]).reshape(-1, 1)
    p_col = np.asarray(marginals[index2]).reshape(1, -1)
    indep = p_row * p_col

    # Masked ufunc calls leave 0 wherever the operand is 0, avoiding 0/0 and log(0).
    ratio = np.zeros_like(joint)
    np.divide(joint, indep, out=ratio, where=indep != 0)
    log_ratio = np.zeros_like(ratio)
    np.log(ratio, out=log_ratio, where=ratio != 0)
    terms = np.zeros_like(joint)
    np.multiply(joint, log_ratio, out=terms, where=joint != 0)
    return terms.sum()


In [None]:
#%timeit
# Single evaluation for rows 0 and 1 using the precomputed marginals (timing left disabled above).
calc_mi_f2(frame_dr[0], frame_dr[1], marginals, 0, 1, num_bins, num_genes)

In [None]:
def calc_marginals(frame_dr, num_bins, num_genes):
    """Compute one binned marginal distribution per row of a 2-D array.

    Each row is histogrammed into ``num_bins`` bins over [row.min(), row.max() + 1e-9]
    and the counts are divided by ``num_genes``.

    Args:
        frame_dr (ndarray): 2-D array; iterated row by row
        num_bins     (int): number of histogram bins per row
        num_genes    (int): divisor that turns histogram counts into frequencies

    Returns:
        ndarray of shape (number of rows, num_bins); empty along the first axis when
        frame_dr has no rows
    """
    num_cells = frame_dr.shape[0]
    # np.empty is safe here because every row is overwritten in the loop.
    marginals = np.empty((num_cells, num_bins))
    for index, cell in enumerate(frame_dr):
        marginals[index] = fast_histogram.histogram1d(
            cell, bins=num_bins, range=[cell.min(), cell.max() + 1e-9]) / num_genes
    return marginals

In [None]:
def calc_norm_mi_marginal(arr1, arr2, marginals, index1, index2, bins, m):
    """Normalized mutual information distance D = 1 - I / H using precomputed marginals.

    The joint distribution is a ``bins`` x ``bins`` 2-D histogram of the two arrays divided
    by ``m``. I is the mutual information computed against rows ``index1`` / ``index2`` of
    ``marginals``; H is the joint entropy of the histogram.

    Args:
        arr1 (array-like): values of the first variable; must provide .min() and .max()
        arr2 (array-like): values of the second variable, paired element-wise with arr1
        marginals (ndarray): one marginal distribution per row
        index1       (int): row of marginals belonging to arr1
        index2       (int): row of marginals belonging to arr2
        bins         (int): number of histogram bins along each axis
        m            (int): divisor that turns histogram counts into frequencies

    Returns:
        (H - I) / H, or 0.0 when the joint entropy H is zero (all mass in a single joint
        bin), where the ratio would otherwise be 0 / 0.
    """
    fq = fast_histogram.histogram2d(arr1, arr2, range=[[arr1.min(), arr1.max()+1e-9], [arr2.min(), arr2.max()+1e-9]],
                                    bins=(bins, bins)) / float(m)
    # Product of the two stored marginals for every (row bin, column bin) pair.
    sm_tm = np.asarray(marginals[index1]).reshape(-1, 1) * np.asarray(marginals[index2]).reshape(1, -1)
    # Masked ufunc calls leave 0 wherever the operand is 0, avoiding 0/0 and log(0).
    div = np.divide(fq, sm_tm, where=sm_tm != 0, out=np.zeros_like(fq))
    ent = np.log(div, where=div != 0, out=np.zeros_like(div))
    agg = np.multiply(fq, ent, out=np.zeros_like(fq), where=fq != 0)
    joint_ent = -np.multiply(fq, np.log(fq, where=fq != 0, out=np.zeros_like(fq)),
                             out=np.zeros_like(fq), where=fq != 0).sum()
    if joint_ent == 0:
        # Degenerate case: nothing distinguishes the two variables, so treat the distance as 0.
        return 0.0
    return (joint_ent - agg.sum()) / joint_ent

In [None]:
# %timeit
# Single evaluation of the normalized distance between rows 0 and 1 (timing left disabled above).
calc_norm_mi_marginal(frame_dr[0], frame_dr[1], marginals, 0, 1, num_bins, num_genes)

In [None]:
# Same distance for rows 0 and 2.
calc_norm_mi_marginal(frame_dr[0], frame_dr[2], marginals, 0, 2, num_bins, num_genes)



In [None]:
def calc_norm_mi(arr1, arr2, bins, m):
    """Normalized mutual information distance D = 1 - I / H, bin-based.

    The joint distribution is a ``bins`` x ``bins`` 2-D histogram of the two arrays divided
    by ``m``. The marginals are derived from the histogram's row and column totals, I is
    the mutual information and H the joint entropy of the histogram.

    Args:
        arr1 (array-like): values of the first variable; must provide .min() and .max()
        arr2 (array-like): values of the second variable, paired element-wise with arr1
        bins         (int): number of histogram bins along each axis
        m            (int): divisor that turns histogram counts into frequencies

    Returns:
        (H - I) / H, or 0.0 when the joint entropy H is zero (all mass in a single joint
        bin), where the ratio would otherwise be 0 / 0.
    """
    fq = fast_histogram.histogram2d(arr1, arr2, range=[[arr1.min(), arr1.max()+1e-9], [arr2.min(), arr2.max()+1e-9]],
                                    bins=(bins, bins)) / float(m)
    # Row / column totals, each normalised by its own sum.
    sm = np.sum(fq * float(m), axis=1)
    tm = np.sum(fq * float(m), axis=0)
    sm = sm / float(sm.sum())
    tm = tm / float(tm.sum())
    # Product of the marginals for every (row bin, column bin) pair.
    sm_tm = sm[:, np.newaxis] * tm[np.newaxis, :]
    # Masked ufunc calls leave 0 wherever the operand is 0, avoiding 0/0 and log(0).
    div = np.divide(fq, sm_tm, where=sm_tm != 0, out=np.zeros_like(fq))
    ent = np.log(div, where=div != 0, out=np.zeros_like(div))
    agg = np.multiply(fq, ent, out=np.zeros_like(fq), where=fq != 0)
    joint_ent = -np.multiply(fq, np.log(fq, where=fq != 0, out=np.zeros_like(fq)),
                             out=np.zeros_like(fq), where=fq != 0).sum()
    if joint_ent == 0:
        # Degenerate case: nothing distinguishes the two variables, so treat the distance as 0.
        return 0.0
    return (joint_ent - agg.sum()) / joint_ent

In [None]:
# Time one normalized-MI distance evaluation between the first two rows of frame_dr.
%timeit calc_norm_mi(frame_dr[0], frame_dr[1], num_bins, num_genes)

In [None]:
# Newer scikit-learn releases expose DistanceMetric from sklearn.metrics; fall back to the
# sklearn.neighbors location for older installations.
try:
    from sklearn.metrics import DistanceMetric
except ImportError:
    from sklearn.neighbors import DistanceMetric
# Euclidean metric and a two-row sample, used as a timing baseline.
dist = DistanceMetric.get_metric('euclidean')
X = [frame_dr[0], frame_dr[1]]

In [None]:
# Time the pairwise Euclidean distance on the two-row sample X.
%timeit dist.pairwise(X)

In [None]:
def read_preprocessed_mat(in_file):
    """Read a preprocessed matrix file into a dataframe.

    Args:
        in_file (str or path-like): file to read. A ``.txt`` file is parsed as tab-separated
            text with the first column as the index; a ``.h5ad`` or ``.h5`` file is read
            with anndata.read_h5ad and converted with to_df().

    Returns:
        pandas.DataFrame with the file contents

    Raises:
        ValueError: if in_file has none of the supported extensions
    """
    path = str(in_file)
    if path.endswith('.txt'):
        return pd.read_csv(in_file, sep="\t", index_col=0)
    if path.endswith(('.h5ad', '.h5')):
        return anndata.read_h5ad(in_file).to_df()
    # Previously this fell through to `return frame` with `frame` never assigned.
    raise ValueError(
        "Unsupported file extension for {!r}; expected .txt, .h5ad or .h5".format(path))

In [2]:
# Reload the PBMC 3k AnnData object (rebinds the notebook-global `adata`).
# NOTE(review): hard-coded absolute, user-specific path; this cell only works on the author's machine.
adata = anndata.read_h5ad('/Users/lding/Git/MICA/test_data/inputs/10x/PBMC/3k/pre-processed/pbmc3k_preprocessed.h5ad')

In [3]:
# Show the dimensions of the expression matrix held in adata.X.
print(adata.X.shape)

(2496, 10499)


In [4]:
# Baseline: time the 10-nearest-neighbor search on adata.X with no metric argument given.
start = time.time()
# Returns neighbor indices, their distances and a forest object (the latter is not used in this cell).
indices, dists, forest = sc.neighbors.compute_neighbors_umap(adata.X, n_neighbors=10)
end = time.time()
runtime = end - start
msg = "The runtime for compute_neighbors_umap took {} seconds to complete".format(runtime)
logging.info(msg)

INFO:root:The runtime for compute_neighbors_umap took 123.29728174209595 seconds to complete


In [5]:
def calc_norm_mi(arr1, arr2, bins, m):
    """ Calculates a normalized mutual information distance D(X, Y) = 1 - I(X, Y)/H(X, Y) using bin-based method

    It takes gene expression data from single cells, and compares them using standard calculation for
    mutual information and joint entropy. It builds a 2d histogram, which is used to calculate P(arr1, arr2);
    the marginals are derived from the row and column totals of that histogram.

    Args:
        arr1 (pandas series): gene expression data for cell 1
        arr2 (pandas series): gene expression data for cell 2
        bins           (int): number of bins
        m              (int): number of genes (divisor that turns histogram counts into frequencies)
    Returns:
        (H - I) / H as a float; expected to lie between 0 and 1 when m equals the number of
        paired values. Returns 0.0 when the joint entropy H is zero (all mass in a single
        joint bin), where the ratio would otherwise be 0 / 0.
    """
    fq = fast_histogram.histogram2d(arr1, arr2, range=[[arr1.min(), arr1.max()+1e-9], [arr2.min(), arr2.max()+1e-9]],
                                    bins=(bins, bins)) / float(m)
    # Row / column totals, each normalised by its own sum.
    sm = np.sum(fq * float(m), axis=1)
    tm = np.sum(fq * float(m), axis=0)
    sm = sm / float(sm.sum())
    tm = tm / float(tm.sum())
    # Product of the marginals for every (row bin, column bin) pair.
    sm_tm = sm[:, np.newaxis] * tm[np.newaxis, :]
    # Masked ufunc calls leave 0 wherever the operand is 0, avoiding 0/0 and log(0).
    div = np.divide(fq, sm_tm, where=sm_tm != 0, out=np.zeros_like(fq))
    ent = np.log(div, where=div != 0, out=np.zeros_like(div))
    agg = np.multiply(fq, ent, out=np.zeros_like(fq), where=fq != 0)
    joint_ent = -np.multiply(fq, np.log(fq, where=fq != 0, out=np.zeros_like(fq)),
                             out=np.zeros_like(fq), where=fq != 0).sum()
    if joint_ent == 0:
        # Degenerate case: nothing distinguishes the two variables, so treat the distance as 0.
        return 0.0
    return (joint_ent - agg.sum()) / joint_ent

In [7]:
# Number of histogram bins: cube root of the number of rows of adata.X, truncated to an int.
num_bins = int((adata.X.shape[0]) ** (1 / 3.0))
# Number of columns of adata.X.
num_genes = adata.X.shape[1]
# Keyword arguments matching the `bins` and `m` parameters of calc_norm_mi.
metric_params = {"bins": num_bins, "m": num_genes}

In [8]:
# Attempt the same 10-nearest-neighbor search with calc_norm_mi as a custom metric,
# passing bins / m through metric_kwds.
# NOTE(review): this cell fails with the TypingError shown in its output: umap compiles the
# distance callback in numba nopython mode and cannot type a plain Python function.
# The traceback also runs through sparse_nndescent, where the callback is invoked as
# (ind1, data1, ind2, data2, *dist_args), which does not match calc_norm_mi(arr1, arr2, bins, m).
start = time.time()
indices, dists, forest = sc.neighbors.compute_neighbors_umap(adata.X, n_neighbors=10, metric=calc_norm_mi,
                                                             metric_kwds=metric_params)
end = time.time()
runtime = end - start
msg = "The runtime for compute_neighbors_umap took {} seconds to complete".format(runtime)
logging.info(msg)

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
Failed in nopython mode pipeline (step: nopython frontend)
Untyped global name '_distance_func': cannot determine Numba type of <class 'function'>

File "../../../../.pyenv/versions/3.7.4/lib/python3.7/site-packages/umap/umap_.py", line 334:
                    def _partial_dist_func(ind1, data1, ind2, data2):
                        return _distance_func(ind1, data1, ind2, data2, *dist_args)
                        ^

[1] During: resolving callee type: type(CPUDispatcher(<function nearest_neighbors.<locals>._partial_dist_func at 0x1347970e0>))
[2] During: typing of call at /Users/lding/.pyenv/versions/3.7.4/lib/python3.7/site-packages/umap/sparse_nndescent.py (235)

[3] During: resolving callee type: type(CPUDispatcher(<function nearest_neighbors.<locals>._partial_dist_func at 0x1347970e0>))
[4] During: typing of call at /Users/lding/.pyenv/versions/3.7.4/lib/python3.7/site-packages/umap/sparse_nndescent.py (54)

[5] During: resolving callee type: type(CPUDispatcher(<function nearest_neighbors.<locals>._partial_dist_func at 0x1347970e0>))
[6] During: typing of call at /Users/lding/.pyenv/versions/3.7.4/lib/python3.7/site-packages/umap/sparse_nndescent.py (54)

[7] During: resolving callee type: type(CPUDispatcher(<function nearest_neighbors.<locals>._partial_dist_func at 0x1347970e0>))
[8] During: typing of call at /Users/lding/.pyenv/versions/3.7.4/lib/python3.7/site-packages/umap/sparse_nndescent.py (103)

[9] During: resolving callee type: type(CPUDispatcher(<function nearest_neighbors.<locals>._partial_dist_func at 0x1347970e0>))
[10] During: typing of call at /Users/lding/.pyenv/versions/3.7.4/lib/python3.7/site-packages/umap/sparse_nndescent.py (120)

[11] During: resolving callee type: type(CPUDispatcher(<function nearest_neighbors.<locals>._partial_dist_func at 0x1347970e0>))
[12] During: typing of call at /Users/lding/.pyenv/versions/3.7.4/lib/python3.7/site-packages/umap/sparse_nndescent.py (103)

[13] During: resolving callee type: type(CPUDispatcher(<function nearest_neighbors.<locals>._partial_dist_func at 0x1347970e0>))
[14] During: typing of call at /Users/lding/.pyenv/versions/3.7.4/lib/python3.7/site-packages/umap/sparse_nndescent.py (120)

[15] During: resolving callee type: type(CPUDispatcher(<function nearest_neighbors.<locals>._partial_dist_func at 0x1347970e0>))
[16] During: typing of call at /Users/lding/.pyenv/versions/3.7.4/lib/python3.7/site-packages/umap/sparse_nndescent.py (172)

[17] During: resolving callee type: type(CPUDispatcher(<function nearest_neighbors.<locals>._partial_dist_func at 0x1347970e0>))
[18] During: typing of call at /Users/lding/.pyenv/versions/3.7.4/lib/python3.7/site-packages/umap/sparse_nndescent.py (191)

[19] During: resolving callee type: type(CPUDispatcher(<function nearest_neighbors.<locals>._partial_dist_func at 0x1347970e0>))
[20] During: typing of call at /Users/lding/.pyenv/versions/3.7.4/lib/python3.7/site-packages/umap/sparse_nndescent.py (172)

[21] During: resolving callee type: type(CPUDispatcher(<function nearest_neighbors.<locals>._partial_dist_func at 0x1347970e0>))
[22] During: typing of call at /Users/lding/.pyenv/versions/3.7.4/lib/python3.7/site-packages/umap/sparse_nndescent.py (191)

[23] During: resolving callee type: type(CPUDispatcher(<function nearest_neighbors.<locals>._partial_dist_func at 0x1347970e0>))
[24] During: typing of call at /Users/lding/.pyenv/versions/3.7.4/lib/python3.7/site-packages/umap/sparse_nndescent.py (235)


File "../../../../.pyenv/versions/3.7.4/lib/python3.7/site-packages/umap/sparse_nndescent.py", line 235:
def sparse_nn_descent(
    <source elided>

            d = sparse_dist(from_inds, from_data, to_inds, to_data)
            ^


In [4]:
# Load the preprocessed PBMC 33k AnnData file and convert it to a dataframe
# (rebinds the notebook-global `adata` and `frame`).
# NOTE(review): hard-coded absolute, user-specific path; this cell only works on the author's machine.
adata = anndata.read_h5ad('/Users/lding/Documents/MICA/Datasets/filtered_gene_bc_matrices/hg19/pbmc33k_preprocessed.h5ad')
frame = adata.to_df()
# Last expression of the cell: displays the dataframe's (rows, columns).
frame.shape

(30716, 9886)

In [5]:
# Dump the dataframe to CSV.
# NOTE(review): hard-coded absolute, user-specific path; this cell only works on the author's machine.
# NOTE(review): the file name says "pmbc33k" while the input is "pbmc33k" - possibly a typo, confirm.
frame.to_csv('/Users/lding/Documents/MICA/kgraph/pmbc33k.csv')

In [3]:
# Scratch check: integers 1 through 11 (the stop value 12 is exclusive).
np.arange(1, 12)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])