In [1]:
import numpy as np
import os
import sys
import collections
import matplotlib.pyplot as plt
import gzip
import loompy
import scipy.sparse as sparse
import urllib.request
import pybedtools
import warnings
from sklearn.neighbors import NearestNeighbors
from matplotlib.collections import LineCollection

import cytograph as cg
from cytograph.decomposition import HPF
from scipy.stats import poisson
from cytograph.manifold import BalancedKNN
from cytograph.metrics import jensen_shannon_distance
# from cytograph.embedding import tsne
from cytograph.clustering import PolishedLouvain, PolishedSurprise
from cytograph.plotting import manifold

sys.path.append('/Users/camima/Documents/py_proj/chromograph')
from chromograph.plotting.QC_plot import QC_plot
from chromograph.pipeline.TF_IDF import TF_IDF

from umap import UMAP
import sklearn.metrics
from scipy.spatial import distance
import community
import networkx as nx
from scipy import sparse
from typing import *

from sklearn.decomposition import IncrementalPCA

import logging

# Configure the root logger once for the whole notebook: INFO and above,
# with a compact "HH:MM:SS LEVEL message" layout.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format='%(asctime)s %(levelname)-8s %(message)s',
)

# Handle on the root logger (the cells below call `logging.info` directly).
logger = logging.getLogger()

In [None]:
# Simulate doublets by summing the bin counts of two randomly chosen cells, then
# build `data`: the real cells followed by the simulated doublets, restricted to
# the most widely detected bins.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
f = '/Users/camima/data/scATAC/10X242_1_5kb.loom'

# Seeded generator so the simulated doublets are identical after a kernel restart.
rng = np.random.default_rng(0)

with loompy.connect(f, 'r') as ds:
    n_cells = ds.shape[1]

    ## Create n doublets
    n_doublets = 1000
    doublets = np.zeros((ds.shape[0], n_doublets))
    db_tot = np.zeros(n_doublets)

    # Look the column attribute up once instead of on every iteration.
    passed_filters = ds.ca['passed_filters']

    logging.info('Creating data')
    ## Create doublets
    for i in range(n_doublets):
        # Draw two *distinct* cells: sampling them independently could pick the
        # same cell twice, which is a doubled singlet rather than a doublet.
        # `.tolist()` yields plain Python ints for indexing.
        a, b = rng.choice(n_cells, size=2, replace=False).tolist()
        doublets[:, i] = ds[:, a] + ds[:, b]
        db_tot[i] = passed_filters[a] + passed_filters[b]
        if i % 100 == 0:
            logging.info(f'{i} completed')

    ## Keep only the top quartile of bins: those that are nonzero in more cells
    ## than the 75th percentile of the per-bin nonzero counts.
    logging.info('Calculating row wise nonzero rate')
    NCells = ds.map([np.count_nonzero], axis=0)[0]
    q = np.quantile(NCells, .75)
    logging.info(f'Using only bins present in more than {q} cells')
    valid = NCells > q

    # Columns of `data`: all real cells first, then the simulated doublets.
    data = np.concatenate((ds[valid, :], doublets[valid, :]), axis=1)
    logging.info(f'Test data has {data.shape} shape')

14:25:35 INFO     Calculating row wise nonzero rate
14:27:14 INFO     Using only bins present in more than 251.0 cells


In [None]:
## Calculate TF_IDF and fit PCA

# Normalise the combined (real cells + simulated doublets) matrix with TF-IDF.
logging.info('Performing TF-IDF')
tf_idf = TF_IDF()
tf_idf.fit(data)
X = tf_idf.transform(data)

# Two statements that used to follow here were removed because they cannot run:
#   * `ds.layers['TF_IDF'] = ...` wrote to `ds`, which was opened read-only ('r')
#     in a `with` block that has already exited; `X` is also derived from `data`
#     (filtered bins, real cells + doublets), which presumably does not match the
#     shape of the matrix stored in the file.
#   * `self.blayer = 'TF_IDF'` referenced `self`, which is undefined at notebook
#     top level and raised NameError.

# Fit the PCA announced in the header; the nearest-neighbour cell reads `pca`
# with one row per cell (real cells first, then doublets), hence the transpose.
# NOTE(review): assumes `X` keeps the (bins, cells) layout of `data` - confirm
# against TF_IDF.transform.
# NOTE(review): the number of components is not given anywhere in the visible
# notebook; 50 is a placeholder to be tuned.
n_components = min(50, min(X.shape))
logging.info(f'Fitting PCA with {n_components} components')
pca = IncrementalPCA(n_components=n_components).fit_transform(X.T)

In [None]:
# Nearest-neighbour searches in PCA space: one over all rows of `pca`, and one
# fitted on the real cells only and then queried with the doublets and with the
# real cells themselves.
# NOTE(review): `ds` comes from a `with` block in an earlier cell, so the
# connection is closed here; this relies on `ds.shape` still being readable.
# NOTE(review): assumes the rows of `pca` follow the column order of `data`
# (real cells first, then simulated doublets) - confirm.
k = None
use_pca = True

if k is None:
    # 1% of the number of real cells, capped at 100. Floored at 1 because
    # NearestNeighbors rejects n_neighbors=0, which small datasets would produce.
    k = max(1, int(np.min([100, ds.shape[1] * 0.01])))

logging.info(f"Initialize NN structure with k = {k}")
if use_pca:
    # Neighbours among all rows of `pca`.
    knn_result = NearestNeighbors(n_neighbors=k, metric='euclidean', n_jobs=4)
    knn_result.fit(pca)
    knn_dist, knn_idx = knn_result.kneighbors(X=pca, return_distance=True)

    # Index built from the first `num` rows only (the real cells).
    num = ds.shape[1]
    knn_result1 = NearestNeighbors(n_neighbors=k, metric='euclidean', n_jobs=4)
    knn_result1.fit(pca[0:num, :])
    # Query with the remaining rows. They start at row `num`; the previous
    # `num + 1` silently dropped the first of them.
    knn_dist1, knn_idx1 = knn_result1.kneighbors(X=pca[num:, :], n_neighbors=10)
    # Query the same index with the rows it was fitted on.
    knn_dist_rc, knn_idx_rc = knn_result1.kneighbors(X=pca[0:num, :], return_distance=True)