In [1]:
import sys
sys.path.append('/home/miaochangjiu/miniconda3/envs/snakemake/lib/python3.12/site-packages/')
import pickle, os, gzip, json, sys, itertools
from pathlib import Path
from importlib import reload
from dataclasses import dataclass, field
import collections
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pysam
import scipy as sp
import seaborn
import sharedmem

# Figure defaults applied to every plot in this notebook: white background
# and 300 dpi.
plt.rcParams.update({"figure.facecolor": "white", "figure.dpi": 300})

# Register both relative locations of the project's "scripts" folder on the
# import path (presumably so the notebook can be started from different
# working directories — confirm).
sys.path.extend(["scripts", "../../scripts"])

In [20]:
from data_io import is_fwd_id, get_fwd_id, get_sibling_id
from dim_reduction import SpectralEmbedding, scBiMapEmbedding
from nearest_neighbors import (
    ExactNearestNeighbors,
    NNDescent,
    WeightedLowHash,
    PAFNearestNeighbors,
    LowHash,
    HNSW,
    ProductQuantization,
    _NearestNeighbors,
    IVFProductQuantization,
)
from graph import OverlapGraph, GenomicInterval, get_overlap_statistics, remove_false_edges
from truth import get_overlaps
from evaluate import NearestNeighborsConfig, mp_compute_nearest_neighbors
from plots import plot_read_graph, mp_plot_read_graphs, get_graphviz_layout, get_umap_layout

In [8]:
# --- Run parameters ---------------------------------------------------------
MAX_SAMPLE_SIZE = 10**9  # cap on the number of metadata rows that are kept
COVERAGE_DEPTH = 20      # basis for the neighbour / bucket size limits
threads = 8

# --- Input files ------------------------------------------------------------
# Feature matrix, per-read metadata and per-read feature lists of one dataset.
npz_path = "/home/miaochangjiu/myrun/data/feature_matrix/CHM13/HLA/pbsim_ONT_93_30k/kmer_k16/feature_matrix.npz"
tsv_path = "/home/miaochangjiu/myrun/data/feature_matrix/CHM13/HLA/pbsim_ONT_93_30k/kmer_k16/metadata.tsv.gz"
json_path = "/home/miaochangjiu/myrun/data/feature_matrix/CHM13/HLA/pbsim_ONT_93_30k/kmer_k16/read_features.json.gz"
# Alignment file for the same dataset.
paf_path = "/home/miaochangjiu/myrun/data/minimap2/CHM13/HLA/pbsim_ONT_93_30k/alignment.paf.gz"

# --- Output location --------------------------------------------------------
output_folder = "/home/miaochangjiu/myrun/data/evaluation/CHM13/HLA/pbsim_ONT_93_30k/kmer_k16/"

# Load the metadata table, keep at most MAX_SAMPLE_SIZE rows and renumber them
# from zero so the index can be used to select rows of the other inputs.
meta_df = pd.read_table(tsv_path).iloc[:MAX_SAMPLE_SIZE, :].reset_index()
selected_rows = meta_df.index

# Read name -> row position in `meta_df`.
read_indices = dict(zip(meta_df['read_name'], selected_rows))

# Keep only the feature-matrix rows that belong to the retained reads.
feature_matrix = sp.sparse.load_npz(npz_path)[selected_rows, :]

# Per-read feature lists, restricted to the retained reads.
# NOTE(review): indexing with integer row positions assumes the JSON document
# is list-like (or keyed so that integers resolve) — confirm against the writer.
with gzip.open(json_path, "rt") as handle:
    read_features = json.load(handle)
read_features = {row: read_features[row] for row in selected_rows}

# Uniform weight of 1 for every feature column.
feature_weights = dict.fromkeys(range(feature_matrix.shape[1]), 1)

# Keyword arguments shared by the nearest-neighbour configurations.
kw = {"data": feature_matrix}
max_bucket_size = 1.5 * COVERAGE_DEPTH

In [10]:
# Keyword arguments passed to every NearestNeighborsConfig built below.
kw = {"data": feature_matrix}
# Size limits derived from the configured coverage depth.
max_n_neighbors = COVERAGE_DEPTH
max_bucket_size = 1.5 * COVERAGE_DEPTH

# Building blocks of the abbreviated configuration names. A name has the form
# '<method>_<metric>_<dimension reduction>_<preprocessing>'.
methods = [
    'Exact_Euclidean',
    'Exact_Cosine',
    'NNDescent_Euclidean',
    'PQ_Euclidean',
    'PQ_Cosine',
    'HNSW_Euclidean',
    'HNSW_Cosine',
    'LowHash_Jaccard',
    'WeightedLowHash_Jaccard',
]
dim_redu_method = ['Spectural_100d', 'Spectural_500d', 'scBiMap_100d', 'scBiMap_500d', 'None']
pre_process = ['TF', 'IDF', 'TF-IDF', 'None']

# Every (method, dimension reduction, preprocessing) combination, in the same
# order as three nested loops with `methods` outermost.
find_neighbor_method = [
    '_'.join(parts)
    for parts in itertools.product(methods, dim_redu_method, pre_process)
]

# Abbreviated name components -> full names. Components that are absent here
# (e.g. 'HNSW', 'LowHash', metric names) are already in their full form.
simple_to_complete = {
    'Exact': 'ExactNearestNeighbors',
    'PQ': 'ProductQuantization',
    'scBiMap': 'scBiMapEmbedding',
    'Spectural': 'SpectralEmbedding',
    '100d': '100',
    '500d': '500',
}

# Full names -> the objects they stand for: nearest-neighbour classes,
# embedding classes, and the integer number of dimensions.
to_fx = {
    'ExactNearestNeighbors': ExactNearestNeighbors,
    'ProductQuantization': ProductQuantization,
    'NNDescent': NNDescent,
    'WeightedLowHash': WeightedLowHash,
    'LowHash': LowHash,
    'HNSW': HNSW,
    'scBiMapEmbedding': scBiMapEmbedding,
    'SpectralEmbedding': SpectralEmbedding,
    '100': 100,
    '500': 500,
}
# Build a NearestNeighborsConfig for every supported generated method name,
# keyed by the abbreviated name (e.g. 'LowHash_Jaccard_None_TF').
#
# After abbreviation expansion a name splits into either
#   [method, metric, 'None', preprocessing]                    (no dimension reduction)
#   [method, metric, embedding, n_dimensions, preprocessing]   (with dimension reduction)
#
# Hash-based methods are only configured without dimension reduction; names
# that match neither branch below are skipped.
# NOTE(review): the preprocessing label is forwarded to `tfidf` as a string,
# including the literal 'None' (which is truthy) — confirm that
# NearestNeighborsConfig interprets these labels as intended.
config_dict = {}

# Methods configured through a distance metric. The spelling must match the
# names produced above: the previous 'NNdescent' never matched 'NNDescent', so
# every NNDescent configuration was silently dropped.
metric_based_methods = ('HNSW', 'ProductQuantization', 'NNDescent', 'ExactNearestNeighbors')

for method_index, simple_name in enumerate(find_neighbor_method):
    # Expand abbreviations component by component; unknown components are kept.
    complete_name = [simple_to_complete.get(part, part) for part in simple_name.split('_')]
    des = ','.join(complete_name)
    has_dim_reduction = complete_name[2] != 'None'

    if 'Hash' in complete_name[0] and not has_dim_reduction:
        # The prints in this branch produce the cell's recorded output.
        print(complete_name)
        config = NearestNeighborsConfig(
            nearest_neighbors_method=to_fx[complete_name[0]],
            description=des,
            tfidf=complete_name[3],
            nearest_neighbors_kw=dict(
                lowhash_fraction=0.01,
                max_bucket_size=max_bucket_size,
                repeats=100,
                seed=458,
            ),
            **kw)
        print(config)
        config_dict[simple_name] = config
        print(method_index)
        print(simple_name)

    elif complete_name[0] in metric_based_methods:
        metric_kw = dict(metric=complete_name[1].lower())
        if not has_dim_reduction:
            config = NearestNeighborsConfig(
                nearest_neighbors_method=to_fx[complete_name[0]],
                description=des,
                tfidf=complete_name[3],
                nearest_neighbors_kw=metric_kw,
                **kw)
        else:
            config = NearestNeighborsConfig(
                nearest_neighbors_method=to_fx[complete_name[0]],
                description=des,
                tfidf=complete_name[4],
                dimension_reduction_method=to_fx[complete_name[2]],
                # Resolve '100'/'500' through to_fx so an int is passed; the
                # raw name component is a string.
                dimension_reduction_kw=dict(n_dimensions=to_fx[complete_name[3]]),
                nearest_neighbors_kw=metric_kw,
                **kw)
        config_dict[simple_name] = config


['LowHash', 'Jaccard', 'None', 'TF']
NearestNeighborsConfig(description='LowHash,Jaccard,None,TF', binarize=False, tfidf='TF', dimension_reduction_method=None, nearest_neighbors_method=<class 'nearest_neighbors.LowHash'>)
156
LowHash_Jaccard_None_TF
['LowHash', 'Jaccard', 'None', 'IDF']
NearestNeighborsConfig(description='LowHash,Jaccard,None,IDF', binarize=False, tfidf='IDF', dimension_reduction_method=None, nearest_neighbors_method=<class 'nearest_neighbors.LowHash'>)
157
LowHash_Jaccard_None_IDF
['LowHash', 'Jaccard', 'None', 'TF-IDF']
NearestNeighborsConfig(description='LowHash,Jaccard,None,TF-IDF', binarize=False, tfidf='TF-IDF', dimension_reduction_method=None, nearest_neighbors_method=<class 'nearest_neighbors.LowHash'>)
158
LowHash_Jaccard_None_TF-IDF
['LowHash', 'Jaccard', 'None', 'None']
NearestNeighborsConfig(description='LowHash,Jaccard,None,None', binarize=False, tfidf='None', dimension_reduction_method=None, nearest_neighbors_method=<class 'nearest_neighbors.LowHash'>)
15

In [21]:
# Full names -> the objects they stand for: nearest-neighbour classes,
# embedding classes, and the integer number of dimensions.
to_fx = {
    'ExactNearestNeighbors': ExactNearestNeighbors,
    'ProductQuantization': ProductQuantization,
    'NNDescent': NNDescent,
    'WeightedLowHash': WeightedLowHash,
    'LowHash': LowHash,
    'HNSW': HNSW,
    'scBiMapEmbedding': scBiMapEmbedding,
    'SpectralEmbedding': SpectralEmbedding,
    '100': 100,
    '500': 500,
}

# NOTE(review): these three names are not read anywhere in this cell — confirm
# whether a later cell still needs them.
method, tf, distance = 'LowHash', 'tf', ''

# Single hand-written configuration: product quantization with cosine metric
# on a 100-dimensional scBiMap embedding of the TF-IDF transformed features.
pq_scbimap_config = NearestNeighborsConfig(
    nearest_neighbors_method=ProductQuantization,
    description="PQ (TF-IDF,scBiMap 100 dim.)",
    tfidf=True,
    dimension_reduction_method=scBiMapEmbedding,
    dimension_reduction_kw={"n_dimensions": 100},
    nearest_neighbors_kw={"nbits": 6, "metric": "cosine"},
    **kw
)
configs = [pq_scbimap_config]
print(configs)

[NearestNeighborsConfig(description='PQ (TF-IDF,scBiMap 100 dim.)', binarize=False, tfidf=True, dimension_reduction_method=<class 'dim_reduction.scBiMapEmbedding'>, nearest_neighbors_method=<class 'nearest_neighbors.ProductQuantization'>)]
