In [2]:
import pandas as pd
import psutil
import os
import numpy as np
import sys
import faiss
import sys
import time 
import torch
import tracemalloc
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, matthews_corrcoef, homogeneity_completeness_v_measure
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
embedding_db = np.load("db_embedding.npy")
embedding_t = np.load("test_embedding.npy")
X_train = pd.read_csv("metadata_db.csv")
X_test = pd.read_csv("metadata_test.csv")

In [4]:
#embedding_db.shape
num_features = embedding_db.shape[1]
print(embedding_db.shape[0])
#crit = faiss.OneRecallAtRCriterion(embedding_t.shape[0], 1)

165615


Parameter Sweep

In [7]:
import time
import faiss
import psutil
import os
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, homogeneity_completeness_v_measure
import pandas as pd

use_gpu = False

if use_gpu:
    # if this fails, it means that the GPU version was not comp
    res = faiss.StandardGpuResources()
    dev_no = 0

# Test Different Indexing Methods: IndexFlatL2, IndexFlatL2 w/ PCAmatrix, IndexIVFFlat, etc.
index_key = ['Flat', 'PCA64,Flat', 'PCAR64,Flat', 'PCAW64,Flat', 'PCAWR64,Flat', 'IVF4096,Flat', 'PCA64,IVF4096,Flat', 'PCAR64,IVF4096,Flat', 'PCAW64,IVF4096,Flat', 'PCAWR64,IVF4096,Flat', 'IVF4096,PQ16', 'OPQ16,IVF4096,PQ16', 'OPQ32,IVF4096,PQ32', 'OPQ32,IVF4096,PQ16x4fsr']

# Initialize an empty DataFrame or load an existing one if available
results_file_path = 'results.csv'
if os.path.exists(results_file_path):
    result_df = pd.read_csv(results_file_path)
else:
    result_df = pd.DataFrame()

for key in index_key:
    #Sweep through indexing methods seen in index_key
    r = range(100, 1000, 100)
    start = time.time()
    tracemalloc.start()
    index = faiss.index_factory(num_features, key)
    nlist = 4096
    if use_gpu:
        # transfer to GPU (may be partial)
        index = faiss.index_cpu_to_gpu(res, dev_no, index)

    start_indexing = time.time()
    # Index Database Embedding
    index.train(embedding_db)
    index.add(embedding_db)
    elapsed_indexing = time.time() - start_indexing
    _, peak_i = tracemalloc.get_traced_memory()
    print(f'Time Indexing: {elapsed_indexing}')
    print('Memory Indexing % used:', peak_i)
    print('CPU Indexing % used:', psutil.cpu_percent())
    tracemalloc.stop()
    for i in r:
        #Test different values of nprobe to see their effects on the search process for each index.
        index.nprobe = i
        start_searching = time.time()
        # Search using Test Embedding
        tracemalloc.start()
        D_p, I_p = index.search(embedding_t, 1)
        elapsed_searching = time.time() - start_searching
        _, peak_s = tracemalloc.get_traced_memory()
        print(f'Time Searching: {elapsed_searching}')
        print('Memory Searching % used:', peak_s)
        print('CPU Searching % used:', psutil.cpu_percent())
        tracemalloc.stop()
        # Assuming each test embedding corresponds to a single label in the training data
        predicted_label = X_train.iloc[I_p.reshape(-1)]["gene"]

        # Calculate accuracy
        accuracy = accuracy_score(X_test["gene"], predicted_label)
        print(f'Accuracy: {accuracy}')
    
    
        # Calculate confusion matrix
        conf_matrix = confusion_matrix(X_test["gene"], predicted_label)

        # Calculate MCC, sensitivity, and specificity
        mcc = matthews_corrcoef(X_test["gene"], predicted_label)
        sensitivity = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
        specificity = conf_matrix[0, 0] / (conf_matrix[0, 0] + conf_matrix[0, 1])

        print(f'MCC: {mcc}')
        print(f'Sensitivity: {sensitivity}')
        print(f'Specificity: {specificity}')

        start_training_kmeans = time.time()
        tracemalloc.start()
        # Use Faiss K-Means algorithm to cluster database embeddings
        ncentroids = 1024
        niter = 60
        verbose = True
        kmeans = faiss.Kmeans(num_features, ncentroids, niter=niter, verbose=verbose)
        kmeans.train(embedding_db)
        elapsed_training_kmeans = time.time() - start_training_kmeans
        _, peak_t = tracemalloc.get_traced_memory()
        print(f'Time training Kmeans: {elapsed_training_kmeans}')
        print('Memory training Kmeans % used:', peak_t)
        print('CPU training Kmeans % used:', psutil.cpu_percent())
        tracemalloc.stop()
    
        start_searching_kmeans = time.time()
        tracemalloc.start()
        D, I = kmeans.index.search(embedding_t, 1)
        elapsed_searching_kmeans = time.time() - start_searching_kmeans
        _, peak_ks = tracemalloc.get_traced_memory()
        print(f'Time searching Kmeans: {elapsed_searching_kmeans}')
        print('Memory searching Kmeans % used:', peak_ks)
        print('CPU searching Kmeans % used:', psutil.cpu_percent())
        tracemalloc.stop()

        predicted_label_cluster_phylum = X_train.iloc[I.reshape(-1)]["phylum"]
        h_phylum, c_phylum, v_phylum = homogeneity_completeness_v_measure(X_test["phylum"], predicted_label_cluster_phylum)  # Verify

        print(f'Homogeneity Phylum: {h_phylum}')
        print(f'Completeness Phylum: {c_phylum}')
        print(f'V-Measure Phylum: {v_phylum}')
        print()

        predicted_label_cluster_genus = X_train.iloc[I.reshape(-1)]["genus"]
        h_genus, c_genus, v_genus = homogeneity_completeness_v_measure(X_test["genus"], predicted_label_cluster_genus)  # Verify

        print(f'Homogeneity Genus: {h_genus}')
        print(f'Completeness Genus: {c_genus}')
        print(f'V-Measure Genus: {v_genus}')
        print()

        print(index.ntotal)

        # Save results to DataFrame
        result_dict = {
            'Index': key,
            'Time Indexing': elapsed_indexing,
            'Time Searching': elapsed_searching,
            'Accuracy Gene': accuracy,
            'MCC Gene': mcc,
            'Sensitivity Gene': sensitivity,
            'Specificity Gene': specificity,
            'Time Training Kmeans': elapsed_training_kmeans,
            'Time Searching Kmeans': elapsed_searching_kmeans,
            'Homogeneity Phylum': h_phylum,
            'Completeness Phylum': c_phylum,
            'V-Measure Phylum': v_phylum,
            'Homogeneity Genus': h_genus,
            'Completeness Genus': c_genus,
            'V-Measure Genus': v_genus
        }

    # Append results to the DataFrame
        result_df = result_df.append(result_dict, ignore_index=True)

# Save the updated DataFrame to a CSV file
result_df.to_csv(results_file_path, index=False)


Time Indexing: 0.8508861064910889
Memory Indexing % used: 1358271585
CPU Indexing % used: 6.3
Time Searching: 34.723170042037964
Memory Searching % used: 58713096
CPU Searching % used: 99.3
Accuracy: 0.3534652076318743
MCC: 0.34044968473423176
Sensitivity: 0.9285714285714286
Specificity: 0.937007874015748
Time training Kmeans: 163.42626309394836
Memory training Kmeans % used: 682721904
CPU training Kmeans % used: 98.8
Time searching Kmeans: 0.2752339839935303
Memory searching Kmeans % used: 58713117
CPU searching Kmeans % used: 81.8
Homogeneity Phylum: 0.027160462493759557
Completeness Phylum: 0.0416747453773972
V-Measure Phylum: 0.03288739567340118

Homogeneity Genus: 0.6019077534681689
Completeness Genus: 0.5864351184743993
V-Measure Genus: 0.5940707064430887

165615
Time Searching: 35.349645376205444
Memory Searching % used: 58711963
CPU Searching % used: 99.1
Accuracy: 0.3534652076318743
MCC: 0.34044968473423176
Sensitivity: 0.9285714285714286
Specificity: 0.937007874015748
Time tr

In [8]:
import os
import pandas as pd

# Set the path to the folder containing CSV files
folder_path = './csv'

# Get a list of all CSV files in the folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

# Initialize an empty DataFrame to store concatenated data
concatenated_df = pd.DataFrame()

# Loop through each CSV file and concatenate it to the DataFrame
for csv_file in csv_files:
    file_path = os.path.join(folder_path, csv_file)
    df = pd.read_csv(file_path)
    concatenated_df = pd.concat([concatenated_df, df], ignore_index=True)

# Specify the path for the output concatenated CSV file
output_csv_path = 'concatenated_parameter_faiss.csv'

# Save the concatenated DataFrame to a CSV file
concatenated_df.to_csv(output_csv_path, index=False)

print(f"Concatenation complete. Results saved to: {output_csv_path}")

Concatenation complete. Results saved to: concatenated_parameter_faiss.csv


Generate Distance Matrix for UMAP

In [9]:
import time
import faiss
import psutil
import os
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, homogeneity_completeness_v_measure
import pandas as pd

#Generate the index using FlatL2. Returns Euclidean Distances. May want to run parameters again to verify
nlist = 4096
index = faiss.index_factory(num_features, 'PCAW64,Flat')
index.train(embedding_db)
index.add(embedding_db)

In [22]:
#Search Index to generate index and distance vectors. Check if doing this correctly (verify dimensions and if we're missing a step)
D_p, I_p = index.search(embedding_t, len(embedding_db))

MemoryError: Unable to allocate 17.6 GiB for an array with shape (14256, 165615) and data type int64

In [9]:
print(f'Distance Dimensions: {D_p.shape}')
print(f'Index Dimensions: {I_p.shape}')

Distance Dimensions: (14256, 165615)
Index Dimensions: (14256, 165615)


In [10]:
np.save('faiss_embeddings_distance_matrix.npy', D_p)
np.save('faiss_embeddings_index_matrix.npy', I_p)

In [8]:
ncentroids = 1024
niter = 60
verbose = True
kmeans = faiss.Kmeans(num_features, ncentroids, niter=niter, verbose=verbose)
kmeans.train(embedding_db)
D, I = kmeans.index.search(embedding_t, 1)
predicted_label_cluster_phylum = X_train.iloc[I.reshape(-1)]["phylum"]  # Verify
h_phylum, c_phylum, v_phylum = homogeneity_completeness_v_measure(X_test["phylum"], predicted_label_cluster_phylum)
predicted_label_cluster_genus = X_train.iloc[I.reshape(-1)]["genus"]  # Verify
h_genus, c_genus, v_genus = homogeneity_completeness_v_measure(X_test["genus"], predicted_label_cluster_genus)


print(f'Homogeneity Phylum: {h_phylum}')
print(f'Completeness Phylum: {c_phylum}')
print(f'V-Measure Phylum: {v_phylum}')
print()
print(f'Homogeneity Genus: {h_genus}')
print(f'Completeness Genus: {c_genus}')
print(f'V-Measure Genus: {v_genus}')

Homogeneity Phylum: 0.027160462493759557
Completeness Phylum: 0.0416747453773972
V-Measure Phylum: 0.03288739567340118

Homogeneity Genus: 0.6019077534681689
Completeness Genus: 0.5864351184743993
V-Measure Genus: 0.5940707064430887
