In [1]:
import numpy as np
import faiss
import pandas as pd

In [3]:
# Fetch the train/test embeddings and labels from storage
def fetch_train_test_split(data_dir='/scratch/gpfs/jr8867/embeddings/scop/train-test-split'):
    """Load the train/test embeddings and labels saved as ``.npy`` files.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory containing ``train_embeddings.npy``, ``test_embeddings.npy``,
        ``train_labels.npy`` and ``test_labels.npy``. Defaults to the location
        this notebook was originally written against, so calling the function
        with no arguments behaves exactly as before.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_embeddings, test_embeddings, train_labels, test_labels)``,
        in that order.

    Raises
    ------
    FileNotFoundError
        Propagated from ``np.load`` if any of the four files is missing.
    """
    # The order of this tuple defines the order of the returned arrays.
    array_names = (
        'train_embeddings',
        'test_embeddings',
        'train_labels',
        'test_labels',
    )
    train_embeddings, test_embeddings, train_labels, test_labels = (
        np.load(f'{data_dir}/{name}.npy') for name in array_names
    )

    return train_embeddings, test_embeddings, train_labels, test_labels

# Load both splits once, then print each array's shape (one per line) as a
# quick sanity check that embeddings and labels line up.
(
    train_embeddings,
    test_embeddings,
    train_labels,
    test_labels,
) = fetch_train_test_split()
print(
    train_embeddings.shape,
    test_embeddings.shape,
    train_labels.shape,
    test_labels.shape,
    sep='\n',
)


(28105, 1280)
(7027, 1280)
(28105,)
(7027,)


In [4]:
def create_faiss_index(embeddings, labels, type='train',
                       output_dir='/scratch/gpfs/jr8867/embeddings/scop/train-test-split'):
    """Build a flat inner-product FAISS index and persist it with its label metadata.

    The index uses ``faiss.IndexFlatIP``. Inner product equals cosine
    similarity only for L2-normalised vectors; this function does NOT
    normalise ``embeddings``, so normalise them beforehand if cosine
    similarity is what you want.

    Parameters
    ----------
    embeddings : array-like
        2-D matrix with one embedding per row. It is converted to a
        C-contiguous float32 array before being added to the index.
    labels : sequence
        One label per embedding row, in the same order as ``embeddings``.
    type : str, optional
        Split name used in the output file names (e.g. ``'train'``, ``'test'``).
        The name shadows the ``type`` builtin but is kept because callers
        pass it as a keyword argument.
    output_dir : str or path-like, optional
        Existing directory the index and metadata files are written to.
        Defaults to the location originally hard-coded in this notebook.

    Returns
    -------
    (index, metadata)
        The populated FAISS index and a DataFrame with columns ``index_id``
        (row position in the index) and ``label``.

    Raises
    ------
    ValueError
        If ``embeddings`` is not 2-D or its row count differs from
        ``len(labels)``.
    """
    # FAISS works on C-contiguous float32 matrices; coerce up front so that
    # float64 / float16 / non-contiguous inputs are accepted. This is a no-op
    # (no copy) when the input already has the right layout and dtype.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f'embeddings must be a 2-D array, got shape {vectors.shape}'
        )
    # A mismatch here would silently misalign index ids and labels.
    if len(labels) != vectors.shape[0]:
        raise ValueError(
            f'got {vectors.shape[0]} embeddings but {len(labels)} labels'
        )

    # Create a flat (exact search) index with the inner-product metric
    d = vectors.shape[1]  # dimension of embeddings
    index = faiss.IndexFlatIP(d)

    # Add embeddings to the index; row i receives FAISS id i
    index.add(vectors)

    # Create metadata DataFrame mapping FAISS id -> label
    metadata = pd.DataFrame({
        'index_id': range(len(labels)),
        'label': labels
    })

    # Save index and metadata side by side, keyed by the split name
    faiss.write_index(index, f'{output_dir}/protein_embeddings_{type}.index')
    metadata.to_csv(f'{output_dir}/protein_embeddings_{type}_metadata.csv', index=False)

    return index, metadata

In [5]:
# Build and persist one index (plus its metadata table) per split.
train_index, train_metadata = create_faiss_index(
    embeddings=train_embeddings,
    labels=train_labels,
    type='train',
)
test_index, test_metadata = create_faiss_index(
    embeddings=test_embeddings,
    labels=test_labels,
    type='test',
)

In [8]:
# Test querying train index and check labels of returned neighbors
def test_faiss_query(query_embedding, train_index, train_metadata, k=5):
    # Reshape query embedding to match FAISS input requirements
    query_emb = query_embedding.reshape(1, -1)
    
    # Perform similarity search
    D, I = train_index.search(query_emb, k)
    
    # Retrieve labels for the returned indices
    returned_labels = train_metadata.loc[I[0], 'label'].values
    
    print("Distances:", D[0])
    print("Returned Indices:", I[0])
    print("Returned Labels:", returned_labels)
    
    return D[0], I[0], returned_labels

# Example: look up the nearest training neighbours of the first test embedding.
# The results are printed by the function itself, so the return values are discarded.
test_query_embedding = test_embeddings[0]
_, _, _ = test_faiss_query(
    query_embedding=test_query_embedding,
    train_index=train_index,
    train_metadata=train_metadata,
)


Distances: [0.9788125  0.97758263 0.97731286 0.97641706 0.97627425]
Returned Indices: [25994  1178 27292  1154 26674]
Returned Labels: [ 285 1023 1023   97  285]
