In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
import numpy as np
import faiss
from tqdm import tqdm
import os
assert hasattr(faiss, "StandardGpuResources"), "Still using CPU-only FAISS"

# === CONFIG ===
MODEL_NAME = "Alibaba-NLP/gte-Qwen2-7B-instruct"
BATCH_SIZE = 64
CHUNK_SIZE = 512
# EMBED_DIM = 4096  # Depends on model
EMBED_DIM = 3584  # Updated to match actual embedding dimension
DEVICE_COUNT = torch.cuda.device_count()

# === Load model/tokenizer on all devices ===
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME, torch_dtype=torch.float16).half().eval()

# === Read and chunk your documents ===
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

with open("extracted_texts.txt", "r", encoding="utf-8") as f:
    texts = f.read().split("=== Document ")[1:]
    texts = [doc.split("\n", 1)[1] for doc in texts]

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=100)
documents = [chunk for text in texts for chunk in splitter.split_text(text)]

# === Embed with batching on one GPU (parallelize later) ===
def embed_batch(batch_texts):
    inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=CHUNK_SIZE)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Ensure model is on same device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
        # Mean pooling over sequence (dimension 1)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        return embeddings.cpu().numpy()


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 7/7 [00:01<00:00,  4.55it/s]


In [2]:

# === Loop over batches and embed ===
all_embeddings = []
for i in tqdm(range(0, len(documents), BATCH_SIZE), desc="Embedding"):
    batch = documents[i:i + BATCH_SIZE]
    batch_texts = [doc for doc in batch]
    emb = embed_batch(batch_texts)
    all_embeddings.append(emb)

all_embeddings = np.vstack(all_embeddings).astype("float32")


Embedding: 100%|██████████| 1788/1788 [12:37<00:00,  2.36it/s]


In [3]:

# === Build FAISS GPU index ===
res = faiss.StandardGpuResources()
index = faiss.IndexFlatL2(EMBED_DIM)
gpu_index = faiss.index_cpu_to_gpu(res, 0, index)  # Use GPU 0 or loop over all

gpu_index.add(all_embeddings)

# === Save index and docs ===
faiss.write_index(faiss.index_gpu_to_cpu(gpu_index), "faiss_index.index")
np.save("documents.npy", np.array(documents, dtype=object))


In [4]:
all_embeddings

array([[-0.7583008 ,  0.4428711 , -1.4121094 , ...,  0.18249512,
        -3.0039062 ,  0.08831787],
       [-0.4086914 ,  0.35913086, -3.3945312 , ...,  3.5664062 ,
        -3.3984375 , -0.05462646],
       [-0.48291016,  0.7138672 , -3.2285156 , ..., -0.15551758,
        -2.1640625 ,  2.2597656 ],
       ...,
       [ 0.1583252 ,  0.30493164, -2.5136719 , ...,  2.546875  ,
        -2.1894531 ,  3.5664062 ],
       [ 1.6416016 ,  3.7597656 , -3.28125   , ...,  4.2148438 ,
        -1.7158203 ,  4.046875  ],
       [-3.9335938 , -1.7763672 , -0.6088867 , ...,  3.5683594 ,
        -2.3320312 ,  1.8359375 ]], dtype=float32)