In [None]:
!pip install PyPDF2
!pip install langchain
!pip install transformers
!pip install faiss-cpu
!pip install tqdm

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting langchain
  Downloading langchain-0.2.6-py3-none-any.whl (975 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m975.5/975.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.10 (from langchain)
  Downloading langchain_core-0.2.11-py3-none-any.whl (337 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.4/337.4 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl (25 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.83-py3-none-any.whl (127 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [None]:
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import requests
import numpy as np
import pickle
from tqdm import tqdm
import os

In [None]:
# Read pdfs from uploaded directory
def read_pdfs(directory):
    """Extract the text of every PDF file located directly in ``directory``.

    Args:
        directory: Path of the folder to scan. Sub-folders are not visited.

    Returns:
        dict: Maps each PDF file name (not the full path) to the concatenated
        text of all of its pages. Entries are inserted in sorted file-name
        order so the result is reproducible across runs and platforms.
    """
    texts = {}
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(directory)):
        # Match the extension case-insensitively so "REPORT.PDF" is picked up.
        if not filename.lower().endswith(".pdf"):
            continue
        reader = PdfReader(os.path.join(directory, filename))
        # Guard against a page yielding no text (None) and build the string
        # with a single join instead of repeated concatenation.
        texts[filename] = "".join(
            (page.extract_text() or "") for page in reader.pages
        )
    return texts

In [None]:
def split_text(text, chunk_size=1000, chunk_overlap=200):
    """Cut ``text`` into fixed-size character windows that overlap.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk. Must be positive.
        chunk_overlap: Number of characters shared by consecutive chunks.
            Must be smaller than ``chunk_size``.

    Returns:
        list[str]: Chunks in document order. An empty ``text`` gives ``[]``.
        The last chunk is the first window that reaches the end of ``text``;
        no further window is emitted after it.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``chunk_overlap``
            is not smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    step = chunk_size - chunk_overlap
    if step <= 0:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    chunks = []
    text_len = len(text)
    start = 0
    while start < text_len:
        end = min(start + chunk_size, text_len)
        chunks.append(text[start:end])
        if end == text_len:
            # Any later window would only repeat the tail of this chunk.
            break
        start += step
    return chunks

In [None]:
# Read every PDF in the source folder, then split each document's text into
# chunks keyed by the PDF's file name (split_text defaults are used).
pdf_directory = "Documents"
pdf_texts = read_pdfs(pdf_directory)
pdf_chunks = dict(
    (name, split_text(body)) for name, body in pdf_texts.items()
)

In [None]:
# Embedding checkpoint; tokenizer and encoder are loaded from the same name.
# Alternative checkpoints (currently disabled):
#   "Alibaba-NLP/gte-base-en-v1.5"
#   "sentence-transformers/all-MiniLM-L6-v2"
model_name = "Alibaba-NLP/gte-large-en-v1.5"

# trust_remote_code=True allows the checkpoint's bundled Python code to run.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# .eval() puts the module in inference mode and returns the module itself.
model = AutoModel.from_pretrained(model_name, trust_remote_code=True).eval()
model

NewModel(
  (embeddings): NewEmbeddings(
    (word_embeddings): Embedding(30528, 1024, padding_idx=0)
    (rotary_emb): NTKScalingRotaryEmbedding()
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): NewEncoder(
    (layer): ModuleList(
      (0-23): 24 x NewLayer(
        (attention): NewAttention(
          (qkv_proj): Linear(in_features=1024, out_features=3072, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (mlp): NewGatedMLP(
          (up_gate_proj): Linear(in_features=1024, out_features=8192, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=True)
          (act_fn): GELUActivation()
          (hidden_dropout): Dropout(p=0.1, inplace=False)
        )
        (attn_ln): LayerNorm((1024,), eps=1e-12, elementwise_af

In [None]:
# Function to embed text
def embed_text(text, tokenizer, model):
    """Embed ``text`` and return the hidden state of the first token.

    Args:
        text: Input passed straight to ``tokenizer`` (truncation and padding
            are enabled).
        tokenizer: Callable that returns a mapping of PyTorch tensors when
            invoked with ``return_tensors="pt"``.
        model: Module whose output exposes ``last_hidden_state``.

    Returns:
        numpy.ndarray: ``last_hidden_state[:, 0, :]`` with size-1 dimensions
        squeezed out, copied to host memory.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Run the inputs on whatever device the model lives on, so the function
    # keeps working after the model has been moved (e.g. ``model.to("cuda")``).
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {
            name: value.to(first_param.device) if torch.is_tensor(value) else value
            for name, value in inputs.items()
        }

    with torch.no_grad():  # inference only; no autograd bookkeeping needed
        hidden = model(**inputs).last_hidden_state
    return hidden[:, 0, :].squeeze().cpu().numpy()

In [None]:
# Embed every chunk of every document: one float32 array per file, one row
# per chunk, keyed by file name.
embeddings = {}
for filename, chunks in pdf_chunks.items():
    if not chunks:
        # A document without extractable text would produce a 1-D empty array,
        # which cannot be concatenated with the 2-D arrays of other files.
        print(f"Skipping {filename}: no text chunks to embed")
        continue
    chunk_embeddings = [
        embed_text(chunk, tokenizer, model)
        for chunk in tqdm(chunks, desc=f"Embedding chunks from {filename}", leave=False)
    ]
    embeddings[filename] = np.array(chunk_embeddings, dtype='float32')

Embedding chunks from Operations Manual- 2080.pdf:  20%|█▉        | 65/330 [03:13<12:37,  2.86s/it]

In [None]:
# Write each document's embedding array and its list of text chunks to two
# .npy files in the current working directory, named after the source PDF.
for filename, emb in embeddings.items():
    embeddings_path = f'{filename}_embeddings.npy'
    chunks_path = f'{filename}_chunks.npy'

    np.save(embeddings_path, emb)
    with open(chunks_path, 'wb') as chunk_file:
        np.save(chunk_file, pdf_chunks[filename])

In [None]:
# Stack the per-file embedding arrays row-wise (in dict order) into the single
# matrix that will be indexed; its second axis is the vector dimension.
per_file_embeddings = list(embeddings.values())
all_embeddings = np.concatenate(per_file_embeddings, axis=0)
dimension = all_embeddings.shape[1]

In [None]:
# Build a flat L2 index sized to the embedding dimension, add every vector to
# it, and write the populated index to disk.
index_path = 'faiss_index.index'
index = faiss.IndexFlatL2(dimension)
index.add(all_embeddings)
faiss.write_index(index, index_path)

In [None]:
# Build one (file name, row number) entry per embedding, walking the files in
# the same dict order used for the per-file embedding arrays.
# NOTE(review): the stored number is the running position across all files,
# not the chunk's position inside its own file — confirm the consumer of this
# metadata expects that.
metadata = []
offset = 0
for filename, emb in embeddings.items():
    n_rows = len(emb)
    metadata.extend((filename, offset + row) for row in range(n_rows))
    offset += n_rows

In [None]:
# Persist the metadata list alongside the index.
# NOTE(review): entries are (str, int) tuples; NumPy presumably stores them as
# a single string-typed array, so the numbers may load back as text — confirm
# against the code that reads this file.
np.save(file='metadata.npy', arr=metadata)