In [39]:
import os
import warnings
from dotenv import load_dotenv
from tqdm import tqdm
import faiss
import hashlib
import pickle
from langchain.embeddings.base import Embeddings


from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_ollama import OllamaEmbeddings
warnings.filterwarnings("ignore")
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
load_dotenv()

False

# LM Studio

In [40]:
import os
import warnings
from dotenv import load_dotenv
from tqdm import tqdm
import faiss
import hashlib
import pickle
import requests  # <-- you forgot this

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

warnings.filterwarnings("ignore")
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
load_dotenv()

# === Config ===
PDF_FOLDER = "Data"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150
PDF_BATCH_SIZE = 10
VECTORSTORE_DIR = "Test_data_Store"
MERGED_VECTORSTORE_PATH = "Test_data_merged"
CHUNK_SAVE_DIR = "Test_data_chun"
EMBED_MODEL = "text-embedding-nomic-embed-text-v1.5"   # LM Studio model name
LMSTUDIO_URL = "http://localhost:1234/v1/embeddings"

os.makedirs(VECTORSTORE_DIR, exist_ok=True)
os.makedirs(CHUNK_SAVE_DIR, exist_ok=True)


# === Utility: List PDFs ===
def list_pdf_files(data_dir):
    return [
        os.path.join(root, file)
        for root, _, files in os.walk(data_dir)
        for file in files if file.endswith(".pdf")
    ]


# === Utility: Split list into batches ===
def split_list(lst, chunk_size):
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]


# === Utility: Check if pickle is still valid ===
def is_pickle_up_to_date(pdf_paths, pkl_path):
    if not os.path.exists(pkl_path):
        return False
    pkl_mtime = os.path.getmtime(pkl_path)
    return all(os.path.getmtime(pdf) <= pkl_mtime for pdf in pdf_paths)


# === Step 1: Load PDFs ===
def load_pdf_batch(pdf_batch):
    all_docs = []
    for pdf in pdf_batch:
        try:
            loader = PyMuPDFLoader(pdf)
            pages = loader.load()
            for page in pages:
                page.metadata["source"] = os.path.basename(pdf)
            all_docs.extend(pages)
        except Exception as e:
            print(f"❌ Failed to load {pdf}: {e}")
    return all_docs


# === Step 2: Chunk documents and save as .pkl ===
def chunk_documents(docs, batch_index=None, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""],
        add_start_index=True,
        keep_separator=True,
    )

    print("✂️ Chunking documents...")
    chunks = splitter.split_documents(docs)

    # Add chunk_id using hash
    for chunk in chunks:
        content_hash = hashlib.md5(chunk.page_content.encode()).hexdigest()[:8]
        chunk.metadata["chunk_id"] = f"{chunk.metadata['source']}_{chunk.metadata.get('page', 0)}_{content_hash}"

    # Save to .pkl
    if batch_index is not None:
        chunk_file = os.path.join(CHUNK_SAVE_DIR, f"chunks_batch_{batch_index + 1}.pkl")
        with open(chunk_file, "wb") as f:
            pickle.dump(chunks, f)
        print(f"💾 Saved {len(chunks)} chunks to {chunk_file}")
    
    return chunks


# === LM Studio Embeddings Wrapper ===

class LMStudioEmbeddings(Embeddings):
    def __init__(self, model=EMBED_MODEL, base_url=LMSTUDIO_URL):
        self.model = model
        self.base_url = base_url

    def embed_query(self, text: str):
        return self._embed([text])[0]

    def embed_documents(self, texts: list[str]):
        return self._embed(texts)

    def _embed(self, texts: list[str]):
        response = requests.post(
            self.base_url,
            json={"model": self.model, "input": texts},
            headers={"Content-Type": "application/json"}
        )
        response.raise_for_status()
        data = response.json()

        if "data" not in data or len(data["data"]) == 0:
            raise ValueError(f"❌ No embeddings returned. Response: {data}")

        return [item["embedding"] for item in data["data"]]



# === Step 3: Create vectorstore ===
# def create_vectorstore(chunks, embeddings):
#     sample_vec = embeddings.embed_query("test")
#     index = faiss.IndexFlatL2(len(sample_vec))
#     vectorstore = FAISS(
#         embedding=embeddings,
#         index=index,
#         docstore=InMemoryDocstore(),
#         index_to_docstore_id={}
#     )
#     vectorstore.add_documents(chunks)
#     return vectorstore
# === Step 3: Create vectorstore ===
def create_vectorstore(chunks, embeddings):
    if not chunks:
        raise ValueError("❌ No chunks provided to create_vectorstore")

    test_vec = embeddings.embed_query("test")
    if not test_vec:
        raise ValueError("❌ Embedding model returned empty vector for test input")

    return FAISS.from_documents(chunks, embeddings)




# === Save/load vectorstore ===
def save_vectorstore(vs, path):
    vs.save_local(path)

def load_vectorstore(path, embeddings):
    return FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)


# === Main ===
if __name__ == "__main__":
    pdf_files = list_pdf_files(PDF_FOLDER)
    print(f"📄 Found {len(pdf_files)} PDF files.")

    embeddings = LMStudioEmbeddings(model=EMBED_MODEL, base_url=LMSTUDIO_URL)
    pdf_batches = list(split_list(pdf_files, PDF_BATCH_SIZE))

    for i, batch in enumerate(pdf_batches):
        print(f"\n📚 Processing batch {i+1}/{len(pdf_batches)}...")
        chunk_file = os.path.join(CHUNK_SAVE_DIR, f"chunks_batch_{i + 1}.pkl")

        if is_pickle_up_to_date(batch, chunk_file):
            print(f"✅ Using cached chunks from {chunk_file}")
            with open(chunk_file, "rb") as f:
                chunks = pickle.load(f)
        else:
            print("🔁 PDFs modified or chunks not found. Re-processing...")
            docs = load_pdf_batch(batch)
            print(f"✅ Loaded {len(docs)} pages")
            chunks = chunk_documents(docs, batch_index=i)

        print(f"✂️ Total chunks: {len(chunks)}")
        vs = create_vectorstore(chunks, embeddings)
        part_path = os.path.join(VECTORSTORE_DIR, f"vectorstore_part_{i + 1}")
        save_vectorstore(vs, part_path)
        print(f"💾 Saved vectorstore part {i + 1} at {part_path}")

    # === Merge parts ===
    print("\n🔄 Merging all vectorstore parts...")
    merged_vs = None
    for i in range(1, len(pdf_batches) + 1):
        part_path = os.path.join(VECTORSTORE_DIR, f"vectorstore_part_{i}")
        if not os.path.exists(part_path):
            print(f"⚠️ Missing part: {part_path}")
            continue
        part_vs = load_vectorstore(part_path, embeddings)
        if merged_vs is None:
            merged_vs = part_vs
        else:
            merged_vs.merge_from(part_vs)
        print(f"✅ Merged part {i}")

    if merged_vs:
        merged_vs.save_local(MERGED_VECTORSTORE_PATH)
        print(f"\n🎉 Final vectorstore saved to '{MERGED_VECTORSTORE_PATH}'")
    else:
        print("❌ No vectorstores were merged.")


📄 Found 2 PDF files.

📚 Processing batch 1/1...
✅ Using cached chunks from Test_data_chun\chunks_batch_1.pkl
✂️ Total chunks: 56
💾 Saved vectorstore part 1 at Test_data_Store\vectorstore_part_1

🔄 Merging all vectorstore parts...
✅ Merged part 1

🎉 Final vectorstore saved to 'Test_data_merged'


# Chat Wrapper

In [41]:

class LMStudioLLM:
    def __init__(self, model="humanizerai", base_url="http://localhost:1234/v1/chat/completions"):
        self.model = model
        self.base_url = base_url

    def generate(self, prompt, temperature=0.7, max_tokens=512):
        """Send a chat-style completion request to LM Studio"""
        payload = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": temperature,
            "max_tokens": max_tokens
        }
        response = requests.post(
            self.base_url,
            json=payload,
            headers={"Content-Type": "application/json"}
        )
        response.raise_for_status()
        data = response.json()

        if "choices" not in data or not data["choices"]:
            raise ValueError(f"❌ No response from LM Studio. Response: {data}")

        return data["choices"][0]["message"]["content"]


# Eg RAG Question

In [42]:
from langchain.chains import RetrievalQA

# Load your merged vectorstore
# ✅ Load merged vectorstore correctly
embeddings = LMStudioEmbeddings(model=EMBED_MODEL, base_url=LMSTUDIO_URL)
vs = FAISS.load_local(
    "Test_data_merged",      # folder path
    embeddings,              # your LMStudioEmbeddings instance
    allow_dangerous_deserialization=True
)


# Wrap retriever
retriever = vs.as_retriever(search_type="mmr", search_kwargs={"k": 5})

# LM Studio LLM
llm = LMStudioLLM(model="humanizerai")

# Ask a question
query = "What is written in temp.pdf about AI development?"
context_docs = retriever.get_relevant_documents(query)

# Build context
context_text = "\n\n".join([d.page_content for d in context_docs])
final_prompt = f"Answer the question based on the context below:\n\n{context_text}\n\nQuestion: {query}\nAnswer:"

answer = llm.generate(final_prompt)
print("🤖", answer)


🤖 The passage mentions several books and articles that discuss AI development, including Carl Jung's "Types of Personality" test and Isabel Briggs Myers' "Personality Type Questionnaire." However, there is no mention of "temp.pdf."
