In [1]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
import chromadb
from langchain.text_splitter import RecursiveCharacterTextSplitter
os.chdir('../')




In [2]:
%pwd

'c:\\Projects\\python\\journAI'

In [3]:
chromadb_path="./data/chroma_db"
pdf_folder = ".\data\pdf_data"


In [4]:
# Define model path
model_path = r".\models\deepseek-qwen-1.5B"  # Use raw string (r"") to avoid escape issues

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

# Load the model with optimized settings
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,  # Use float16 for better performance
    device_map="auto"  # Auto-detect GPU, fallback to CPU
)

# Set model to evaluation mode
model.eval()

# Function to generate text
def generate_text(prompt, max_length=1024):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)  # Send inputs to correct device
    with torch.no_grad():
        output = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [5]:
# **1. Load Improved Embedding Model**
embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# **2. Initialize ChromaDB**
chroma_client = chromadb.PersistentClient(path=chromadb_path)
collection = chroma_client.get_or_create_collection(name="document_vectors")

# **3. Function to Extract and Chunk PDF Text**
def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF and splits it into smaller chunks for better retrieval."""
    with open(pdf_path, "rb") as f:
        reader = PdfReader(f)
        text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
    
    # Split text into smaller chunks (200-500 tokens per chunk)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    chunks = text_splitter.split_text(text)
    
    return chunks

# **4. Ingest Multiple PDFs into Vector Store**
documents = {}

for pdf_file in os.listdir(pdf_folder):
    if pdf_file.endswith(".pdf"):
        file_path = os.path.join(pdf_folder, pdf_file)
        chunks = extract_text_from_pdf(file_path)
        
        # Store chunks in ChromaDB
        for i, chunk in enumerate(chunks):
            embedding = embedder.encode(chunk, convert_to_numpy=True).tolist()
            doc_id = f"{pdf_file}_chunk_{i}"
            collection.add(ids=[doc_id], embeddings=[embedding], metadatas=[{"filename": pdf_file, "content": chunk}])

print(f"📄 Processed and stored {len(documents)} PDF files with chunking.")


📄 Processed and stored 0 PDF files with chunking.


In [15]:
import re

# **Function to Clean Retrieved Text**
def clean_text(text):
    """Removes artifacts like <EOS>, <pad>, and fixes tokenized fragments."""
    text = text.replace("<EOS>", "").replace("<pad>", "").strip()  # Remove tokens
    text = re.sub(r"\s+", " ", text)  # Fix spacing issues
    sentences = text.split(". ")  # Split into sentences
    
    # Keep only meaningful sentences (length > 5 words)
    clean_sentences = [s.strip() for s in sentences if len(s.split()) > 5]
    
    return ". ".join(clean_sentences)

# **Updated Retrieval Function**
def retrieve_docs(query, top_k=5):
    """Retrieves most relevant document chunks and cleans the text."""
    query_vector = embedder.encode([query], convert_to_numpy=True).tolist()
    results = collection.query(query_embeddings=query_vector, n_results=top_k)
    
    retrieved_docs = [doc["content"] for doc in results["metadatas"][0]]
    
    # Clean and join retrieved context
    cleaned_docs = "\n\n".join([clean_text(doc) for doc in retrieved_docs if doc])
    
    return cleaned_docs

def generate_response(query):
    """Retrieves relevant docs, structures prompt, and generates only the response."""
    retrieved_context = retrieve_docs(query)

    prompt1 = f"""
    You are an AI assistant with access to research papers and textbooks.
    Use the retrieved knowledge to generate a structured and accurate answer.

    **Context from Documents:**
    {retrieved_context}

    **Question:**
    {query}

    ### Response:
    """
    print('At Response 1 stage')
    inputs = tokenizer(prompt1, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_length=1024)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # **Extract only the response by removing everything before "### Response:"**
    if "### Response:" in generated_text:
        response = generated_text.split("### Response:")[-1].strip()
    else:
        response = generated_text.strip()

    print('At summarization stage')
    prompt2= f"""Summarize the given text in clear, concise and professional manner. 
    Elaborate on any mathematical formulae, explanation needed to explain the concept described in the text.
    \nInput text:\n {response}

    ### Response:
    """

    inputs2 = tokenizer(prompt2,return_tensors="pt").to(model.device)
    with torch.no_grad():
        output2 = model.generate(**inputs2, max_length=1024)
    generated_text2 = tokenizer.decode(output2[0],skip_special_tokens=True)

    if "### Response:" in generated_text2:
        response2 = generated_text2.split("### Response:")[-1].strip()
    else:
        response2 = generated_text2.strip() 

    return response2



In [None]:
import pprint
query = "Explainthe segment anything model in detail!"
response = generate_response(query)

# print("\n🔹 Retrieved Context (Cleaned):\n", retrieve_docs(query))
pprint.pprint("\n🔹 Model Response:\n", response)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


At Response 1 stage


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


At summarization stage

🔹 Model Response:
 The segment anything model is a neural network designed for semantic segmentation tasks, where each pixel is assigned to a specific class. It uses a graph-based approach, with nodes representing pixels or regions and edges representing relationships. The architecture includes multiple layers, refining segmentation masks through skip connections, which preserve low-level features while capturing higher-level information. Attention mechanisms enhance focus, improving accuracy and efficiency. Performance is influenced by object class complexity, image size, and computational resources. This model has achieved top-tier results on benchmark datasets, making it a versatile tool for medical imaging and autonomous driving.


In [21]:
import pprint
pprint.pprint(response.split('\n'))

['The segment anything model is a neural network designed for semantic '
 'segmentation tasks, where each pixel is assigned to a specific class. It '
 'uses a graph-based approach, with nodes representing pixels or regions and '
 'edges representing relationships. The architecture includes multiple layers, '
 'refining segmentation masks through skip connections, which preserve '
 'low-level features while capturing higher-level information. Attention '
 'mechanisms enhance focus, improving accuracy and efficiency. Performance is '
 'influenced by object class complexity, image size, and computational '
 'resources. This model has achieved top-tier results on benchmark datasets, '
 'making it a versatile tool for medical imaging and autonomous driving.']


In [9]:
prompt = "Explain quantum computing in simple terms."
response = generate_text(prompt)
print("\nGenerated Response:\n", response)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Generated Response:
 Explain quantum computing in simple terms. How do you get a quantum computer to work? What are the main advantages and disadvantages of quantum computing compared to classical computing? Can you explain in simple terms the difference between quantum computing and classical computing? Why is quantum computing important in today's world? How do you get a quantum computer to work? What are the main advantages and disadvantages of quantum computing compared to classical computing? Can you explain in simple terms the difference between quantum computing and classical computing? Why is quantum computing important in today's world?

I need to explain quantum computing in simple terms. How do you get a quantum computer to work? What are the main advantages and disadvantages of quantum computing compared to classical computing? Can you explain in simple terms the difference between quantum computing and classical computing? Why is quantum computing important in today's wor