In [1]:
import os
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Path to your data folder
DATA_PATH = "Data/"

# Function to extract text from PDFs
def load_pdfs(data_path):
    texts = []
    for file in os.listdir(data_path):
        if file.endswith(".pdf"):
            doc = fitz.open(os.path.join(data_path, file))
            pdf_text = ""
            for page in doc:
                pdf_text += page.get_text()
            texts.append({"file": file, "text": pdf_text})
    return texts

# Load PDFs
documents = load_pdfs(DATA_PATH)

print(f"Loaded {len(documents)} PDFs")
print("Example file:", documents[0]["file"])
print("First 500 characters:\n", documents[0]["text"][:500])


Loaded 1 PDFs
Example file: attention is all you need.pdf
First 500 characters:
 Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aidan@cs.toronto.edu
Łukasz Kaiser∗
Google Brain
lukaszkaiser@google.com
Illia Polosukhin∗‡
illia.polosukhin@gmail.com
Abstract
The dominant sequence transduction models are based on complex recurrent or


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

all_chunks = []
for doc in documents:
    chunks = splitter.split_text(doc["text"])
    for i, chunk in enumerate(chunks):
        all_chunks.append({
            "file": doc["file"],
            "chunk_id": i,
            "text": chunk
        })

print(f"Total chunks created: {len(all_chunks)}")
print("Example chunk:\n", all_chunks[0]["text"][:300])


Total chunks created: 41
Example chunk:
 Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aid


In [3]:
import chromadb
from chromadb.utils import embedding_functions

chroma_client = chromadb.Client()

embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"  
)

collection = chroma_client.get_or_create_collection(
    name="arxiv_papers",
    embedding_function=embedding_func
)

for chunk in all_chunks:
    collection.add(
        documents=[chunk["text"]],
        metadatas=[{"file": chunk["file"], "chunk_id": chunk["chunk_id"]}],
        ids=[f'{chunk["file"]}_{chunk["chunk_id"]}']
    )

print("✅ All chunks stored in ChromaDB!")
print("Collection size:", collection.count())


  from .autonotebook import tqdm as notebook_tqdm


✅ All chunks stored in ChromaDB!
Collection size: 41


In [4]:
# Example query
query = "Explain Transformers in simple terms"
#query = "Why do we need positional encoding in Transformers?"

# Search in ChromaDB
results = collection.query(
    query_texts=[query],
    n_results=5
)

for i, doc in enumerate(results["documents"][0]):
    print(f"\nResult {i+1}:")
    print(doc[:300], "...")
    print("From:", results["metadatas"][0][i]["file"])



Result 1:
Here, the encoder maps an input sequence of symbol representations (x1, ..., xn) to a sequence
of continuous representations z = (z1, ..., zn). Given z, the decoder then generates an output
sequence (y1, ..., ym) of symbols one element at a time. At each step the model is auto-regressive
[9], consum ...
From: attention is all you need.pdf

Result 2:
2
Figure 1: The Transformer - model architecture.
wise fully connected feed-forward network. We employ a residual connection [10] around each of
the two sub-layers, followed by layer normalization [1]. That is, the output of each sub-layer is
LayerNorm(x + Sublayer(x)), where Sublayer(x) is the func ...
From: attention is all you need.pdf

Result 3:
In this work we propose the Transformer, a model architecture eschewing recurrence and instead
relying entirely on an attention mechanism to draw global dependencies between input and output.
The Transformer allows for signiﬁcantly more parallelization and can reach a new state of the

# test with llama instead of mistral

In [5]:
import ollama


response = ollama.chat(
    model="llama3.2:3b",
    messages=[
        {"role": "system", "content": "You are an expert AI/ML assistant."},
        {"role": "user", "content": "Explain Transformers in simple terms."},
    ]


)

print(response["message"]["content"])


Transformers! They're a type of machine learning model that's revolutionized the way we process and understand natural language.

**What does "Transformer" mean?**

The term "Transformer" comes from the idea that these models "transform" or change the way we think about processing sequences (like words in a sentence) into a more efficient and powerful approach.

**How do Transformers work?**

Traditional machine learning models, like Recurrent Neural Networks (RNNs), process sequences one step at a time. They look at each word in the sequence, one by one, and use that information to make predictions about the next word.

Transformers take a different approach. Instead of looking at individual words, they break down the entire sequence into smaller groups called "attention heads." Each attention head looks at all the other words in the sequence simultaneously, but only considers those that are relevant to it (based on their similarity).

This allows the model to capture long-range relat

# test with llama and option parameters ( max token, and temperature)

In [6]:
import ollama

context = "\n\n".join(results["documents"][0])

response = ollama.chat(
    model="llama3.2:3b",
    messages=[
        {"role": "system", "content": "You are an expert AI/ML assistant."},
        {"role": "user", "content": "Explain Transformers in simple terms and do not exceed 150 characters."},
    ],


    options={
    "num_predict": 150,#,        # Longer technical answers
     "temperature": 0.1       # More focused, less creative
    # "top_p": 0.8,             # More precise token selection
    # "repeat_penalty": 1.05,    # Reduce repetition
    # "num_ctx": 4096           # Larger context for papers
}
)

print(response["message"]["content"])

Transformers: Neural network architecture that uses attention mechanisms to process sequential data (e.g., text, speech) by transforming input into a fixed-length vector representation.


# test withh llama and options and with prompt based on my context

In [9]:
import ollama

context = "\n\n".join(results["documents"][0])
query="what is encoder in transformers"
response = ollama.chat(
    model="llama3.2:3b",
    messages=[
        {
            "role": "system", 
            "content": "You are an expert AI/ML assistant should reply based on context else reply your inquiry is not available, Keep responses clear, concise and under 100 characters maximum. Be direct and to the point."
        },
        {
            "role": "user", 
            "content": f"Based on this context: {context}\n\nQuestion: {query}\n\nKeep your answer under 150 characters."
        }
    ],
    options={
        "num_predict": 150,      # Longer technical answers
        "temperature": 0.1       # More focused, less creative
        # "top_p": 0.8,          # More precise token selection
        # "repeat_penalty": 1.05, # Reduce repetition
        # "num_ctx": 4096        # Larger context for papers
    }
)

#print(response["message"]["content"])
# Using textwrap (recommended)
import textwrap

output = response["message"]["content"]
wrapped_output = textwrap.fill(output, width=80)
print(wrapped_output)

In the Transformer model, the Encoder consists of a stack of identical layers
with two sub-layers: 1. Multi-head self-attention mechanism 2. Simple fully
connected feed-forward network with residual connections and layer
normalization.


In [17]:
#chekc context
print(context)

Here, the encoder maps an input sequence of symbol representations (x1, ..., xn) to a sequence
of continuous representations z = (z1, ..., zn). Given z, the decoder then generates an output
sequence (y1, ..., ym) of symbols one element at a time. At each step the model is auto-regressive
[9], consuming the previously generated symbols as additional input when generating the next.
The Transformer follows this overall architecture using stacked self-attention and point-wise, fully
connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,
respectively.
3.1
Encoder and Decoder Stacks
Encoder:
The encoder is composed of a stack of N = 6 identical layers. Each layer has two
sub-layers. The ﬁrst is a multi-head self-attention mechanism, and the second is a simple, position-
2
Figure 1: The Transformer - model architecture.
wise fully connected feed-forward network. We employ a residual connection [10] around each of

2
Figure 1: The Transformer - model

# adding smart context to simplify the long context

In [11]:
def get_optimized_context(context, question, max_length=300):
    """More efficient context trimming"""
    # Split into sentences and keep most relevant ones
    sentences = context.split('. ')
    # Simple keyword matching for relevance
    question_words = set(question.lower().split())
    scored_sentences = []
    
    for sentence in sentences:
        score = sum(1 for word in sentence.lower().split() if word in question_words)
        scored_sentences.append((score, sentence))
    
    # Sort by relevance and take top sentences
    scored_sentences.sort(reverse=True)
    return '. '.join([s[1] for s in scored_sentences[:5]])[:max_length]

In [21]:
#check smar conext
smart_context = get_optimized_context(context, query, 600)
print(smart_context)

We present these results in Table 3.
In Table 3 rows (A), we vary the number of attention heads and the attention key and value dimensions,
keeping the amount of computation constant, as described in Section 3.2.2. This

In this work we propose the Transformer, a model architecture eschewing recurrence and instead
relying entirely on an attention mechanism to draw global dependencies between input and output.
The Transformer allows for signiﬁcantly more parallelization and can reach a new state of the art in
translation quality after being trained for as little as twelve hours on eight P100 GP


In [12]:
import ollama
query="what transformers in simple terms "
context = "\n\n".join(results["documents"][0])
smart_context = get_optimized_context(context, query, 400)
smart_context=context
max_tokens=200
response = ollama.chat(
    model="llama3.2:3b",
    messages=[
        {
            "role": "system", 
            "content": "You are an expert AI/ML assistant should reply based on context else reply your inquiry is not available, Keep responses clear, concise and under 100 characters maximum. Be direct and to the point."
        },
        {
            "role": "user", 
            "content": f"Based on this context: {smart_context}\n\nQuestion: {query}\n\nKeep your answer under {max_tokens} characters."
        }
    ],
    options={
        "num_predict": max_tokens,      # Longer technical answers
        "temperature": 0.1       # More focused, less creative
        # "top_p": 0.8,          # More precise token selection
        # "repeat_penalty": 1.05, # Reduce repetition
        # "num_ctx": 4096        # Larger context for papers
    }
)

# print(response["message"]["content"])
import textwrap

# Using textwrap (recommended)
output = response["message"]["content"]
wrapped_output = textwrap.fill(output, width=80)
print(wrapped_output)

Transformer: Model that uses self-attention mechanism to draw global
dependencies between input & output sequences.


In [13]:
import time
import logging

# Configure logging once in your app entrypoint
logging.basicConfig(
    filename="arxiv_bot_optimized.log",  # log file
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def ask_arxiv_bot_with_sources(question, top_k=10):
    if not question.strip():
        return "Please enter a question."
    
    model_start = time.time()  # Start timing
    
    try:
        # Retrieve top-k chunks
        results = collection.query(
            query_texts=[question],
            n_results=top_k
        )
       
        if not results["documents"][0]:
            return "No relevant content found in the PDFs."
        
        context = "\n\n".join(results["documents"][0])
        max_tokens = 400
        smart_context = get_optimized_context(context, question, max_tokens)
        
        response = ollama.chat(
            model="llama3.2:3b",
            messages=[
                {
                    "role": "system", 
                    "content": "You are an expert AI/ML assistant should reply based on context else reply your inquiry is not available, Keep responses clear, concise and under 100 characters maximum. Be direct and to the point."
                },
                {
                    "role": "user", 
                    "content": f"Based on this context: {smart_context}\n\nQuestion: {question}\n\nKeep your answer under {max_tokens} characters."
                }
            ],
            options={
                "num_predict": max_tokens,      # Longer technical answers
                "temperature": 0.1       # More focused, less creative
                # "top_p": 0.8,          # More precise token selection
                # "repeat_penalty": 1.05, # Reduce repetition
                # "num_ctx": 4096        # Larger context for papers
            }
        )
        
        generated_text = response["message"]["content"]  # Extract text using dict access
        logging.info("end time %s", time.time() - model_start)
        
        # Collect sources
        sources = [md["file"] for md in results["metadatas"][0]]
        
        return f"{generated_text}\n\nSources: {', '.join(set(sources))}"
        
    except Exception as e:
        return f"Error generating answer: {e}"

In [14]:
def add_pdfs_to_collection(file_paths):
    for path in file_paths:
        # path is now a string path
        doc = fitz.open(path)  
        text = ""
        for page in doc:
            text += page.get_text()
        chunks = splitter.split_text(text)
        for i, chunk in enumerate(chunks):
            collection.add(
                documents=[chunk],
                metadatas=[{"file": os.path.basename(path), "chunk_id": i}],
                ids=[f'{os.path.basename(path)}_{i}']
            )
    return " PDFs added to collection!"


In [None]:
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("## Arxiv RAG AI optimized Assistant")
    
    # Upload PDFs (fixed)
    pdf_input = gr.File(file_types=[".pdf"], type="filepath", label="Upload new PDFs", file_count="multiple")
    
    # Question input
    question_input = gr.Textbox(lines=2, placeholder="Type your question here...", label="Ask a question")
    
    # Answer output
      # Answer output
    answer_output = gr.Textbox(
    label="Answer", 
    lines=10,  # Increase from default ~3 to 10 lines
    max_lines=20,  # Allow expansion up to 20 lines
    show_copy_button=True  # Add copy button for long answers
)
    # Buttons
    upload_btn = gr.Button("Add PDFs")
    ask_btn = gr.Button("Ask Question")

    upload_btn.click(lambda files: add_pdfs_to_collection(files), inputs=[pdf_input], outputs=[answer_output])
    ask_btn.click(
    fn=ask_arxiv_bot_with_sources,  # must return a string
    inputs=[question_input],
    outputs=[answer_output]
)

    
demo.launch()


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




Batches: 100%|██████████| 1/1 [00:00<00:00, 11.31it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 17.63it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 17.44it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 19.43it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 18.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 17.55it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.15it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 11.29it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 16.98it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 16.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 17.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 18.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 18.29it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 18.50it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 18.80it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 21.02it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 22.39it/s]
Batches: 1

In [None]:
# more enahcememnt 
# chunking by page or by paragpraph based on my data
# example AI engineer guide by page 