In [None]:
# Cell 1: Mount Google Drive and define the project directory used by later cells.
from google.colab import drive
import os

print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Google Drive mounted.")

# Base project directory inside Google Drive. Later cells build the knowledge-base,
# fine-tuned model and FAISS index paths from this value.
# IMPORTANT: Replace 'finetuned_+_rag_project' below with your actual project folder name.
# It's recommended to create this folder in your Drive first
# (e.g., in "My Drive/Colab Notebooks/finetuned_+_rag_project").
# NOTE(review): some Colab runtimes expose the Drive root as "MyDrive" (no space) —
# confirm that the "My Drive" spelling resolves on the runtime you use.
BASE_PROJECT_DIR = "/content/drive/My Drive/Colab Notebooks/finetuned_+_rag_project"

# Create the project directory if it doesn't exist (no error if it already does).
os.makedirs(BASE_PROJECT_DIR, exist_ok=True)
print(f"Base project directory set to: {BASE_PROJECT_DIR}")

# Optional: Change current working directory to your project folder
# This makes it easier to use relative paths later, but absolute paths are safer.
# %cd {BASE_PROJECT_DIR}
# print(f"Changed current working directory to: {os.getcwd()}")

Collecting accelerate
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-win_amd64.whl.metadata (10 kB)
Downloading accelerate-1.7.0-py3-none-any.whl (362 kB)
Downloading bitsandbytes-0.46.0-py3-none-win_amd64.whl (66.5 MB)
   ---------------------------------------- 0.0/66.5 MB ? eta -:--:--
    --------------------------------------- 1.6/66.5 MB 8.3 MB/s eta 0:00:08
   --- ------------------------------------ 5.2/66.5 MB 13.3 MB/s eta 0:00:05
   ----- ---------------------------------- 9.2/66.5 MB 15.4 MB/s eta 0:00:04
   ------- -------------------------------- 13.1/66.5 MB 16.1 MB/s eta 0:00:04
   ---------- ----------------------------- 17.0/66.5 MB 16.5 MB/s eta 0:00:03
   ------------ --------------------------- 21.2/66.5 MB 17.2 MB/s eta 0:00:03
   --------------- ------------------------ 25.2/66.5 MB 17.3 MB/s eta 0:00:03
   ----------------- ---------------------- 29.4/66.5 MB 17.6 MB/s eta 0:00:


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [None]:
!pip install faiss-cpu
import os
import shutil
import faiss # Make sure faiss is imported here too for consistency with path definition

global GDRIVE_KNOWLEDGE_BASE_PATH, GDRIVE_FINETUNED_MODEL_PATH, GDRIVE_FAISS_INDEX_PATH, KNOWLEDGE_BASE_PATH_TO_USE

GDRIVE_KNOWLEDGE_BASE_PATH = os.path.join(BASE_PROJECT_DIR, "rephrased_output.json")
GDRIVE_FINETUNED_MODEL_PATH = os.path.join(BASE_PROJECT_DIR, "finetuned_qwen")
GDRIVE_FAISS_INDEX_PATH = os.path.join(BASE_PROJECT_DIR, "my_faiss_index.bin")

LOCAL_TEMP_DIR = "/content/temp_rag_data"
os.makedirs(LOCAL_TEMP_DIR, exist_ok=True)

LOCAL_KNOWLEDGE_BASE_PATH = os.path.join(LOCAL_TEMP_DIR, "rephrased_output.json")

if not os.path.exists(LOCAL_KNOWLEDGE_BASE_PATH) and os.path.exists(GDRIVE_KNOWLEDGE_BASE_PATH):
    print(f"Copying knowledge base from Google Drive to local: {GDRIVE_KNOWLEDGE_BASE_PATH} -> {LOCAL_KNOWLEDGE_BASE_PATH}")
    shutil.copy(GDRIVE_KNOWLEDGE_BASE_PATH, LOCAL_KNOWLEDGE_BASE_PATH)
    print("Knowledge base copied to local storage.")
elif not os.path.exists(GDRIVE_KNOWLEDGE_BASE_PATH):
    print(f"Error: Knowledge base file not found in Google Drive at {GDRIVE_KNOWLEDGE_BASE_PATH}. Please ensure it's uploaded.")
else:
    print(f"Knowledge base already exists locally at {LOCAL_KNOWLEDGE_BASE_PATH}. Skipping copy.")

KNOWLEDGE_BASE_PATH_TO_USE = LOCAL_KNOWLEDGE_BASE_PATH

print(f"Knowledge base path for use: {KNOWLEDGE_BASE_PATH_TO_USE}")
print(f"Fine-tuned model path: {GDRIVE_FINETUNED_MODEL_PATH}")
print(f"FAISS index path (Drive): {GDRIVE_FAISS_INDEX_PATH}")

if not os.path.exists(GDRIVE_FINETUNED_MODEL_PATH):
    print(f"Warning: Fine-tuned model directory not found at {GDRIVE_FINETUNED_MODEL_PATH}.")
    print("Please ensure your fine-tuning script has completed successfully and saved the model to this location.")

In [None]:
# Cell 3: Install the packages used by the RAG pipeline and the Gradio UI, then ask
# the user to restart the runtime so the freshly installed versions are picked up.
# NOTE(review): none of the installs are version-pinned, so a re-run may resolve to
# different package versions than the ones this notebook was developed against.
# NOTE(review): an earlier cell already runs `!pip install faiss-cpu` and imports
# faiss; on a GPU runtime this cell then installs faiss-gpu as well — confirm which
# build is actually imported after the restart.
# NOTE(review): transformers and peft are imported by the pipeline cell but are not
# installed here — presumably they are provided by the runtime image; verify.
print("Starting RAG & Gradio library installation process...")
!pip install sentence-transformers -q
print("Installed sentence-transformers.")
import torch # Important for checking GPU availability
# Choose the FAISS build according to whether CUDA is visible to torch.
if torch.cuda.is_available():
    print("GPU detected. Installing faiss-gpu...")
    !pip install faiss-gpu -q
else:
    print("No GPU detected. Installing faiss-cpu...")
    !pip install faiss-cpu -q
print("Installed FAISS.")
!pip install gradio -q
print("Installed Gradio.")
!pip install -U bitsandbytes accelerate -q # Crucial for 8-bit quantization
print("Installed/Upgraded bitsandbytes and accelerate.")
!pip install numpy huggingface_hub -q
print("Installed numpy and huggingface_hub.")
print("\n--- ALL REQUIRED LIBRARY INSTALLATIONS COMPLETE ---")
print("Important: Please RESTART RUNTIME (Runtime -> Restart runtime) now,")
print("then re-run cells from the top (starting with Drive Mount)!")

For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

'D:\\aksharaplus\\qwen2.5-1.5b-instruct'

In [None]:
# Cell 4: RAG Pipeline Functions (MODIFIED FOR EXPLICIT PATH PASSING)
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from peft import PeftModel
import torch
from huggingface_hub import login
import os



# Load the knowledge base with nested "Introduction" structure
# Now takes 'knowledge_base_path' as an argument
def load_knowledge_base(knowledge_base_path):
    """Load the nested knowledge-base file and flatten it into a list of chunks.

    The file is read either as one JSON document or, if that fails, as JSON
    Lines (one JSON value per non-blank line). Each object is walked as::

        {topic: {section_name: {"chunks_level1": [{"text": ...}, ...],
                                "chunks_level2": [...],
                                "chunks_level3": [...]}}}

    Anything that does not match this shape (a non-object entry, a topic or
    section whose value is not an object, a level that is not a list, a chunk
    without "text") is skipped with a printed warning instead of aborting the
    whole load.

    Args:
        knowledge_base_path: Path of the JSON / JSONL file to read.

    Returns:
        A list of dicts with the keys "text", "title" (the topic) and
        "section" (the section name), in file order.

    Raises:
        FileNotFoundError: if the file does not exist.
        ValueError: if the file is neither valid JSON nor valid JSONL.
        RuntimeError: for any other failure, including the case where no
            valid chunk was found.
    """
    print(f"Attempting to load knowledge base from: {knowledge_base_path}")
    try:
        with open(knowledge_base_path, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError:
                # Not a single JSON document: rewind and retry as JSON Lines.
                f.seek(0)
                data = [json.loads(line.strip()) for line in f if line.strip()]

        # A single top-level object is treated like a one-element list.
        if isinstance(data, dict):
            data = [data]

        chunks = []
        for item in data:
            if not isinstance(item, dict):
                print(f"Warning: Expected object for knowledge base entry, found {type(item)}. Skipping.")
                continue
            for topic, sections in item.items():
                if not isinstance(sections, dict):
                    print(f"Warning: Expected object for topic '{topic}', found {type(sections)}. Skipping.")
                    continue
                for section_name, levels in sections.items():
                    if not isinstance(levels, dict):
                        print(f"Warning: Expected object for section '{section_name}' of topic '{topic}', found {type(levels)}. Skipping.")
                        continue
                    # Only the keys chunks_level1 .. chunks_level3 are read.
                    for i in range(1, 4):
                        level_key = f"chunks_level{i}"
                        if level_key not in levels:
                            continue
                        level_chunks = levels[level_key]
                        if not isinstance(level_chunks, list):
                            print(f"Warning: Expected list for {level_key}, found {type(level_chunks)}. Skipping.")
                            continue
                        for chunk in level_chunks:
                            if isinstance(chunk, dict) and "text" in chunk:
                                chunks.append({
                                    "text": chunk["text"],
                                    "title": topic,
                                    "section": section_name
                                })
                            else:
                                print(f"Warning: Skipping malformed chunk in {level_key}: {chunk}")

        if not chunks:
            # Caught by the generic handler below and surfaced as RuntimeError.
            raise ValueError("No valid chunks found in knowledge base after parsing.")
        print(f"Loaded {len(chunks)} chunks from knowledge base.")
        return chunks
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Knowledge base not found at {knowledge_base_path}. Please check the path and ensure it's in your Google Drive or copied locally.") from e
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON/JSONL format in {knowledge_base_path}: {e}. Inspect the file for malformed lines.") from e
    except Exception as e:
        raise RuntimeError(f"Error loading knowledge base: {e}") from e

# Build retriever with Sentence-BERT and FAISS (and save/load)
# Now takes 'faiss_index_path' as an argument for saving/loading the index
def build_retriever(chunks, faiss_index_path):
    """Return the embedding model and a FAISS index over ``chunks``.

    An index file already present at ``faiss_index_path`` is reused when it can
    be read and its vector count equals ``len(chunks)``; otherwise the chunk
    texts are re-embedded, a new flat L2 index is built, and a best-effort
    attempt is made to write it back to ``faiss_index_path``.

    Note that the reuse check compares counts only; it does not detect chunks
    whose text changed while the count stayed the same.

    Args:
        chunks: List of dicts, each with a "text" entry to embed.
        faiss_index_path: File path used to load and save the FAISS index.

    Returns:
        A tuple ``(retriever_model, index, chunks)``.

    Raises:
        ValueError: if ``chunks`` is empty (nothing to index).
    """
    print("Building retriever model and FAISS index...")

    if not chunks:
        # Without this guard the failure surfaces later as an opaque shape error.
        raise ValueError("Cannot build a retriever from an empty chunk list.")

    # The embedding model is needed on both the reuse and the rebuild path,
    # so load it exactly once.
    retriever_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Try loading an existing FAISS index first.
    if os.path.exists(faiss_index_path):
        try:
            index = faiss.read_index(faiss_index_path)
        except Exception as e:
            print(f"Error loading FAISS index: {e}. Rebuilding index.")
        else:
            print(f"FAISS index loaded from {faiss_index_path}.")
            if index.ntotal == len(chunks):
                return retriever_model, index, chunks
            print("Warning: Loaded FAISS index size does not match current chunk count. Rebuilding index.")

    # Build a fresh index from the chunk texts.
    chunk_texts = [chunk["text"] for chunk in chunks]
    embeddings = retriever_model.encode(chunk_texts, convert_to_tensor=False)
    embeddings = np.array(embeddings).astype('float32')
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # Saving is best-effort: a failed write must not stop the pipeline.
    try:
        faiss.write_index(index, faiss_index_path)
        print(f"FAISS index built and saved to {faiss_index_path}.")
    except Exception as e:
        print(f"Warning: Could not save FAISS index to Google Drive: {e}")
        print("Continuing without saving FAISS index.")

    return retriever_model, index, chunks

# Retrieve top-k relevant chunks (No change)
def retrieve_chunks(question, retriever_model, index, chunks, k=3):
    """Return the chunks whose embeddings are nearest to the question.

    Args:
        question: The user's question text.
        retriever_model: Object with an ``encode(list_of_str, convert_to_tensor=False)``
            method returning one embedding per input.
        index: FAISS-style index exposing ``search(queries, k)`` that returns
            ``(distances, indices)``.
        chunks: The chunk list the index was built from; result positions from
            the index are used to look chunks up in this list.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` chunk dicts, in the order returned by the index. Fewer than
        ``k`` are returned when the index holds fewer than ``k`` vectors.
    """
    print(f"Retrieving top {k} chunks for the question...")
    question_embedding = retriever_model.encode([question], convert_to_tensor=False)[0].astype('float32')
    _distances, indices = index.search(np.array([question_embedding]), k)
    # FAISS pads the result with -1 when it has fewer than k neighbours to offer.
    # Indexing the list with -1 would silently return the LAST chunk as a bogus
    # hit, so keep only positions that really exist in `chunks`.
    retrieved_chunks = [chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)]
    print(f"Retrieved {len(retrieved_chunks)} chunks.")
    return retrieved_chunks

# Load fine-tuned Qwen model (max_new_tokens updated)
# Now takes 'finetuned_model_path' as an argument
def load_qwen_local(finetuned_model_path):
    """Create a text-generation pipeline from the base Qwen model plus the fine-tuned adapter.

    The tokenizer and the PEFT adapter are read from ``finetuned_model_path``;
    the base weights ("Qwen/Qwen2.5-1.5B-Instruct") are loaded with 8-bit
    bitsandbytes quantization. The adapter is attached to the base model, not
    merged into it.

    Args:
        finetuned_model_path: Path passed to ``AutoTokenizer.from_pretrained``
            and ``PeftModel.from_pretrained``.

    Returns:
        A transformers "text-generation" pipeline configured with
        ``max_new_tokens=200``, ``do_sample=True``, ``temperature=0.7`` and
        ``top_p=0.9``.

    Raises:
        RuntimeError: if any loading step fails; the underlying exception is
            chained as ``__cause__``.
    """
    print(f"Loading fine-tuned Qwen model from: {finetuned_model_path}")
    try:
        base_model_name = "Qwen/Qwen2.5-1.5B-Instruct"

        # NOTE(review): trust_remote_code=True allows code shipped with the
        # checkpoint to run; confirm it is actually required for this model.
        tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path, trust_remote_code=True)

        # 8-bit loading has no "compute dtype" option in BitsAndBytesConfig
        # (that setting exists only for 4-bit), so only load_in_8bit is passed.
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            quantization_config=bnb_config,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
            trust_remote_code=True
        )
        print("Base Qwen model loaded.")

        model = PeftModel.from_pretrained(base_model, finetuned_model_path, trust_remote_code=True)
        # No merge_and_unload() is called, so the log must not claim a merge.
        print("PEFT adapter loaded.")

        model.eval()

        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            torch_dtype=torch.float16,
        )
        print("Text generation pipeline created.")
        return generator
    except Exception as e:
        # Chain the cause so the real failure stays visible in tracebacks.
        raise RuntimeError(f"Failed to load fine-tuned Qwen model: {e}") from e

# Generate answer with guidance (max_new_tokens for prompt adjusted)
def generate_answer(question, retrieved_chunks, generator):
    """Build the tutoring prompt, run the generator, and return only the answer text.

    Args:
        question: The user's question.
        retrieved_chunks: Iterable of dicts with "title", "section" and "text";
            they are rendered into the context part of the prompt.
        generator: Callable invoked as ``generator(prompt, max_new_tokens=200,
            do_sample=True, temperature=0.7, top_p=0.9)`` and returning a list
            whose first element has a "generated_text" entry.

    Returns:
        The generated answer with surrounding whitespace removed. When the
        generated text begins with the prompt, everything after the prompt is
        returned; otherwise the text after the last "Answer:" marker is used.
    """
    context = "\n".join(
        f"From {chunk['title']} - {chunk['section']}:\n{chunk['text']}" for chunk in retrieved_chunks
    )

    # Ordered rules: the first rule with any keyword found in the question wins.
    # Matching is a plain substring test on the lower-cased question.
    lowered_question = question.lower()
    guidance_rules = (
        (("explain", "how"), "Explain the answer for the given question precisely in step-by-step manner and do not hallucinate the answer, give answer in 200 tokens"),
        (("why",), "Explain the answer for the given question in step-by-step manner, give answer in 200 tokens"),
        (("derive", "prove"), "Provide a mathematical or logical derivation and justify each step clearly, give answer in 200 tokens"),
        (("difference", "compare"), "Compare the concepts side by side, listing their differences and similarities clearly, give answer in 200 tokens"),
        (("what is", "define"), "Provide a clear and concise definition with relevant examples, give answer in 200 tokens"),
    )
    guidance = "Give a direct, informative, and relevant answer, give answer in 200 tokens"
    for keywords, rule_guidance in guidance_rules:
        if any(word in lowered_question for word in keywords):
            guidance = rule_guidance
            break

    prompt = f"""You are an AI tutor helping a student learn machine learning. Answer concisely and clearly.

    Question: {question}
    Context from the textbook:
    {context}

    Instruction: {guidance}

    Answer:"""
    print(f"Generating answer with prompt length: {len(prompt)} characters.")
    response = generator(prompt, max_new_tokens=200, do_sample=True, temperature=0.7, top_p=0.9)
    generated_text = response[0]["generated_text"]
    # Prefer cutting off the echoed prompt: splitting on "Answer:" would throw
    # away most of a completion that itself contains that marker
    # (e.g. "... Final Answer: 42").
    if generated_text.startswith(prompt):
        answer = generated_text[len(prompt):]
    else:
        answer = generated_text.split("Answer:")[-1]
    return answer.strip()

# Lazily initialised pipeline components, kept at notebook level so the expensive
# loads happen only once. rag_pipeline() fills all four on its first call and
# treats `_generator is None` as the "not initialised yet" signal.
# Flattened knowledge-base chunks returned by load_knowledge_base().
_chunks = None
# Embedding model returned by build_retriever().
_retriever_model = None
# FAISS index returned by build_retriever().
_index = None
# Text-generation pipeline returned by load_qwen_local().
_generator = None

# Main RAG pipeline function
def rag_pipeline(question):
    """Answer one question end to end and print the answer with its supporting chunks.

    On the first call (detected by ``_generator is None``) the knowledge base,
    retriever and generator are loaded into the module-level ``_chunks``,
    ``_retriever_model``, ``_index`` and ``_generator`` and reused afterwards.

    Args:
        question: The question to answer.

    Returns:
        The generated answer string, or ``None`` if any step failed. Failures
        are reported on stdout together with the traceback rather than raised.
    """
    global _chunks, _retriever_model, _index, _generator

    print(f"\n--- Running RAG pipeline for: '{question}' ---")
    try:
        if _generator is None:
            print("Initializing RAG components for the first time...")

            # Use the global variables (defined in Cell 2) to pass as arguments
            _chunks = load_knowledge_base(KNOWLEDGE_BASE_PATH_TO_USE)
            _retriever_model, _index, _chunks = build_retriever(_chunks, GDRIVE_FAISS_INDEX_PATH)
            # Assigned last, so a failure above leaves _generator as None and
            # the next call retries the whole initialisation.
            _generator = load_qwen_local(GDRIVE_FINETUNED_MODEL_PATH)
            print("RAG components initialized.")

        retrieved_chunks = retrieve_chunks(question, _retriever_model, _index, _chunks, k=3)
        answer = generate_answer(question, retrieved_chunks, _generator)

        print(f"\n📌 Question: {question}")
        print(f"💡 Answer: {answer}")
        print("\n🔍 Retrieved Chunks (showing first 200 chars):")
        for i, chunk in enumerate(retrieved_chunks, 1):
            print(f"{i}. From {chunk['title']} - {chunk['section']}:\n{chunk['text'][:200]}...\n")
        return answer
    except Exception as e:
        print(f"Error in RAG pipeline: {e}")
        # Print the traceback as well: the message alone rarely shows which
        # stage (loading, retrieval, generation) actually failed.
        import traceback
        traceback.print_exc()
        return None

# Example usage: run the pipeline once for every question in the list below.
# The commented-out entries are further sample questions; uncomment any of them
# to include it in the run. The first call also triggers component initialisation.
# NOTE(review): presumably __name__ is "__main__" inside the notebook kernel, so
# this block runs whenever the cell is executed — confirm.
if __name__ == "__main__":
    questions = [
        "What is logistic regression? Explain in short",
        # "How does SVM work? Explain in short",
        # "What are evaluation metrics in machine learning? Explain in short",
        # "how logistic regression works? explain in short",
        # "what does k stands in K-NN, explain in short",
        # "how accuracy is calculated?, answer in short.",
        # "what is the difference between ridge and Lasso regularization, explain in short",
        # "how naive bayes algorithem works, explain in short",
        # "explain me how gradient descent works, explain in short",
        # "what is OvO in multiclass classification",
        # "derive the gradient descent",
        # "In simple terms, what is the role of the sigmoid function in a logistic regression model?",
        # "Explain how the decision boundary transforms the logit calculation into a probability.",
        # "Prove the sigmoid’s optimality for logistic regression's text categorization using model training"
    ]

    # Questions are processed one after another; rag_pipeline prints its own output.
    for question in questions:
        rag_pipeline(question)

Collecting auto-gptq
  Downloading auto_gptq-0.7.1.tar.gz (126 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Discarding https://files.pythonhosted.org/packages/90/e5/b22697903982284fe284568fb2663a2196694a8eee637f5cf4ccfe435a38/auto_gptq-0.7.1.tar.gz (from https://pypi.org/simple/auto-gptq/) (requires-python:>=3.8.0): Requested auto-gptq from https://files.pythonhosted.org/packages/90/e5/b22697903982284fe284568fb2663a2196694a8eee637f5cf4ccfe435a38/auto_gptq-0.7.1.tar.gz has inconsistent version: expected '0.7.1', but metadata has '0.7.1+cu121'
  Downloading auto_gptq-0.7.0.tar.gz (124 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Discarding https://files.pythonhosted.org/packages/34/71/c3e73cf17681f6ff4754ef8f4cb8b67af3def230fc8711eac1250bbd78d5/auto_gptq-0.7.0.tar.gz (from https://pypi.org/simple/auto-gptq/) (requires-python:>=3.8.0): Requested auto-gptq from https://


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
!pip install pyngrok
# Cell 5: Gradio Interface with Ngrok (MODIFIED FOR EXPLICIT PATH PASSING)
import gradio as gr
from pyngrok import ngrok
import os
# from google.colab import userdata # Uncomment if using Colab Secrets for ngrok

# Set your ngrok auth token (replace with your actual token!)
# os.environ["NGROK_AUTH_TOKEN"] = userdata.get("NGROK_AUTH_TOKEN") # if using Colab Secrets
# ngrok.set_auth_token(os.environ["2yMKm5Da2PQFR0VDJopwwhv0F5E_2r6Zy6QRYSkinWggCrFoF"]) # Or directly set it
ngrok.set_auth_token("2yMKm5Da2PQFR0VDJopwwhv0F5E_2r6Zy6QRYSkinWggCrFoF")

# Wrapper function for the RAG pipeline to be used by Gradio
def rag_inference(question):
    """Gradio callback: answer ``question`` and return ``(answer, retrieved_context)``.

    The shared components (``_chunks``, ``_retriever_model``, ``_index``,
    ``_generator``) are loaded on the first call, i.e. while ``_generator`` is
    still ``None``. Errors never propagate to Gradio: they are printed together
    with a traceback and reported through the two returned strings.

    Args:
        question: Text entered by the user.

    Returns:
        A 2-tuple of strings: the generated answer and the retrieved chunks
        rendered as Markdown-style text, or an error message plus a short
        status text when something failed.
    """
    global _chunks, _retriever_model, _index, _generator

    # --- One-time initialisation -------------------------------------------
    if _generator is None:
        print("Initializing RAG components for the first time (Gradio context)...")
        try:
            print(f"DEBUG (Gradio): Attempting to load knowledge base from {KNOWLEDGE_BASE_PATH_TO_USE}")
            _chunks = load_knowledge_base(KNOWLEDGE_BASE_PATH_TO_USE)
            print("DEBUG (Gradio): Knowledge base loaded.")

            print(f"DEBUG (Gradio): Attempting to build retriever from {GDRIVE_FAISS_INDEX_PATH}")
            _retriever_model, _index, _chunks = build_retriever(_chunks, GDRIVE_FAISS_INDEX_PATH)
            print("DEBUG (Gradio): Retriever built.")

            print(f"DEBUG (Gradio): Attempting to load Qwen model from {GDRIVE_FINETUNED_MODEL_PATH}")
            _generator = load_qwen_local(GDRIVE_FINETUNED_MODEL_PATH)
            print("DEBUG (Gradio): Qwen model loaded.")

            print("RAG components initialized in Gradio context.")
        except Exception as init_error:
            print(f"CRITICAL ERROR (Gradio) during RAG component initialization: {init_error}")
            import traceback
            traceback.print_exc()  # full traceback for the notebook log
            failure_message = f"Error during initialization: {init_error}"
            return failure_message, "Initialization failed."

    # --- Retrieval and generation ------------------------------------------
    try:
        print("DEBUG (Gradio): Components are ready. Proceeding with retrieval and generation.")
        hits = retrieve_chunks(question, _retriever_model, _index, _chunks, k=3)
        answer = generate_answer(question, hits, _generator)

        # One bold header plus body per retrieved chunk, separated by blank lines.
        rendered_hits = []
        for hit in hits:
            rendered_hits.append(f"**From {hit['title']} - {hit['section']}:**\n{hit['text']}")
        return answer, "\n\n".join(rendered_hits)
    except Exception as run_error:
        print(f"Error in Gradio RAG inference: {run_error}")
        import traceback
        traceback.print_exc()  # full traceback for the notebook log
        return f"Error: {run_error}", "Could not retrieve chunks."

# Assemble the Gradio interface: one question box in, two text boxes out
# (the generated answer and the retrieved context), wired to rag_inference.
question_input = gr.Textbox(lines=2, placeholder="Enter your machine learning question here...")
answer_output = gr.Textbox(label="Generated Answer")
context_output = gr.Textbox(label="Retrieved Chunks (Context)")

# Sample questions offered as clickable examples in the UI.
example_questions = [
    "What is logistic regression?",
    "How does SVM work?",
    "Explain gradient descent.",
    "What is the difference between ridge and Lasso regularization?",
    "Derive the formula for accuracy."
]

iface = gr.Interface(
    fn=rag_inference,
    inputs=question_input,
    outputs=[answer_output, context_output],
    title="Qwen RAG Machine Learning Tutor",
    description="Ask questions about machine learning, and I'll retrieve relevant information and answer using a fine-tuned Qwen model.",
    examples=example_questions
)

# launch(debug=True) keeps this cell running until the server is interrupted, so
# anything printed after it would only appear once the app has already stopped.
# The usage hints are therefore printed before launching.
print("Launching Gradio interface...")
print("\n--- Gradio interface launching. Look for the public URL below ---")
print("It will be something like 'Running on public URL: https://[random-string].gradio.live'")
print("Copy this URL to use it from VS Code or a web browser.")

# share=True asks Gradio for a public URL; debug=True streams errors into the cell output.
iface.launch(share=True, debug=True)