<a href="https://colab.research.google.com/github/it21813702/ctse_ml_chatbot/blob/main/CTSE_ML_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set Up the Environment and Load Data

In [1]:
# install libraries
# langchain       - main framework to build document-based LLM applications
# faiss-cpu       - fast vector search for document similarity
# tiktoken        - helps in chunking text properly by token length
# python-dotenv   - loads environment variables from .env files (not used, but good to have)
# python-pptx     - used to extract text from PowerPoint slides

!pip install -U langchain faiss-cpu tiktoken python-dotenv python-pptx langchain-community sentence-transformers
!pip install pypdf



In [None]:
# Unzip lecture notes into working directory in Colab

# upload folder/file from computer.
# folder should contain lecture PDFs or PPTX files

#         path in collab                    d- destination         path to folder to extract into
!unzip -q "/content/CTSE_Lecture_Notes.zip" -d "/content/CTSE_Lecture_Notes"
print("📁 Files extracted to /content/CTSE_Lecture_Notes")

replace /content/CTSE_Lecture_Notes/ML Lec 2 - Part 2 LLM.pdf? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# prepare for loading files
!pip install pypdf

In [None]:
import os

# Directory the lecture archive was extracted into; later cells read from it.
notes_dir = "/content/CTSE_Lecture_Notes"

# Show what was extracted so missing or misnamed files are easy to spot.
print("Files in your CTSE_Lecture_Notes folder:")
notes_dir_listing = os.listdir(notes_dir)
print(notes_dir_listing)

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from pptx import Presentation
from langchain.schema import Document

# Helper function to load .pptx files
def load_pptx_as_document(path):
    """Read a .pptx file and return its text as a single Document.

    The text of every shape that exposes a ``text`` attribute is collected,
    slide by slide, and joined with newlines. Shapes without that attribute
    are skipped.

    Args:
        path: Path to the PowerPoint file.

    Returns:
        A Document whose ``page_content`` is the joined text and whose
        ``metadata["source"]`` is the file's base name (no directory part).
    """
    presentation = Presentation(path)

    shape_texts = [
        shape.text
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    ]

    return Document(
        page_content="\n".join(shape_texts),
        metadata={"source": os.path.basename(path)},
    )


# Load and convert all documents into LangChain-compatible format.
# Starting from an empty list keeps this cell safe to re-run.
documents = []

# Sort the listing: os.listdir() returns entries in arbitrary order, and a stable
# order keeps the chunk order (and the sample printed below) reproducible.
for file in sorted(os.listdir(notes_dir)):
    path = os.path.join(notes_dir, file)
    lowered_name = file.lower()  # so ".PDF" / ".PPTX" are matched as well

    if lowered_name.endswith(".pdf"):
        loader = PyPDFLoader(path)
        pdf_docs = loader.load()  # one doc per page
        documents.extend(pdf_docs)

    elif lowered_name.endswith(".pptx"):
        doc = load_pptx_as_document(path)  # one doc per presentation
        documents.append(doc)

    # Any other file type is ignored.

# Fail here with a clear message rather than an IndexError on documents[0] below.
if not documents:
    raise FileNotFoundError(f"No .pdf or .pptx files found in {notes_dir}")

print(f"✅ Total documents loaded: {len(documents)}")
print("📄 Sample content:\n")
print(documents[0].page_content[:500])


# Create Chunks and Store in a Vector Database (FAISS)

In [None]:
!pip install -U langchain-openai

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break every loaded document into smaller pieces that overlap slightly:
#   chunk_size=1000    -> upper bound on characters per chunk
#   chunk_overlap=150  -> characters shared between neighbouring chunks to preserve context
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
chunked_documents = splitter.split_documents(documents)

# Report the chunk count, then preview the start of the first chunk.
print("✅ Total chunks created:", len(chunked_documents))
print("📄 Sample chunk preview:\n")
sample_chunk = chunked_documents[0]
print(sample_chunk.page_content[:500])



# Import from langchain_community (the package PyPDFLoader is already imported from):
# the `langchain.embeddings` / `langchain.vectorstores` paths are deprecated
# re-exports of these same classes, and this notebook installs unpinned versions.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Load a high-quality, free embedding model (no API key needed).
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Embed every chunk and build the vector database from the results.
vector_database = FAISS.from_documents(
    documents=chunked_documents,
    embedding=embedding_model
)

# Show how many chunks were stored; this should match the chunk count printed above.
print("✅ Total chunks stored in FAISS:", vector_database.index.ntotal)

# Save the FAISS index to a local folder so it can be reused later.
vector_database.save_local("ctse_vector_index_free")
print("💾 FAISS vector database saved as: ctse_vector_index_free")

# Build a Question-Answering Chatbot Using Retrieval (FAISS) + a Hugging Face LLM

In [None]:
# Authenticate with the Hugging Face Hub using a token stored in Colab Secrets.

from huggingface_hub import login
from google.colab import userdata

# Load token securely from Colab Secrets (the secret must be named HF_TOKEN).
hf_token = userdata.get("HF_TOKEN")

# Report whether a token was found, without ever printing the token itself.
print("API Key Loaded:", "Yes" if hf_token else "No")

# Validate before logging in: login() switches to an interactive prompt when it
# is given no token, which an unattended "Run all" cannot answer.
if not hf_token:
    raise ValueError(
        "HF_TOKEN is empty. Add it under Colab Secrets and grant this notebook access."
    )

# Log in to HuggingFace
login(token=hf_token)

In [None]:
!pip install -U transformers accelerate huggingface-hub

In [None]:
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Load the HuggingFace token and authenticate. This repeats the login cell above
# so that this cell also works when run on its own.
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token)

# 🧠 Load Mistral-7B-Instruct model + tokenizer from HuggingFace
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Choose precision and placement from the available hardware.
# In float32 a 7B-parameter model needs roughly 28 GB (7e9 params x 4 bytes),
# which is likely more than a standard Colab runtime provides. On a GPU, load in
# float16 (half the size) and let accelerate place the weights directly on the
# device via device_map="auto", avoiding a full float32 copy in CPU RAM first.
use_gpu = torch.cuda.is_available()
if use_gpu:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
    )
else:
    # CPU fallback keeps float32, as before; expect it to be slow and memory-hungry.
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)

# 🔁 Create a generation pipeline
# No `device` argument: the pipeline runs on whatever device the model was loaded
# onto, and `device` must not be combined with a model placed by device_map.
mistral_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,  # reasonable response length
)

print("✅ Mistral model loaded and ready")
