In [None]:
!pip install -q langchain langchain-community faiss-cpu sentence-transformers beautifulsoup4 requests

import os
import json
import requests
from bs4 import BeautifulSoup
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [None]:
# Colab-only: mount Google Drive under /content/drive so the JSON corpus
# referenced by DATA_DIR (a path below /content/drive/MyDrive) is readable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def flatten_json(data, prefix="", source=""):
    """Flatten nested JSON into one ``Document`` per non-empty leaf value.

    Dict keys are joined with ``.`` and list positions are appended as
    ``[i]``, so the leaf at ``data["a"]["b"][0]`` gets the path ``a.b[0]``.
    Each leaf becomes a document whose text is ``"<path>: <value>"``.

    Args:
        data: Parsed JSON value (dict, list, or scalar).
        prefix: Path accumulated so far; top-level callers leave it empty.
        source: Stored under ``metadata["source"]`` on every document.

    Returns:
        list[Document]: Documents in traversal order. ``None`` leaves and
        leaves whose string form is blank produce no document.
    """
    if isinstance(data, dict):
        docs = []
        for key, value in data.items():
            child_prefix = f"{prefix}.{key}" if prefix else key
            docs.extend(flatten_json(value, child_prefix, source))
        return docs

    if isinstance(data, list):
        docs = []
        for index, item in enumerate(data):
            docs.extend(flatten_json(item, f"{prefix}[{index}]", source))
        return docs

    # JSON null carries no information; without this guard str(None) would be
    # indexed as the literal text "None".
    if data is None:
        return []

    text = str(data).strip()
    if not text:
        return []

    return [
        Document(
            page_content=f"{prefix}: {text}",
            metadata={"source": source},
        )
    ]

DATA_DIR = "/content/drive/MyDrive/new_rag"
documents = []

print("📄 Loading JSON files...")
# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and therefore the order of vectors in the index) the same on every run.
for file in sorted(os.listdir(DATA_DIR)):
    if not file.endswith(".json"):
        continue
    with open(os.path.join(DATA_DIR, file), "r", encoding="utf-8") as f:
        data = json.load(f)
    # The file name is recorded as each document's metadata["source"].
    file_docs = flatten_json(data, source=file)
    documents.extend(file_docs)
    print(f"  - {file}: {len(file_docs)} entries")

print(f"\nTotal JSON documents: {len(documents)}")

In [None]:
# Live EWU pages to scrape, grouped by topic. Each key is a category label
# (it ends up in the scraped document's text and metadata); each value lists
# every page fetched for that category.
DYNAMIC_URLS = {
    "Admission deadlines": [
        "https://ewubd.edu/undergraduate-dates-deadline",
        "https://ewubd.edu/graduate-dates-deadline",
    ],
    "Events": [
        "https://ewubd.edu/events",
    ],
    "Grading": [
        "https://www.ewubd.edu/grades-rules-and-regulations",
    ],
    "Tuition fees": [
        "https://ewubd.edu/undergraduate-tuition-fees",
    ],
}


In [None]:
def scrape_dynamic_docs(urls_dict, timeout=15, min_chars=30):
    """Download each page and wrap its visible text in a ``Document``.

    Scraping is best-effort: a page that fails to download or parse is
    reported and skipped so the remaining URLs are still processed.

    Args:
        urls_dict: Mapping of category label -> list of page URLs.
        timeout: Per-request timeout in seconds passed to ``requests.get``.
        min_chars: Pages whose extracted text is shorter than this are
            treated as near-empty and skipped.

    Returns:
        list[Document]: One document per successfully scraped page, with
        ``metadata`` holding the page URL (``source``) and its ``category``.
    """
    docs = []
    headers = {"User-Agent": "Mozilla/5.0"}
    print("\n🔍 Scraping live EWU website data...")

    for category, urls in urls_dict.items():
        for url in urls:
            try:
                r = requests.get(url, headers=headers, timeout=timeout)
                r.raise_for_status()
                soup = BeautifulSoup(r.text, "html.parser")
                # Drop script/style nodes so only human-visible text remains.
                for tag in soup(["script", "style"]):
                    tag.decompose()
                text = soup.get_text(separator="\n", strip=True)
                if len(text) < min_chars:
                    # Report the skip so a low document count can be traced.
                    print(f"Skipped (only {len(text)} chars of text): {url}")
                    continue
                docs.append(Document(
                    page_content=f"LATEST {category.upper()} INFO from official site ({url}):\n{text}",
                    metadata={"source": url, "category": category}
                ))
                print(f"Scraped: {url}")
            except Exception as e:
                # Deliberately broad: one bad page must not abort the run.
                print(f"Failed to scrape {url}: {e}")

    print(f"\nTotal dynamic documents scraped: {len(docs)}")
    return docs

# Scrape every configured page, then preview the first document as a quick
# sanity check that real content (not an error page) came back.
dynamic_docs = scrape_dynamic_docs(DYNAMIC_URLS)
print("\nSample scraped content (first 200 chars):")
print(
    dynamic_docs[0].page_content[:200]
    if dynamic_docs
    else "No dynamic docs scraped!"
)


In [None]:
# The index is embedded with sentence-transformers/all-mpnet-base-v2, which
# truncates input beyond 384 word pieces. 2000-character chunks regularly
# exceed that, so the tail of each chunk never influenced its vector. Keeping
# chunks around 1000 characters leaves the whole chunk inside the model window.
CHUNK_SIZE = 1000     # characters per chunk
CHUNK_OVERLAP = 150   # characters shared between consecutive chunks

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

chunked_static = splitter.split_documents(documents)
chunked_dynamic = splitter.split_documents(dynamic_docs)
all_docs = chunked_static + chunked_dynamic

print(f"\nTotal chunks (JSON + live): {len(all_docs)}")

In [None]:
import torch

print("\nCreating vectorstore with all-mpnet-base-v2...")

EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
# Use the GPU when the runtime exposes one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs={"device": device},
    # Embeddings are requested normalized from the encoder.
    encode_kwargs={"normalize_embeddings": True},
)

# Embed every chunk and build the in-memory FAISS index.
vectorstore = FAISS.from_documents(all_docs, embeddings)

In [None]:
!git clone https://github.com/infi9itea/Probaho.git

In [None]:
# Persist the index inside the cloned repo, under its rag/ directory.
SAVE_PATH = "/content/Probaho/rag/vectorstore"
vectorstore.save_local(SAVE_PATH)
print(f"\n✅ Vectorstore saved at: {SAVE_PATH}")

In [None]:
# List the directory the vectorstore was saved to, to confirm files were written.
!ls -l "/content/Probaho/rag/vectorstore"

In [None]:
!pip install -q fastapi uvicorn torch transformers bitsandbytes langchain langchain-community faiss-cpu sentence-transformers pyngrok

In [None]:
# Report the GPU (if any) visible to this runtime before starting the service.
!nvidia-smi

In [None]:
import os
from getpass import getpass

# Never store a token literal in the notebook: reuse HF_TOKEN when it is
# already set in the environment, otherwise prompt for it without echoing.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Keep the ngrok credential out of the notebook: take it from the environment
# when available, otherwise prompt for it without echoing.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_token)

# Tunnel port 8000, the port the uvicorn server is started on.
public_url = ngrok.connect(8000)
print("🌍 Public URL:", public_url)

In [None]:
# Switch to the repo's rag/ directory before launching, so uvicorn is started
# from there (presumably where the `service` module lives — confirm).
%cd /content/Probaho/rag
# Start the ASGI app on port 8000, the same port the ngrok tunnel targets.
# The server runs in the foreground, so this cell blocks until interrupted.
!uvicorn service:app --host 0.0.0.0 --port 8000