In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os

# Root of your project in Drive
PROJECT_ROOT = "/content/drive/MyDrive/firstaid_rag_project"

DATA_DIR = os.path.join(PROJECT_ROOT, "data")
ADDITIONAL_DOCS_DIR = os.path.join(DATA_DIR, "additional_documents")
VECTOR_STORE_DIR = os.path.join(PROJECT_ROOT, "vector_store")

CSV_PATH = os.path.join(DATA_DIR, "etl_first_aid_microguide - Sheet1.csv")
PDF_PATH = os.path.join(ADDITIONAL_DOCS_DIR, "First Aid Quick Guide.pdf")
TXT_PATH = os.path.join(ADDITIONAL_DOCS_DIR, "First_Aid_FAQ_and_Decision_Tips.txt")

os.makedirs(VECTOR_STORE_DIR, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("CSV_PATH:", CSV_PATH)
print("PDF_PATH:", PDF_PATH)
print("TXT_PATH:", TXT_PATH)
print("VECTOR_STORE_DIR:", VECTOR_STORE_DIR)


Mounted at /content/drive
PROJECT_ROOT: /content/drive/MyDrive/firstaid_rag_project
CSV_PATH: /content/drive/MyDrive/firstaid_rag_project/data/etl_first_aid_microguide - Sheet1.csv
PDF_PATH: /content/drive/MyDrive/firstaid_rag_project/data/additional_documents/First Aid Quick Guide.pdf
TXT_PATH: /content/drive/MyDrive/firstaid_rag_project/data/additional_documents/First_Aid_FAQ_and_Decision_Tips.txt
VECTOR_STORE_DIR: /content/drive/MyDrive/firstaid_rag_project/vector_store


In [2]:
import pandas as pd
from textwrap import dedent

def load_csv_docs(csv_path: str):
    df = pd.read_csv(csv_path)

    print("CSV shape:", df.shape)
    print("CSV columns:", list(df.columns))

    docs = []
    for _, row in df.iterrows():
        # Use the actual column names from your screenshot
        condition_id   = row["condition_id"]
        name           = row["condition_name"]
        category       = row["category"]
        severity       = row["severity_tag"]
        signs          = row["common_signs"]
        steps          = row["immediate_steps"]
        donts          = row["do_not_do"]
        when_help      = row["when_to_seek_help"]
        follow_up      = row["follow_up_advice"]
        source         = row["source"]

        text = dedent(f"""
        [CONDITION_ID: {condition_id}]
        Condition Name: {name}
        Category: {category}
        Severity: {severity}

        Common Signs:
        {signs}

        Immediate First-Aid Steps:
        {steps}

        What NOT To Do:
        {donts}

        When To Seek Professional Help:
        {when_help}

        Follow-Up Advice:
        {follow_up}

        Source:
        {source}
        """).strip()

        docs.append({
            "id": f"CSV::{condition_id}",
            "text": text,
            "metadata": {
                "source_type": "csv",
                "condition_id": condition_id,
                "name": name,
                "category": category,
                "severity": severity,
            }
        })
    return docs

csv_docs = load_csv_docs(CSV_PATH)
len(csv_docs), csv_docs[0]["id"]


CSV shape: (15, 10)
CSV columns: ['condition_id', 'condition_name', 'category', 'severity_tag', 'common_signs', 'immediate_steps', 'do_not_do', 'when_to_seek_help', 'follow_up_advice', 'source']


(15, 'CSV::CUT_MINOR')

In [3]:
!pip install -q pypdf


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/328.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m276.5/328.2 kB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.2/328.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
from pypdf import PdfReader
import os

def load_pdf_docs(pdf_path: str):
    reader = PdfReader(pdf_path)
    docs = []
    for page_num, page in enumerate(reader.pages):
        text = page.extract_text()
        if not text:
            continue
        text = text.strip()
        if not text:
            continue
        docs.append({
            "id": f"PDF::page_{page_num+1}",
            "text": text,
            "metadata": {
                "source_type": "pdf",
                "page": page_num + 1,
                "filename": os.path.basename(pdf_path),
            }
        })
    return docs

def load_txt_docs(txt_path: str, chunk_size: int = 800, overlap: int = 200):
    # Read the full txt file
    with open(txt_path, "r", encoding="utf-8") as f:
        full_text = f.read().strip()

    chunks = []
    start = 0
    while start < len(full_text):
        end = start + chunk_size
        chunk = full_text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # move forward with overlap
        start = end - overlap
        if start <= 0:
            break

    docs = []
    for i, chunk in enumerate(chunks):
        docs.append({
            "id": f"TXT::chunk_{i+1}",
            "text": chunk,
            "metadata": {
                "source_type": "txt",
                "chunk": i + 1,
                "filename": os.path.basename(txt_path),
            }
        })
    return docs

# Actually load your files (paths already set in Step 1)
pdf_docs = load_pdf_docs(PDF_PATH)
txt_docs = load_txt_docs(TXT_PATH)

print("PDF docs:", len(pdf_docs))
print("TXT docs:", len(txt_docs))

# peek at the first PDF page's text
pdf_docs[0]["id"], pdf_docs[0]["text"][:300]


PDF docs: 12
TXT docs: 16


('PDF::page_1',
 'How  to  Use  This  Guide  \n●  Each  section  focuses  on  one  common  problem  (cut,  burn,  sprain,  etc.).  ●  For  every  problem,  you’ll  see:  ○  Common  signs  –  what  it  tends  to  look  like  ○  What  to  do  now  –  basic  first-aid  steps  ○  What  not  to  do  –  things  that  might ')

In [5]:
!pip install -q sentence-transformers faiss-cpu


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
import numpy as np

# Combine everything we loaded:
all_docs = csv_docs + pdf_docs + txt_docs

len(all_docs)


43

In [7]:
from sentence_transformers import SentenceTransformer

# Load the embedding model
model = SentenceTransformer("BAAI/bge-small-en-v1.5")

texts = [d["text"] for d in all_docs]

# This might take a bit, but should finish in a couple of minutes
embeddings = model.encode(
    texts,
    convert_to_numpy=True,
    show_progress_bar=True
)

embeddings.shape


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

(43, 384)

In [8]:
import faiss

# Normalize for cosine similarity via inner product
faiss.normalize_L2(embeddings)

dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)   # IP = inner product
index.add(embeddings)

print("Index dimension:", dim)
print("Number of vectors in index:", index.ntotal)


Index dimension: 384
Number of vectors in index: 43


In [9]:
from google.colab import drive
drive.mount("/content/drive")  # if it asks for permission, approve

import os

# This should match your project folder in Google Drive:
ROOT_DIR = "/content/drive/MyDrive/firstaid_rag_project"

print("ROOT_DIR contents:", os.listdir(ROOT_DIR))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
ROOT_DIR contents: ['data', 'notebooks', 'src', 'vector_store', 'api', 'README.md']


In [10]:
import os, pickle

VECTOR_DIR = os.path.join(ROOT_DIR, "vector_store")
os.makedirs(VECTOR_DIR, exist_ok=True)

INDEX_PATH = os.path.join(VECTOR_DIR, "first_aid_index.faiss")
DOCS_PATH  = os.path.join(VECTOR_DIR, "first_aid_docs.pkl")

faiss.write_index(index, INDEX_PATH)

with open(DOCS_PATH, "wb") as f:
    pickle.dump(all_docs, f)

print("Saved index to:", INDEX_PATH)
print("Saved docs  to:", DOCS_PATH)
print("Total docs:", len(all_docs))


Saved index to: /content/drive/MyDrive/firstaid_rag_project/vector_store/first_aid_index.faiss
Saved docs  to: /content/drive/MyDrive/firstaid_rag_project/vector_store/first_aid_docs.pkl
Total docs: 43


In [11]:
def answer_with_context(question, k=3):
    # 1) retrieve
    hits = retrieve(question, k=k)   # reuse your existing function
    context_blocks = []
    for i, (doc, score, doc_id) in enumerate(hits, start=1):
        context_blocks.append(f"[SOURCE {i} | id={doc_id} | score={score:.3f}]\n{doc}\n")

    context_text = "\n\n".join(context_blocks)

    # 2) build a prompt for an LLM
    system_prompt = (
        "You are a cautious first-aid assistant. "
        "Answer ONLY from the context between <CONTEXT>...</CONTEXT>. "
        "If something is not in the context, say that you don't know. "
        "Always remind the user that this is not a substitute for emergency care."
    )

    user_prompt = f"""
{system_prompt}

<CONTEXT>
{context_text}
</CONTEXT>

Question: {question}

Write a short, clear answer using bullet points. At the end, list which SOURCE numbers you used.
"""

    return user_prompt  # if you call an external LLM, send this as the prompt


In [13]:
readme_path = os.path.join(root, "README.md")

content = """
4. Go to **Google Drive → firstaid_rag_project** and you should see a file called `README.md`.
5. You can now **edit it** either:
- by downloading and opening in any text editor, OR
- or re-writing it from Colab (we’ll do that in a sec by giving you the final text).

---

### Option 2 – Locally on your laptop

If you ever download the project folder to your computer:

1. Open any text editor (VS Code, Notepad, etc.).
2. **Paste the README content** I give you below.
3. Save the file as: `README.md` inside `firstaid_rag_project/`.

Either approach is fine. The important part:
👉 File name: `README.md`
👉 Location: project root (`firstaid_rag_project`)

---

## C. What to put in the README (ready to paste)

Below is a **complete README** that matches your current structure:

- `requirements.txt` is in `api/`
- `app.py` is in `api/`
- vector store is in `vector_store/`

### ✅ Final README text (copy this into `README.md`)

```markdown
# First Aid Micro-Guide RAG System

This project is a small **Retrieval-Augmented Generation (RAG)** system for basic first aid.

Given a natural-language question like:

> “What should I do for a minor cut?”

the system:

1. Embeds the question using `BAAI/bge-small-en-v1.5`
2. Uses a **FAISS** vector index to retrieve the most relevant chunks from:
- a structured first-aid CSV,
- a quick-guide PDF, and
- an FAQ text file
3. Returns the top-k matching chunks via a simple Flask API

> ⚠️ **Important:** This system is for educational purposes only.
> It is **not** medical advice and is **not** a substitute for professional care or emergency services.

---

## 1. Project Layout

Expected directory structure:

```text
firstaid_rag_project/
├── api/
│   ├── app.py
│   └── requirements.txt
├── data/
│   ├── etl_first_aid_microguide - Sheet1.csv
│   ├── First Aid Quick Guide.pdf
│   └── First_Aid_FAQ_and_Decision_Tips.txt
├── src/
│   └── build_first_aid_index.ipynb
├── notebooks/
│   └── ingestion_test.ipynb
└── vector_store/
 ├── first_aid_index.faiss
 └── first_aid_docs.pkl
"""


NameError: name 'root' is not defined