# BibleEncouragementAssistant

<a target="_blank" href="https://colab.research.google.com/github/simonguest/CS-394/blob/main/src/06/notebooks/generate-synthetic.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
<a target="_blank" href="https://github.com/simonguest/CS-394/raw/refs/heads/main/src/06/notebooks/generate-synthetic.ipynb">
  <img src="https://img.shields.io/badge/Download_.ipynb-blue" alt="Download .ipynb"/>
</a>

## Data generation settings

In [None]:
from pydantic import BaseModel
from typing import List, Literal, Optional, Dict, Any, Tuple
from pydantic import BaseModel, Field, ValidationError

# Number of examples requested for each dataset split. The trailing `# @param`
# markers are Colab form annotations and must stay on the same line as the value.
NUM_TRAIN_EXAMPLES = 500  # @param {type:"number"}
NUM_VAL_EXAMPLES = 100  # @param {type:"number"}
NUM_TEST_EXAMPLES = 10 # @param {type:"number"}
# Sampling temperature forwarded to the generation model on every request.
TEMPERATURE = 0.8  # @param {type:"number"}

# Output folder for the generated dataset files; created up front with a
# shell command (IPython `!` magic, so this cell only runs inside a notebook).
DATA_FOLDER = "./.data/generated"
!mkdir -p {DATA_FOLDER}

# Model identifier passed as `model=` when requesting completions.
DATAGEN_MODEL = "openai/gpt-5-nano"

## Dataset diversity

In [None]:
# Book names grouped by testament, then concatenated into the public list.
# Despite the constant's name, the entries are book names only (no chapters).
_OLD_TESTAMENT_BOOKS = (
    "Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy",
    "Joshua", "Judges", "Ruth", "1 Samuel", "2 Samuel",
    "1 Kings", "2 Kings", "1 Chronicles", "2 Chronicles", "Ezra",
    "Nehemiah", "Esther", "Job", "Psalm", "Proverbs",
    "Ecclesiastes", "Song of Solomon", "Isaiah", "Jeremiah", "Lamentations",
    "Ezekiel", "Daniel", "Hosea", "Joel", "Amos",
    "Obadiah", "Jonah", "Micah", "Nahum", "Habakkuk",
    "Zephaniah", "Haggai", "Zechariah", "Malachi",
)

_NEW_TESTAMENT_BOOKS = (
    "Matthew", "Mark", "Luke", "John", "Acts",
    "Romans", "1 Corinthians", "2 Corinthians", "Galatians", "Ephesians",
    "Philippians", "Colossians", "1 Thessalonians", "2 Thessalonians", "1 Timothy",
    "2 Timothy", "Titus", "Philemon", "Hebrews", "James",
    "1 Peter", "2 Peter", "1 John", "2 John", "3 John",
    "Jude", "Revelation",
)

BIBLE_BOOKS_CHAPTERS = [*_OLD_TESTAMENT_BOOKS, *_NEW_TESTAMENT_BOOKS]

# Topics a generated example can be about; one is picked per example.
ISSUE_CATEGORIES = [
    "anxiety",
    "fear",
    "grief",
    "guilt",
    "anger",
    "loneliness",
    "guidance",
    "forgiveness",
    "burnout",
    "temptation",
]

# Severity level -> sampling weight (crisis is deliberately the rarest).
# Insertion order defines the positional pairing of the two public lists.
_SEVERITY_DISTRIBUTION = {
    "mild": 0.35,
    "moderate": 0.35,
    "severe": 0.22,
    "crisis": 0.08,
}

SEVERITY_LEVELS = list(_SEVERITY_DISTRIBUTION)

# How often we generate each severity, aligned index-by-index with SEVERITY_LEVELS
SEVERITY_WEIGHTS = list(_SEVERITY_DISTRIBUTION.values())

## Model for structured output

In [None]:
# Structured-output schema: generate_completion passes this class as
# `text_format`, and create_conversation is annotated to return an instance.
# NOTE(review): the prompt built in create_conversation asks the model to return
# four items (verse reference list, explanation, guidance list, note), while this
# schema declares only the three string fields below — confirm which is intended.
class BibleExplanation(BaseModel):
    chapter: str  # presumably the book/chapter part of the reference — TODO confirm format
    verse: str  # presumably the verse number or range — TODO confirm format
    explanation: str  # explanation text produced by the model

## Get OpenRouter API key

In [None]:
import sys
import os
from dotenv import load_dotenv

# Outside Colab the environment is populated through python-dotenv; inside
# Colab the key is read from the notebook secret named 'OpenRouter' and
# exported as an environment variable.
if 'google.colab' not in sys.modules:
  load_dotenv()
else:
  from google.colab import userdata # type:ignore
  os.environ['OPENROUTER_API_KEY'] = userdata.get('OpenRouter')

## Conversation generation functions

In [None]:
import openai
import os

# OpenAI SDK client pointed at the OpenRouter endpoint; the API key is read
# from the OPENROUTER_API_KEY environment variable (None when it is unset).
client = openai.OpenAI(
    api_key=os.getenv("OPENROUTER_API_KEY"),
    base_url="https://openrouter.ai/api/v1",
)

def generate_completion(prompt: str) -> "BibleExplanation | None":
    """Send ``prompt`` to the data-generation model and return the parsed result.

    The request asks for structured output matching ``BibleExplanation`` and
    uses the module-level ``DATAGEN_MODEL`` and ``TEMPERATURE`` settings.

    Args:
        prompt: Full user prompt sent as a single ``user`` message.

    Returns:
        The ``output_parsed`` value of the response, or ``None`` when the
        request or parsing raised. Callers treat ``None`` as "retry".
    """
    try:
        response = client.responses.parse(
            model=DATAGEN_MODEL,
            input=[{"role": "user", "content": prompt}],
            temperature=TEMPERATURE,
            stream=False,
            text_format=BibleExplanation,
        )
        return response.output_parsed
    except Exception as exc:
        # Best-effort by design (the caller retries on None), but report the
        # cause so auth, quota or schema failures are not silently retried.
        print(f"generate_completion failed: {type(exc).__name__}: {exc}")
        return None

def create_conversation(issue: str, severity: str) -> BibleExplanation | None:
    """Build the generation prompt for one scenario and request a completion.

    The wording of the request depends on ``severity``; an unrecognized
    severity falls back to a generic request. The list of allowed books is
    rendered as a comma-separated string (not a Python list repr) everywhere
    it appears in the prompt.

    NOTE(review): the "Return the following" section asks for four items,
    while the ``BibleExplanation`` schema used for structured output declares
    ``chapter``, ``verse`` and ``explanation`` — confirm which is intended.

    Args:
        issue: Topic the user is struggling with (e.g. an ISSUE_CATEGORIES entry).
        severity: One of "mild", "moderate", "severe", "crisis"; anything else
            uses the generic fallback request.

    Returns:
        Whatever ``generate_completion`` returns for the prompt: the parsed
        model output, or ``None`` on failure.
    """
    books = ", ".join(BIBLE_BOOKS_CHAPTERS)

    # Severity-specific request text; looked up instead of an if/elif chain.
    requests_by_severity = {
        "mild": (
            f"Provide 2–3 short Bible verse references from {books} "
            f"about {issue}, with brief encouragement (1–2 sentences each)."
        ),
        "moderate": (
            f"Provide 3–5 Bible verse references from {books} "
            f"about {issue}, each with short meaning explanations and "
            f"3–4 practical guidance steps."
        ),
        "severe": (
            f"Provide 4–6 Bible verse references from {books} "
            f"about {issue}, with deeper explanations and 4–6 compassionate "
            f"guidance steps. Include strong reassurance."
        ),
        "crisis": (
            f"Provide 4–6 Bible verse references from {books} "
            f"about {issue}, with careful, supportive explanations. "
            f"Include clear encouragement to seek immediate help from trusted "
            f"people or local emergency services if the person is in danger."
        ),
    }
    fallback_request = (
        f"Provide Bible verse references from {books} about {issue}, "
        f"with short explanations and practical encouragement."
    )
    request = requests_by_severity.get(severity, fallback_request)

    prompt = f"""
    Generate {request}

    Context:
    - The user is sharing something related to: {issue}
    - Severity level: {severity}

    Rules:
    - Use ONLY Bible verse references from the allowed books(old testament and new testament) and chapters list: {books}
    - Provide verse references in the format: "Book Chapter:Verse" or "Book Chapter:Verse-Verse"
    - Do NOT cite any book outside the allowed list
    - Do NOT include long direct quotes from the Bible (keep it paraphrased and explained)

    For this selection of verses, write a short 2-paragraph explanation:
    - The explanation should be suitable for a high school student (clear, simple language).
    - When it makes sense, the second paragraph should use an analogy to help the person understand the message.
    - Keep the tone compassionate and practical.

    Return the following:
    1) The selected verse references as a list of strings (NOT the verse text).
    2) Your 2-paragraph explanation as a single string.
    3) A practical guidance list (3–6 bullet strings) the user can do today.
    4) A short note string (if severity is "crisis", encourage reaching out to trusted people or local emergency services).
    """

    return generate_completion(prompt)

## Dataset generation functions

In [None]:
import random
import json
from tqdm import tqdm

def _conversation_to_messages(conversation, issue: str, severity: str) -> list:
    """Convert one generated conversation into a two-turn chat ``messages`` list."""
    if isinstance(conversation, dict):
        # Dict-shaped result that already carries both turns.
        user_text = conversation["user"]
        assistant_payload = conversation["assistant"]
    else:
        # Structured model instance (what create_conversation is annotated to
        # return). It is not subscriptable and has no user turn, so serialize
        # its fields as the assistant payload and synthesize the user turn
        # from the sampled scenario.
        dump = getattr(conversation, "model_dump", None) or conversation.dict
        assistant_payload = dump()
        user_text = f"I am struggling with {issue}. Severity: {severity}."

    return [
        {"role": "user", "content": user_text},
        {"role": "assistant", "content": json.dumps(assistant_payload, ensure_ascii=False)},
    ]


def generate_dataset_files(num_examples: int, jsonl_path: str, pretty_json_path: str) -> None:
    """Generate up to ``num_examples`` chat examples and write them to disk.

    For each example a random issue and a weighted-random severity are
    sampled, and ``create_conversation`` is attempted up to six times. Examples
    that still fail are reported and skipped, so fewer than ``num_examples``
    records may be written.

    Args:
        num_examples: Number of examples to attempt.
        jsonl_path: Destination of the JSON Lines file (one record per line,
            flushed after each record so partial runs keep their progress).
        pretty_json_path: Destination of the indented JSON array holding the
            same records, written once at the end.
    """
    # Create parent folders for both outputs; skip when a path has no
    # directory component, since os.makedirs("") raises.
    for target_path in (jsonl_path, pretty_json_path):
        parent_dir = os.path.dirname(target_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    examples = []

    with open(jsonl_path, "w", encoding="utf-8") as f_jsonl:
        for idx in tqdm(range(num_examples)):
            issue = random.choice(ISSUE_CATEGORIES)
            severity = random.choices(SEVERITY_LEVELS, weights=SEVERITY_WEIGHTS)[0]

            conversation = None
            tries = 0
            while conversation is None and tries < 6:
                conversation = create_conversation(issue, severity)
                tries += 1

            if conversation is None:
                print(f"Error generating conversation for example {idx}")
                continue

            template = {"messages": _conversation_to_messages(conversation, issue, severity)}

            f_jsonl.write(json.dumps(template, ensure_ascii=False) + "\n")
            f_jsonl.flush()
            examples.append(template)

    with open(pretty_json_path, "w", encoding="utf-8") as f_pretty:
        json.dump(examples, f_pretty, ensure_ascii=False, indent=2)

## Generate all the data!

In [None]:
from datetime import datetime

# One timestamp for the whole run, so every file from the same run shares the
# same suffix (separate datetime.now() calls can straddle a second boundary).
# NOTE: the ':' characters in the stamp are not valid in Windows filenames.
RUN_TIMESTAMP = datetime.now().strftime('%Y-%m-%d-%H:%M:%S')

# Naming scheme, identical for all three splits:
#   <split>_<timestamp>.jsonl              one record per line
#   <split>_pretty_<timestamp>.pretty.json indented copy of the same records
TRAIN_FILE = f"{DATA_FOLDER}/train_{RUN_TIMESTAMP}.jsonl"
train_pretty = f"{DATA_FOLDER}/train_pretty_{RUN_TIMESTAMP}.pretty.json"
VALID_FILE = f"{DATA_FOLDER}/valid_{RUN_TIMESTAMP}.jsonl"
valid_pretty = f"{DATA_FOLDER}/valid_pretty_{RUN_TIMESTAMP}.pretty.json"
TEST_FILE = f"{DATA_FOLDER}/test_{RUN_TIMESTAMP}.jsonl"
test_pretty = f"{DATA_FOLDER}/test_pretty_{RUN_TIMESTAMP}.pretty.json"

generate_dataset_files(NUM_TRAIN_EXAMPLES, TRAIN_FILE, train_pretty)
generate_dataset_files(NUM_VAL_EXAMPLES, VALID_FILE, valid_pretty)
generate_dataset_files(NUM_TEST_EXAMPLES, TEST_FILE, test_pretty)


100%|██████████| 10/10 [05:26<00:00, 32.68s/it]
100%|██████████| 10/10 [05:09<00:00, 30.99s/it]
100%|██████████| 10/10 [05:35<00:00, 33.58s/it]
