# BibleEncouragementAssistant

<a target="_blank" href="https://colab.research.google.com/github/simonguest/CS-394/blob/main/src/06/notebooks/generate-synthetic.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
<a target="_blank" href="https://github.com/simonguest/CS-394/raw/refs/heads/main/src/06/notebooks/generate-synthetic.ipynb">
  <img src="https://img.shields.io/badge/Download_.ipynb-blue" alt="Download .ipynb"/>
</a>

## Data generation settings

In [None]:
from pydantic import BaseModel
from typing import List, Literal, Optional, Dict, Any, Tuple
from pydantic import BaseModel, Field, ValidationError

# Number of examples requested for each dataset split (train / validation / test).
NUM_TRAIN_EXAMPLES = 500  # @param {type:"number"}
NUM_VAL_EXAMPLES = 100  # @param {type:"number"}
NUM_TEST_EXAMPLES = 10 # @param {type:"number"}
# Sampling temperature passed to the generation model on every request.
TEMPERATURE = 0.8  # @param {type:"number"}

# Folder that receives the generated dataset files; created up front via a
# notebook shell escape (this line only works inside IPython/Jupyter/Colab).
DATA_FOLDER = "./.data/generated"
!mkdir -p {DATA_FOLDER}

# Model identifier sent with each generation request.
DATAGEN_MODEL = "openai/gpt-5-nano"

## Dataset diversity

In [None]:
# Curated verse bank used to ground generated answers.
# Each entry is a dict with three string keys:
#   "ref"        - the verse reference
#   "theme"      - the issue category the verse is filed under (three verses per theme)
#   "paraphrase" - a short plain-language paraphrase that is shown to the generation model
VERSE_BANK = [
    # Anxiety / worry
    {"ref": "Philippians 4:6-7", "theme": "anxiety", "paraphrase": "Bring your worries to God in prayer with gratitude; God’s peace will guard your heart and mind."},
    {"ref": "Matthew 6:34", "theme": "anxiety", "paraphrase": "Don’t borrow trouble from tomorrow; focus on today’s responsibilities."},
    {"ref": "1 Peter 5:7", "theme": "anxiety", "paraphrase": "Hand your anxieties to God because He cares for you."},

    # Fear
    {"ref": "Isaiah 41:10", "theme": "fear", "paraphrase": "Do not fear; God is with you and will strengthen and uphold you."},
    {"ref": "Joshua 1:9", "theme": "fear", "paraphrase": "Be strong and courageous; God is with you wherever you go."},
    {"ref": "Psalm 23:1-4", "theme": "fear", "paraphrase": "God shepherds you; even in dark valleys you are not alone and can be comforted."},

    # Grief / sadness
    {"ref": "Psalm 34:18", "theme": "grief", "paraphrase": "God is close to the brokenhearted and helps those crushed in spirit."},
    {"ref": "Matthew 5:4", "theme": "grief", "paraphrase": "Those who mourn will be comforted."},
    {"ref": "Revelation 21:4", "theme": "grief", "paraphrase": "God will wipe away tears; suffering and death will not last forever."},

    # Guilt / shame
    {"ref": "1 John 1:9", "theme": "guilt", "paraphrase": "If you confess, God forgives and cleanses you."},
    {"ref": "Romans 8:1", "theme": "guilt", "paraphrase": "In Christ, you are not condemned."},
    {"ref": "Psalm 51:10", "theme": "guilt", "paraphrase": "Ask God to create a clean heart and renew a steady spirit."},

    # Anger / conflict
    {"ref": "James 1:19-20", "theme": "anger", "paraphrase": "Be quick to listen, slow to speak, slow to anger; human anger doesn’t produce God’s righteousness."},
    {"ref": "Proverbs 15:1", "theme": "anger", "paraphrase": "A gentle answer can turn away wrath, but harsh words stir up anger."},
    {"ref": "Ephesians 4:26-27", "theme": "anger", "paraphrase": "It’s possible to be angry without sin; don’t let anger linger and gain control."},

    # Loneliness
    {"ref": "Hebrews 13:5", "theme": "loneliness", "paraphrase": "God will not leave you or abandon you."},
    {"ref": "Psalm 139:7-10", "theme": "loneliness", "paraphrase": "You can’t outrun God’s presence; He is with you everywhere."},
    {"ref": "John 14:27", "theme": "loneliness", "paraphrase": "Jesus gives a peace unlike the world’s; don’t let your heart be troubled or afraid."},

    # Guidance / decisions
    {"ref": "Proverbs 3:5-6", "theme": "guidance", "paraphrase": "Trust God rather than relying only on yourself; He will guide your path."},
    {"ref": "James 1:5", "theme": "guidance", "paraphrase": "If you lack wisdom, ask God; He gives generously."},
    {"ref": "Psalm 119:105", "theme": "guidance", "paraphrase": "God’s word guides your steps like a lamp and light."},

    # Forgiveness
    {"ref": "Ephesians 4:32", "theme": "forgiveness", "paraphrase": "Be kind and forgive as you have been forgiven."},
    {"ref": "Colossians 3:13", "theme": "forgiveness", "paraphrase": "Bear with one another and forgive; do it as the Lord forgave you."},
    {"ref": "Matthew 6:14-15", "theme": "forgiveness", "paraphrase": "Forgiving others matters deeply; it reflects a forgiven heart."},

    # Burnout / weariness
    {"ref": "Matthew 11:28-30", "theme": "burnout", "paraphrase": "Come to Jesus when you are weary; learn from Him and find rest."},
    {"ref": "Isaiah 40:31", "theme": "burnout", "paraphrase": "Those who hope in God renew strength; they can endure and not collapse."},
    {"ref": "2 Corinthians 12:9", "theme": "burnout", "paraphrase": "God’s grace is sufficient; His power is made strong in weakness."},

    # Temptation / self-control
    {"ref": "1 Corinthians 10:13", "theme": "temptation", "paraphrase": "Temptation is common; God provides a way to endure it."},
    {"ref": "Galatians 5:22-23", "theme": "temptation", "paraphrase": "The Spirit produces character like self-control, patience, and peace."},
    {"ref": "Psalm 119:11", "theme": "temptation", "paraphrase": "Treasure God’s word in your heart to resist sin."},
]

# Issue categories an example can be about; each name is also used as a
# "theme" value in VERSE_BANK.
ISSUE_CATEGORIES = [
    "anxiety", "fear", "grief", "guilt", "anger", "loneliness",
    "guidance", "forgiveness", "burnout", "temptation"
]

# Faith-background labels; one is chosen uniformly at random per example.
FAITH_BACKGROUNDS = [
    "new_to_faith", "active_believer", "returning", "skeptical_curious"
]

# Severity labels, listed in the same order as SEVERITY_WEIGHTS below.
SEVERITY_LEVELS = ["mild", "moderate", "severe", "crisis"]

# How often we generate each severity (crisis is rarer)
# Positional weights for SEVERITY_LEVELS; the four values sum to 1.0.
SEVERITY_WEIGHTS = [0.35, 0.35, 0.22, 0.08]

def verses_for_issue(issue: str) -> List[Dict[str, str]]:
    """Return the VERSE_BANK entries filed under ``issue``, in bank order.

    The result is a new list (empty when no entry has that theme); the
    entries themselves are the same dict objects stored in VERSE_BANK.
    """
    matching = []
    for entry in VERSE_BANK:
        if entry["theme"] == issue:
            matching.append(entry)
    return matching

def sample_allowed_verses(issue: str, k_min: int = 1, k_max: int = 3) -> List[Dict[str, str]]:
    """Randomly choose between ``k_min`` and ``k_max`` verses for ``issue``.

    The requested count is drawn with ``random.randint`` and capped at the
    number of verses available for the issue. When the capped count covers
    the whole pool, the pool is returned as-is (bank order); otherwise a
    random sample without replacement is returned.
    """
    candidates = verses_for_issue(issue)
    available = len(candidates)
    requested = random.randint(k_min, k_max)
    count = min(available, requested)
    # count can never exceed the pool size, so this guard fires exactly when
    # every available verse was requested.
    if count >= available:
        return candidates
    return random.sample(candidates, count)

def pick_issue_with_backup() -> str:
    """Pick, uniformly at random, an issue that has at least two bank verses."""
    eligible = []
    for category in ISSUE_CATEGORIES:
        # Only issues backed by two or more verses are candidates.
        if len(verses_for_issue(category)) >= 2:
            eligible.append(category)
    return random.choice(eligible)

## Model for structured output

In [None]:


# A single verse cited in an assistant answer.
class VerseItem(BaseModel):
    ref: str  # verse reference (the generation prompt shows the form "Book X:Y-Z")
    meaning: str  # explanation of the verse (the prompt asks for one sentence)

# Structured assistant reply for one training example.
class BibleHelperAnswer(BaseModel):
    mode: str  # "bible" or "general"
    issue: str  # issue category the answer addresses
    verses: List[VerseItem] = Field(default_factory=list)  # cited verses; empty list when omitted
    guidance: List[str] = Field(default_factory=list)  # practical tips; empty list when omitted
    note: str  # closing supportive sentence

# One generated exchange: the user's message plus the structured assistant
# answer. This is the model passed as text_format when parsing completions.
class BibleConversation(BaseModel):
    user: str  # the user's message describing their problem
    assistant: BibleHelperAnswer  # the structured reply

## Get OpenRouter API key

In [None]:
import sys
import os
from dotenv import load_dotenv

# Make the OpenRouter API key available in the environment.
if 'google.colab' not in sys.modules:
    # Not running in Colab: let python-dotenv populate the environment.
    load_dotenv()
else:
    # Running in Colab: copy the 'OpenRouter' user secret into the environment.
    from google.colab import userdata  # type:ignore
    os.environ['OPENROUTER_API_KEY'] = userdata.get('OpenRouter')

## Conversation generation functions

In [None]:
import openai
import os

# OpenAI SDK client pointed at the OpenRouter endpoint; the key is read from
# the OPENROUTER_API_KEY environment variable (None when it is unset).
client = openai.OpenAI(
    api_key=os.getenv("OPENROUTER_API_KEY"),
    base_url="https://openrouter.ai/api/v1",
)

def generate_completion(prompt: str) -> "BibleConversation | None":
    """Request one structured conversation from the generation model.

    Sends ``prompt`` as a single user message and asks the client to parse
    the reply into a ``BibleConversation``.

    Args:
        prompt: Full instruction text for the generation model.

    Returns:
        The parsed conversation, or ``None`` when the request or the parsing
        failed. Failures are best-effort (callers retry), but the error is
        printed so that systematic problems such as a bad API key or an
        unknown model are visible instead of silently exhausting retries.
    """
    try:
        response = client.responses.parse(
            model=DATAGEN_MODEL,
            input=[{"role": "user", "content": prompt}],
            temperature=TEMPERATURE,
            stream=False,
            text_format=BibleConversation
        )
        return response.output_parsed
    except Exception as exc:
        # Deliberately broad: any failure means "no example this attempt".
        # Report it rather than swallowing it so the cause can be diagnosed.
        print(f"generate_completion failed: {type(exc).__name__}: {exc}")
        return None

def build_prompt(issue: str, faith_bg: str, severity: str, allowed_verses: List[Dict[str, str]]) -> str:
    """Build the instruction prompt for generating one training example.

    Args:
        issue: Issue category the example should be about.
        faith_bg: Faith-background label of the simulated user.
        severity: Severity label of the simulated problem.
        allowed_verses: Verse dicts (each with "ref" and "paraphrase") the
            model may cite; an empty list yields the "(none)" placeholder.

    Returns:
        The prompt text with leading and trailing whitespace stripped.
    """
    if allowed_verses:
        # One "- <ref>: <paraphrase>" bullet per allowed verse.
        bullets = (f'- {verse["ref"]}: {verse["paraphrase"]}' for verse in allowed_verses)
        allowed_text = "\n".join(bullets)
    else:
        allowed_text = "(none)"

    prompt = f"""
    You are generating one training example for a Bible guidance assistant.

    Diversity dimensions:
    - Issue: {issue}
    - Faith background: {faith_bg}
    - Severity: {severity}

    Rules:
    - If there are Allowed Verses (below), set mode="bible" and choose 2-3 verse refs ONLY from the Allowed Verses list.
    - If there are NO Allowed Verses, set mode="general" and do NOT provide verses (verses must be an empty list).
    - Always explain meaning in simple language and give practical guidance.
    - Return ONLY valid JSON matching the schema. No code fences. No extra keys.

    Allowed Verses:
    {allowed_text}

    Schema:
    {{
      "user": "1-3 sentences describing a realistic personal problem related to the issue and severity",
      "assistant": {{
        "mode": "bible OR general",
        "issue": "{issue}",
        "verses": [{{"ref": "Book X:Y-Z", "meaning": "1 sentence meaning"}}],
        "guidance": ["2-5 short actionable tips"],
        "note": "one short supportive sentence"
      }}
    }}
    """
    return prompt.strip()

def create_conversation() -> dict | None:
    """Generate one chat-format training record, or ``None`` on failure.

    Randomly picks an issue, a faith background and a weighted severity,
    samples the verses the model is allowed to cite, asks the model for a
    conversation, and then enforces consistency on the answer.

    Returns:
        ``{"messages": [user_msg, assistant_msg]}`` where the assistant
        content is the structured answer serialized as a JSON string, or
        ``None`` when generation failed or the answer cites fewer than two
        distinct allowed verses.
    """
    issue = pick_issue_with_backup()

    faith_bg = random.choice(FAITH_BACKGROUNDS)
    severity = random.choices(SEVERITY_LEVELS, weights=SEVERITY_WEIGHTS, k=1)[0]

    # The prompt asks for 2-3 verses taken ONLY from the allowed list, so the
    # list must offer at least two; with a single allowed verse the model
    # could only comply by inventing or repeating references.
    allowed = sample_allowed_verses(issue, k_min=2, k_max=3)

    prompt = build_prompt(issue, faith_bg, severity, allowed)
    conv = generate_completion(prompt)
    if conv is None:
        return None

    # Basic enforcement so the dataset is consistent
    if not allowed:
        conv.assistant.mode = "general"
        conv.assistant.verses = []
    else:
        conv.assistant.mode = "bible"

        # Keep only verses that really come from the allowed list, and drop
        # repeated references so a duplicate cannot satisfy the minimum.
        allowed_refs = {verse["ref"] for verse in allowed}
        seen_refs = set()
        kept = []
        for item in conv.assistant.verses:
            if item.ref in allowed_refs and item.ref not in seen_refs:
                seen_refs.add(item.ref)
                kept.append(item)

        # if bible mode, ensure 2-3 verses
        if len(kept) < 2:
            return None
        conv.assistant.verses = kept[:3]

    # Fine-tuning friendly: assistant content as STRING
    assistant_str = json.dumps(conv.assistant.model_dump(), ensure_ascii=False)

    return {
        "messages": [
            {"role": "user", "content": conv.user},
            {"role": "assistant", "content": assistant_str},
        ]
    }

## Dataset generation functions

In [None]:
import random
import json
from tqdm import tqdm

def generate_dataset_files(num_examples: int, jsonl_path: str, pretty_json_path: str, max_tries: int = 6) -> None:
    """Generate examples and write them as JSONL plus an indented JSON copy.

    Each example is attempted up to ``max_tries`` times; an example that
    still fails is reported and skipped, so the files may hold fewer than
    ``num_examples`` records.

    Args:
        num_examples: Number of examples to attempt.
        jsonl_path: Output path for the JSONL file (one record per line).
        pretty_json_path: Output path for the indented JSON array.
        max_tries: Maximum generation attempts per example.
    """
    # Create the parent folder of both outputs. A bare filename has an empty
    # dirname, and os.makedirs("") raises, so that case is skipped.
    for out_path in (jsonl_path, pretty_json_path):
        parent = os.path.dirname(out_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    examples = []

    with open(jsonl_path, "w", encoding="utf-8") as f_jsonl:
        for idx in tqdm(range(num_examples)):
            record = None
            for _ in range(max_tries):
                record = create_conversation()
                if record is not None:
                    break

            if record is None:
                print(f"Failed generating example {idx} after {max_tries} tries")
                continue

            f_jsonl.write(json.dumps(record, ensure_ascii=False) + "\n")
            # Flush per record so partial progress survives an interrupted run.
            f_jsonl.flush()
            examples.append(record)

    with open(pretty_json_path, "w", encoding="utf-8") as f_pretty:
        json.dump(examples, f_pretty, ensure_ascii=False, indent=2)

## Generate all the data!

In [None]:
from datetime import datetime

# Take the timestamp once so every file of this run shares the same suffix;
# calling datetime.now() per path can straddle a second boundary.
RUN_TIMESTAMP = datetime.now().strftime('%Y-%m-%d-%H:%M:%S')

# Naming scheme, identical for all three splits:
#   <split>_<timestamp>.jsonl               - one record per line
#   <split>_pretty_<timestamp>.pretty.json  - indented copy for inspection
TRAIN_FILE = f"{DATA_FOLDER}/train_{RUN_TIMESTAMP}.jsonl"
train_pretty = f"{DATA_FOLDER}/train_pretty_{RUN_TIMESTAMP}.pretty.json"
VALID_FILE = f"{DATA_FOLDER}/valid_{RUN_TIMESTAMP}.jsonl"
valid_pretty = f"{DATA_FOLDER}/valid_pretty_{RUN_TIMESTAMP}.pretty.json"
TEST_FILE = f"{DATA_FOLDER}/test_{RUN_TIMESTAMP}.jsonl"
test_pretty = f"{DATA_FOLDER}/test_pretty_{RUN_TIMESTAMP}.pretty.json"

generate_dataset_files(NUM_TRAIN_EXAMPLES, TRAIN_FILE, train_pretty)
generate_dataset_files(NUM_VAL_EXAMPLES, VALID_FILE, valid_pretty)
generate_dataset_files(NUM_TEST_EXAMPLES, TEST_FILE, test_pretty)


100%|██████████| 10/10 [05:26<00:00, 32.68s/it]
100%|██████████| 10/10 [05:09<00:00, 30.99s/it]
100%|██████████| 10/10 [05:35<00:00, 33.58s/it]
