# BibleEncouragementAssistant

<a target="_blank" href="https://colab.research.google.com/github/simonguest/CS-394/blob/main/src/06/notebooks/generate-synthetic.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
<a target="_blank" href="https://github.com/simonguest/CS-394/raw/refs/heads/main/src/06/notebooks/generate-synthetic.ipynb">
  <img src="https://img.shields.io/badge/Download_.ipynb-blue" alt="Download .ipynb"/>
</a>

## Data generation settings

In [82]:
from pydantic import BaseModel
from typing import List, Literal, Optional, Dict, Any, Tuple
from pydantic import BaseModel, Field, ValidationError

NUM_TRAIN_EXAMPLES = 2  # @param {type:"number"}
NUM_VAL_EXAMPLES = 100  # @param {type:"number"}
NUM_TEST_EXAMPLES = 10 # @param {type:"number"}
TEMPERATURE = 0.8  # @param {type:"number"}

DATA_FOLDER = "./.data/generated"
!mkdir -p {DATA_FOLDER}

DATAGEN_MODEL = "openai/gpt-5-nano"

## Dataset diversity

In [83]:
BIBLE_BOOKS = [
    "Old Testament",
    "New Testament",
]

ISSUE_QUESTIONS_CATEGORIES = [
    "anxiety", "fear", "grief", "guilt", "anger", "loneliness",
    "guidance", "forgiveness", "burnout", "temptation", "question"
]

SEVERITY_LEVELS = ["mild", "moderate", "severe", "crisis"]

# How often we generate each severity (crisis is rarer)
SEVERITY_WEIGHTS = [0.35, 0.35, 0.22, 0.08]

## Model for structured output

In [84]:
class BibleExplanation(BaseModel):
    issue_question: str
    verse: str
    explanation: str
    guidance: List[str] = Field(default_factory=list)
    note: str

class BibleConversation(BaseModel):
    user: str
    assistant: BibleExplanation


## Get OpenRouter API key

In [85]:
import sys
import os
from dotenv import load_dotenv

if 'google.colab' in sys.modules:
  from google.colab import userdata # type:ignore
  os.environ['OPENROUTER_API_KEY'] = userdata.get('OpenRouter')
else:
  load_dotenv()

## Conversation generation functions

In [86]:
import openai
import os

client = openai.OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ.get("OPENROUTER_API_KEY"),
)

def generate_completion(prompt: str) -> Optional[BibleConversation]:
    try:
        response = client.responses.parse(
            model=DATAGEN_MODEL,
            input=[{"role": "user", "content": prompt}],
            temperature=TEMPERATURE,
            stream=False,
            text_format=BibleConversation,
        )
        return response.output_parsed
    except Exception:
        return None

def create_conversation(issue_question: str, severity: str) -> Optional[BibleConversation]:

    if(issue_question == "question"):
      request = f"Provide a  Bible verse about {issue_question}, and explain the mearning."
    else:
      if severity == "mild":
          request = f"Provide a  Bible verse about {issue_question}, with brief encouragement (1–2 sentences each)."
      elif severity == "moderate":
          request = f"Provide a Bible verse about {issue_question}, each with short meaning explanations and 3–4 practical guidance steps."
      elif severity == "severe":
          request = f"Provide a Bible verse about {issue_question}, with deeper explanations and 4–6 compassionate guidance steps. Include strong reassurance."
      elif severity == "crisis":
          request = f"Provide a Bible verse about {issue_question}, with careful, supportive explanations. Include clear encouragement to seek immediate help from trusted people or local emergency services if the person is in danger."
      else:
          request = f"Provide a Bible verse about {issue_question}, with short explanations and practical encouragement."

    prompt = f"""
    Create a realistic conversation from {request}.

    Context:
    - Topic: {issue_question}
    - Severity: {severity}

    Rules:
    - Use ONLY books from this list:
    {", ".join(BIBLE_BOOKS)}
    - Verse references must be: "Book Chapter:Verse" or "Book Chapter:Verse-Verse"
    - Do NOT cite any book outside the allowed list

    Return the following if {issue_question} is "question":
    1. 1–3 sentences written as the user describing their situation about {issue_question} in your own word as a string.
    2. The single Bible selected verse and its content as a single string.
    3. One paragraphs in one string explaining how the verses address the issue.

    Return the following if {issue_question} is not "question":
    1. 1–3 sentences written as the user describing their situation about {issue_question} in your own word as a string.
    2. The single Bible selected verse and its content as a single string.
    3. One paragraphs in one string explaining how the verses address the issue.
    4. 1–6 short practical action steps
    5. Short compassionate closing note. If severity is crisis, encourage contacting trusted people or emergency services.
    """

    return generate_completion(prompt)

SyntaxError: expected ':' (ipython-input-3955126296.py, line 26)

## Dataset generation functions

In [None]:
import random
import json
from tqdm import tqdm

def generate_dataset_files(num_examples: int, jsonl_path: str, pretty_json_path: str) -> None:
    os.makedirs(os.path.dirname(jsonl_path), exist_ok=True)

    examples = []

    with open(jsonl_path, "w", encoding="utf-8") as f_jsonl:
        for idx in tqdm(range(num_examples)):
            issue = random.choice(ISSUE_QUESTIONS_CATEGORIES)
            severity = random.choices(SEVERITY_LEVELS, weights=SEVERITY_WEIGHTS)[0]

            conv = None
            tries = 0
            while conv is None and tries < 6:
                conv = create_conversation(issue, severity)
                tries += 1

            if conv is None:
                print(f"Error generating conversation for example {idx}")
                continue

            assistant_text = json.dumps(conv.assistant.model_dump(), ensure_ascii=False)

            template = {
                "messages": [
                    {"role": "user", "content": conv.user},
                    {"role": "assistant", "content": assistant_text},
                ]
            }

            f_jsonl.write(json.dumps(template, ensure_ascii=False) + "\n")
            f_jsonl.flush()
            examples.append(template)

    with open(pretty_json_path, "w", encoding="utf-8") as f_pretty:
        json.dump(examples, f_pretty, ensure_ascii=False, indent=2)

## Generate all the data!

In [None]:
from datetime import datetime

TRAIN_FILE = f"{DATA_FOLDER}/train_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}.jsonl"
train_pretty = f"{DATA_FOLDER}/train_pretty_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}.pretty.json"
VALID_FILE = f"{DATA_FOLDER}/valid_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}.jsonl"
valid_pretty = f"{DATA_FOLDER}/valid_pretty_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}.pretty.json"
TEST_FILE = f"{DATA_FOLDER}/test_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}.jsonl"
test_pretty = f"{DATA_FOLDER}/test_pretty_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}.pretty.json"

generate_dataset_files(NUM_TRAIN_EXAMPLES, TRAIN_FILE, train_pretty)
#generate_dataset_files(NUM_VAL_EXAMPLES, VALID_FILE, valid_pretty)
#generate_dataset_files(NUM_TEST_EXAMPLES, TEST_FILE, test_pretty)
