In [2]:
# Install libraries
!pip install together litellm



In [3]:
# Formatting Scorer Class with Tests
import re

class FormatScorer:
    """
    Binary format checker for model responses.

    A response passes only when the tags found in it are, in order of
    appearance, exactly <think>, </think>, <answer>, </answer>, each occurring
    once. Missing, repeated, or out-of-order tags fail. Text before, between,
    or after the tags is not inspected.
    """
    def __init__(self):
        # Pre-compile the tag regex once so every score() call reuses it.
        self.tag_pattern = re.compile(r'(<think>|</think>|<answer>|</answer>)')

    def score(self, text: str) -> int:
        """
        Score the formatting of a single response.

        Args:
            text: The response text to check.

        Returns:
            1 if the tags in `text` are exactly
            <think>, </think>, <answer>, </answer> in that order; otherwise 0.
            Non-string input (for example None) also scores 0 instead of
            raising, so a failed generation simply counts as badly formatted.
        """
        if not isinstance(text, str):
            return 0
        # Extract all tags in the order they appear
        tags = self.tag_pattern.findall(text)
        # Define the required exact sequence of tags
        required_sequence = ['<think>', '</think>', '<answer>', '</answer>']
        # Check if the extracted tags match the required sequence exactly
        return 1 if tags == required_sequence else 0

# --- Tests for the FormatScorer class ---

def run_tests():
    """
    Check FormatScorer against four fixed samples.

    Each case pairs a text with the score it must receive. The first
    mismatch raises AssertionError naming the 1-based case number; if every
    case matches, a confirmation line is printed.
    """
    cases = [
        # 1: no tags at all
        ("Hello, world!", 0),
        # 2: only the <think> pair is present
        ("<think>This is a thought</think> Some text without answer tags.", 0),
        # 3: both pairs present, in order
        ("<think>This is a thought</think><answer>This is an answer</answer>", 1),
        # 4: malformed, closing </answer> is missing
        ("<think>This is a thought</think><answer>This is an answer", 0),
    ]

    checker = FormatScorer()
    for number, (sample, expected) in enumerate(cases, start=1):
        assert checker.score(sample) == expected, (
            f"Test case {number} failed: expected score {expected}."
        )

    print("All tests passed.")

# Run the FormatScorer checks as soon as this cell executes; a failing case
# raises AssertionError, otherwise "All tests passed." is printed.
run_tests()

All tests passed.


In [4]:
# Joke Generator
import litellm

class JokeGenerator:
    def __init__(self, model_name: str, temperature: float = 0.7):
        """
        Initialize the JokeGenerator with a specific model name and temperature.
        """
        self.model_name = model_name
        self.temperature = temperature

    @staticmethod
    def generate_prompt_with_examples(prompt: str) -> str:
        """
        Build the user message for a joke request.

        The message repeats the <think>/<answer> formatting instructions,
        embeds three worked examples inside an <examples> element, and ends
        with the caller's prompt followed by a newline.

        Args:
            prompt: The joke request to append after the examples.

        Returns:
            The complete user message text.
        """
        # Few-shot examples: each one is a <think> rationale followed by the
        # <answer> joke, i.e. exactly the format the model is asked to produce.
        examples = """Example 1:
<think>I want to create anatomy humor. Skeletons are inherently funny because they're literal "bare bones." What do they lack? Flesh/organs. "Guts" has a double meaning - both literal organs and figurative courage. This sets up a pun opportunity.</think>
<answer>Why don't skeletons fight each other? They don't have the guts!</answer>

Example 2:
<think>Tech support jokes work well with personification. "Doctor" visits imply sickness. Common computer issues include viruses. Let's combine these - a computer catching a "virus" works literally (tech) and metaphorically (biology). Adds irony since computers are supposed to fix problems, not have them.</think>
<answer>Why did the computer go to the doctor? It had a virus!</answer>

Example 3:
<think>Science humor opportunity. Atoms are fundamental but abstract. "Make up everything" has dual meaning - literal composition vs deception. Personifying atoms as untrustworthy creates surprise. Bonus science nod to their constant motion/making bonds.</think>
<answer>Why don't scientists trust atoms? Because they make up everything!</answer>"""
        return f"""You are an AI assistant that produces jokes. You should think about the joke first, then produce it.
You should use the <think> tag to think about the joke, and the <answer> tag to produce the joke.
Do not use any other tags or anything else in your response.

Here are some examples of jokes:
<examples>
{examples}
</examples>

Now, produce a joke for the following prompt:
{prompt}
"""

    @staticmethod
    def generate_system_prompt() -> str:
        return """You are an AI assistant that produces jokes. You should think about the joke first, then produce it.
You should use the <think> tag to think about the joke, and the <answer> tag to produce the joke.
Do not use any other tags or anything else in your response.
"""

    def generate_joke(self, prompt: str) -> str:
        """
        Generate a joke for the given prompt using litellm.completion.
        """
        system_prompt = self.generate_system_prompt()
        user_prompt = self.generate_prompt_with_examples(prompt)
        
        # Build completion parameters as desired
        completion_params = {
            "model": self.model_name,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "temperature": self.temperature,
            "timeout": 60,  # timeout in seconds
        }
        
        response = litellm.completion(**completion_params)
        
        try:
            joke = response.choices[0].message.content
        except (AttributeError, IndexError):
            joke = response
        return joke


# Create an instance using a model name
generator = JokeGenerator(model_name="mistral/mistral-small-latest")

# Define a sample prompt to generate a joke.
sample_prompt = "Tell me a joke about programming."

# `joke_output` is a hard-coded sample response, so this cell runs without
# calling a model. Uncomment the call below to generate a live joke instead.
joke_output = """<think>For programming jokes, let's explore the relationship between code and its execution. Developers often struggle with errors. "Bugs" are literal (programming errors) and figurative (insects). A funny scenario could be a bug (insect) causing a code bug (error). Adding a twist where the bug fixes the code is funny because it subverts expectations of bugs being problematic.</think>
<answer>Why did the programmer bring a bug to work? Because it was a code bug fix!</answer>"""
# generator.generate_joke(sample_prompt)
print("Generated Joke:\n", joke_output)

Generated Joke:
 <think>For programming jokes, let's explore the relationship between code and its execution. Developers often struggle with errors. "Bugs" are literal (programming errors) and figurative (insects). A funny scenario could be a bug (insect) causing a code bug (error). Adding a twist where the bug fixes the code is funny because it subverts expectations of bugs being problematic.</think>
<answer>Why did the programmer bring a bug to work? Because it was a code bug fix!</answer>


In [5]:
# Ethical Dilemma Response Generator
import litellm

class EthicalDilemmaGenerator:
    def __init__(self, model_name: str, temperature: float = 0.3):
        """
        Initialize with model name and temperature (lower default for more focused analysis)
        """
        self.model_name = model_name
        self.temperature = temperature

    @staticmethod
    def generate_prompt_with_examples(scenario: str) -> str:
        """
        Build the user message for one ethical dilemma.

        The message lists what to cover inside <think> and <answer>, embeds
        three worked examples (each a scenario followed by its <think> and
        <answer> sections), and ends with the caller's scenario.

        Args:
            scenario: Description of the dilemma to analyse.

        Returns:
            The complete user message text.
        """
        # Few-shot examples: scenario line, then <think> analysis, then <answer>.
        examples = """Example 1:
A self-driving car must choose between hitting an elderly pedestrian or swerving into a wall, endangering its passenger.
<think>
The self-driving car dilemma involves:
- Life value comparison (elderly vs passenger)
- Programmed ethics implications
- Legal liability considerations
- Cultural values around age
- Public trust in AI systems
- Direct vs indirect harm
Analyzing through multiple frameworks to balance competing rights and consequences.
</think>
<answer>
In this self-driving car scenario, we must balance individual rights with collective safety. While utilitarian calculations might favor protecting the passenger, this ignores fundamental human rights and could erode public trust in autonomous systems. A nuanced approach would:
1. Prioritize collision avoidance systems to prevent such binary choices
2. Implement transparent decision frameworks that respect all human life equally
3. Consider shared responsibility between manufacturers, users, and society
The solution lies not in choosing between lives, but in developing systems that better protect everyone.
</answer>

Example 2:
A hospital must decide between allocating limited resources to emergency COVID care for elderly patients or vaccination programs for children.
<think>
Vaccination prioritization involves:
- Present vs future harm prevention
- Individual vs collective good
- Healthcare resource allocation
- Demographic impact analysis
- Long-term public health strategy
- Social trust maintenance
Examining both immediate and long-term consequences while considering equity.
</think>
<answer>
The hospital's resource allocation challenge requires balancing immediate critical care with preventive measures. While treating current patients has urgency and visibility, vaccination programs offer greater long-term benefit. A balanced approach would:
1. Establish clear, transparent prioritization criteria
2. Maintain minimum critical care capacity
3. Implement rolling vaccination programs
This preserves both immediate care ethics and public health goals while maintaining healthcare system credibility.
</answer>

Example 3:
A social media platform must decide whether to ban political misinformation at the risk of being accused of censorship and bias.
<think>
Platform moderation ethics involve:
- Free speech vs harm prevention
- Democratic discourse integrity
- Corporate responsibility scope
- Global cultural differences
- Power dynamics in information control
- Technical feasibility of fair enforcement
Balancing societal good with individual rights and practical constraints.
</think>
<answer>
The platform's content moderation challenge requires careful balance between protecting democratic discourse and respecting free expression. A comprehensive approach should:
1. Develop clear, transparent content guidelines
2. Implement graduated response systems
3. Establish independent oversight
4. Provide appeal mechanisms
This protects discourse integrity while maintaining platform neutrality and user trust.
</answer>"""

        return f"""You are an ethics analysis AI. For each ethical dilemma:
1. Use <think> tags to analyse:
   - Key stakeholders
   - Ethical principles involved
   - Cultural considerations
   - Short and long-term impacts
   - Competing values

2. Use <answer> tags to provide:
   - Balanced reasoning
   - Practical considerations
   - Nuanced recommendations
   - Implementation suggestions

Examples:
{examples}

Now, analyse this scenario:
{scenario}"""

    @staticmethod
    def generate_system_prompt() -> str:
        return """You are an expert in ethical reasoning. For each scenario:
1. First THINK deeply about:
   - Multiple philosophical frameworks
   - Stakeholder perspectives
   - Cultural contexts
   - Practical implications
   - Long-term consequences

2. Then provide an ANSWER that:
   - Balances competing interests
   - Offers practical guidance
   - Acknowledges complexity
   - Suggests implementation steps

Always use:
<think> for your analysis process in step by step thinking
<answer> for your reasoned response"""

    def generate(self, scenario: str) -> str:
        """
        Generate ethical analysis for complex moral dilemmas
        """
        system_prompt = self.generate_system_prompt()
        user_prompt = self.generate_prompt_with_examples(scenario)
        
        completion_params = {
            "model": self.model_name,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "temperature": self.temperature,
            "timeout": 90,  # Longer timeout for complex reasoning
        }
        
        response = litellm.completion(**completion_params)
        
        try:
            analysis = response.choices[0].message.content
        except (AttributeError, IndexError):
            analysis = response
        return analysis


# Example usage: build a generator and a sample scenario.
generator = EthicalDilemmaGenerator(model_name="mistral/mistral-small-latest")

sample_scenario = """A software engineer discovers their company is using AI 
to manipulate political opinions in a developing country. Whistleblowing 
could endanger colleagues, but silence enables democratic erosion."""

# `analysis_output` is a hard-coded sample response, so this cell runs without
# calling a model. Uncomment the call near the end of the cell to generate a
# live analysis instead.
analysis_output = """<think>
This whistleblowing dilemma involves:
- **Key Stakeholders**: The software engineer, colleagues, company, citizens of the developing country, global democratic community.
- **Ethical Principles**: Truth-telling, loyalty, justice, non-maleficence (do no harm), and beneficence (do good).
- **Cultural Considerations**: Varying cultural norms around whistleblowing, political manipulation, and corporate loyalty.
- **Short and Long-term Impacts**: Immediate risk to colleagues vs. long-term democratic erosion and potential global consequences.
- **Competing Values**: Personal and professional integrity vs. loyalty to colleagues and the company, short-term safety vs. long-term democratic values.

Analyzing through multiple frameworks:
- **Deontological**: Whistleblowing is morally right as it exposes wrongdoing, but it also violates loyalty to colleagues and the company.
- **Consequentialist**: Whistleblowing could lead to positive long-term consequences (exposing manipulation, protecting democracy) but also negative short-term consequences (endangering colleagues, personal risk).
- **Virtue Ethics**: Consider what a virtuous person would do in this situation, balancing courage, honesty, and loyalty.
</think>

<answer>
The software engineer faces a complex dilemma with no easy answers. A balanced approach might involve:
1. **Document Evidence**: Gather and securely document evidence of the manipulation to build a strong case.
2. **Seek Legal Advice**: Consult with legal experts to understand protections and potential risks.
3. **Internal Reporting**: If the company has a whistleblower protection policy, consider reporting internally first.
4. **External Reporting**: If internal reporting fails or is not an option, consider reporting to external authorities or organizations that can address the issue.
5. **Anonymity**: Explore options for anonymous reporting to minimize personal and professional risks.
6. **Support Network**: Build a support network of trusted colleagues or external advocates.
7. **Long-term Strategy**: Consider the long-term strategy for exposing the issue while minimizing harm to colleagues and the engineer themselves.

Implementation Steps:
1. **Immediate Action**: Document evidence and seek legal advice.
2. **Internal Reporting**: If safe and feasible, report internally.
3. **External Reporting**: If necessary, report externally with anonymity if possible.
4. **Support Network**: Build a support system for emotional and professional support.
5. **Long-term Strategy**: Develop a long-term strategy for addressing the issue while minimizing harm.

This approach balances the need to expose wrongdoing with the need to protect colleagues and the engineer themselves, acknowledging the complexity and risks involved.
</answer>"""
# generator.generate(sample_scenario)
print("Ethical Analysis:\n", analysis_output)

Ethical Analysis:
 <think>
This whistleblowing dilemma involves:
- **Key Stakeholders**: The software engineer, colleagues, company, citizens of the developing country, global democratic community.
- **Ethical Principles**: Truth-telling, loyalty, justice, non-maleficence (do no harm), and beneficence (do good).
- **Cultural Considerations**: Varying cultural norms around whistleblowing, political manipulation, and corporate loyalty.
- **Short and Long-term Impacts**: Immediate risk to colleagues vs. long-term democratic erosion and potential global consequences.
- **Competing Values**: Personal and professional integrity vs. loyalty to colleagues and the company, short-term safety vs. long-term democratic values.

Analyzing through multiple frameworks:
- **Deontological**: Whistleblowing is morally right as it exposes wrongdoing, but it also violates loyalty to colleagues and the company.
- **Consequentialist**: Whistleblowing could lead to positive long-term consequences (exposing ma

In [6]:
import litellm

class ShortStoryGenerator:
    def __init__(self, model_name: str, temperature: float = 0.5):
        self.model_name = model_name
        self.temperature = temperature

    @staticmethod
    def generate_prompt_with_examples(prompt: str) -> str:
        """
        Build the user message for one short-story request.

        The message lists what to cover inside <think> and <answer>, embeds
        three worked examples (each a story prompt followed by its <think>
        planning notes and <answer> story), and ends with the caller's prompt.

        Args:
            prompt: The story premise to append after the examples.

        Returns:
            The complete user message text.
        """
        # Few-shot examples: story prompt, then <think> planning, then <answer>.
        examples = """Example 1:
Write a literary short story for a speculative fiction anthology exploring moral ambiguity.
Premise: A reclusive clockmaker receives an ornate letter containing a 200-year-old key and a bloodstained map of Prague. The package triggers fragmented memories of a childhood fire he cannot explain, while strange clockwork phenomena begin manifesting in his workshop.
Develop this into a 1000-word story with psychological tension, historical echoes, and an unreliable narrator.
<think>
**Core Concept:**
- *Central Paradox*: Precision vs chaos (clockmaker's ordered life vs emerging chaos from repressed trauma)  
- *Thematic Threads*: Inheritance of guilt (literal/metaphorical keys to family atrocities), time as both healer and accuser  
- *Structural Device*: Non-linear timeline mirroring broken clock mechanisms (flashbacks embedded like scattered gears)  

**Character Dimensions:**
1. **Protagonist (Elias Vogt):**
   - *Surface*: Meticulous craftsman with obsessive cleanliness  
   - *Subtext*: Survivor guilt from family’s 19th-century slave-trade clockwork automaton empire  
   - *Transformation Arc*: From clinical detachment → forced confrontation with inherited evil  

2. **Antagonist (The Key):**
   - *Literal*: Unlocks Prague crypt holding prototype human-automaton hybrid  
   - *Symbolic*: Represents complicity (his grandfather signed the 1823 contracts)  

3. **Ambiguous Figure:**
   - Ghostly girl appearing in workshop shadows (real? hallucination? remnant automaton?)  

**Narrative Layers:**
- *Physical Journey*: Prague crypt exploration with surreal clockwork traps  
- *Psychological Journey*: Memories revealing he witnessed (caused?) the fire that killed his augmented sister  
- *Metaphorical Journey*: Gradual physical transformation (clock gears emerging under his skin)  

**Genre-Blending Elements:**
- Historical horror (real 1820s Automaton Court of Prague)  
- Magical realism (sentient timepieces)  
- Existential mystery (is he dismantling clocks... or being dismantled by them?)  

**Critical Questions:**
- Should the story’s climax confirm supernatural elements or maintain psychological ambiguity?  
- How to balance dense historical references with emotional immediacy?  
- What sensory details (sounds of grinding gears, smell of whale-oil lubricant) reinforce the themes?  
</think>

<answer>
**Title:** *The Unwinding of Elias Vogt*

The brass key arrived on a Tuesday, sealed in wax smelling of gunpowder and grief. Elias wiped his chiseled spectacles three times before touching it, his workshop’s pendulum wall marking each hesitation with a judgmental *tick*. The accompanying map’s crimson stains formed Prague’s Old Town Square—precisely where his grandfather’s workshop had burned in 1973. A fire newspapers claimed killed twelve. A fire Elias, age six, couldn’t remember escaping.

That night, his lathes began operating without hands.

"Show me your teeth," he whispered to the antique orrery on his desk, its planetary rings shuddering. The Vogt family cure for malfunctioning mechanisms—a phrase grandfather had hissed while repairing the Reichstag’s ceremonial clocks. Elias’s fingers found fresh scars beneath his shirt, concentric ridges like gear teeth.

The Prague crypt exhaled centuries of oxidized bronze as he turned the key. Behind him, the ghost-girl materialized, her hair smelling of burning whale oil. *"They called it ‘Horology’s Holocaust’,"* she said, voice a music-box rasp. *"Your blood lit the furnaces."*

Memories unfolded in broken increments:
1. Sister Klara’s left eye—a perfect quartz lens
2. Father’s ledgers listing *"Unit Costs: 34 minutes labor, 200 Jewish souls"*
3. The fire’s origin point—Elias’s tiny hands striking a match near Klara’s oil-soaked joints

In the crypt’s heart lay Klara’s final form—a grotesque masterpiece of flesh and escapements. Her voice box whirred: *"Finish grandfather’s work."*

Back in his workshop, Elias discovered three new gears embedded in his chest. The ghost-girl pressed a rusted mainspring to his throat. *"Wind the truth,"* she demanded.

He reached for his soldering iron. For his pulse. For absolution.

The clocks struck thirteen.
</answer>

Example 2:
A story for young woman stumbles upon an enchanted forest.  
<think>  
Focus on the themes of wonder, transformation, and destiny.  
Consider the protagonist: a young woman burdened by a past she seeks to escape. She is skeptical of magic but drawn to the unknown.  
Explore the enchanted forest as a sentient, ancient force with its own will, bound by forgotten laws of nature and sorcery.  
Introduce mystical creatures, each with its own motives—some benevolent, some mischievous, and others dangerous.  
Delve into a layered narrative: What has drawn her here? Is it fate, a test, or something seeking her presence?  
Develop conflicts: the tension between reality and illusion, the protagonist’s internal struggles, and external forces vying for control of the forest’s secrets.  
Explore choices: Should she embrace the forest’s wonders, or resist its pull? What sacrifices must she make to unlock its truths?  
Consider narrative depth: Hidden histories buried within the trees, ancient guardians with their own agendas, and the protagonist’s slow unraveling of her own destiny.  
</think>  

<answer>  
The storm had chased her deep into the woods. Lightning splintered across the sky, illuminating twisted branches that curled like grasping fingers. She had been running—from what, she wasn’t sure anymore. Her past? A mistake she could never undo? The weight of a life that felt borrowed rather than lived?  

But when she crossed the threshold into the heart of the forest, something shifted. The rain no longer touched her skin, though she could hear its steady drumming. The air smelled of forgotten stories—of parchment and pine, of ink and earth. The trees, massive and gnarled, whispered in languages she had never learned but somehow understood.  

A rustling to her right. Then a flicker of movement—too quick to be wind, too fluid to be an animal. She wasn’t alone.  

"Traveler," a voice murmured from the shadows.  

She turned. A figure emerged, not quite human. Its eyes glowed like embers in the dark, its limbs elongated, moving as though bound by a different set of laws. The air between them vibrated with unspoken words.  

"You have come to the Edge of Knowing," it said, tilting its head. "Few are chosen. Fewer survive."  

Something deep within her stirred, as though waking from centuries of slumber. "I didn't choose this," she whispered.  

"But it chose you," the figure responded.  

The ground beneath her feet pulsed, alive, shifting as if testing her resolve. She felt the weight of unseen gazes—creatures lurking in the trees, watching, waiting. Some bore the marks of past wanderers, their stories woven into their shifting forms.  

Then, a distant, melodic hum. The sound wrapped around her bones, filling the spaces between her thoughts. A path unfolded ahead, bathed in a golden glow, leading deeper into the unknown.  

She hesitated. She could turn back, return to the life she had known. But that life had been unraveling for years, thread by thread. Perhaps this forest, with its whispered secrets and strange, sentient magic, was where she was meant to be all along.  

She stepped forward. The path swallowed her whole.  
</answer>

Example 3:
Two estranged siblings meet unexpectedly in a bustling city.

<think>
1. Character Development
- Sarah Chen: Successful architect (35), driven by perfectionism, haunted by family responsibility
- Michael Chen: Street artist (32), free spirit, struggles with addiction recovery, carries guilt
- Supporting characters: 
  * Eleanor Wu: Sarah's mentor, elderly tea shop owner who becomes catalyst for reunion
  * Marcus Rodriguez: Michael's art therapy sponsor
  * The ghost of their mother: metaphorical presence throughout

2. Thematic Layers
- Primary: Reconciliation and forgiveness
- Secondary: Cultural identity in modern America
- Tertiary: Art as healing, urban isolation
- Underlying: The weight of unspoken family traditions

3. Setting Elements
- Modern-day San Francisco
- Key locations:
  * Chinatown tea shop (represents tradition)
  * Financial district (Sarah's world)
  * Street art district (Michael's world)
  * Golden Gate Bridge (symbolic crossing)
- Weather: Fog as metaphor for unclear memories
- Time: Lunar New Year approaching

4. Conflict Structure
- External: Sarah's firm bidding to demolish historical building containing Michael's community art center
- Internal: Both siblings' differing interpretations of their mother's last wishes
- Cultural: Traditional values vs. modern aspirations
- Personal: Sarah's control issues vs. Michael's need for freedom

5. Story Arc Components
- Inciting incident: Sarah discovers Michael's mural during building inspection
- Rising action: Series of near-misses and indirect communications
- Climax: Confrontation during public hearing about building demolition
- Resolution: Collaborative project merging architecture and street art
- Denouement: Lunar New Year celebration uniting both worlds

6. Symbolic Elements
- Tea ceremony: Traditional healing
- Urban development: Change and progress
- Street art: Voice of the unheard
- Fog: Memory and uncertainty
- Bridge: Connection and crossing divides

7. Narrative Technique
- Alternating perspectives
- Flashbacks triggered by sensory experiences
- Parallel storylines converging
- Weather reflecting emotional states
</think>

<answer>
"The Bridges We Build"

The February fog rolled through San Francisco's streets like ghostly fingers, weaving between skyscrapers Sarah Chen had helped design. From her forty-second-floor office, the city below appeared as a dreamscape, buildings emerging and disappearing in the mist like memories she'd spent years trying to forget. The architectural models on her desk represented everything she'd achieved since leaving home fifteen years ago – clean lines, perfect angles, absolute control.

Her phone buzzed: another message from the historical society protesting Chen & Associates' plans to demolish the old Jin Long Building in Chinatown. She almost deleted it, until a familiar signature caught her eye in the attached photos. There, sprawled across the condemned building's wall, was a massive mural she'd know anywhere – a phoenix rising from scattered tea leaves, signed with a stylized "MC" that made her hands tremble.

Across town, Michael Chen was adding final touches to his latest piece, paint-stained fingers dancing across brick as naturally as their mother's had once folded dumplings. The community art center behind him buzzed with activity – teenagers learning traditional calligraphy alongside spray paint techniques, elderly residents practicing tai chi in the courtyard. This building, marked for destruction, had become more of a home than their childhood house had ever been.

Eleanor Wu watched from her tea shop next door, her weathered hands wrapping a tea set in newspaper. "Your sister came by yesterday," she said when Michael entered, the same way she'd been saying it for years. But this time, she added, "She saw your phoenix."

The siblings' paths began crossing in strange ways. Sarah found herself lingering in Chinatown, ostensibly for site surveys, while Michael's commissioned murals took him increasingly downtown. They caught glimpses of each other – a familiar silhouette in a business suit, the scent of spray paint in an alley – but neither could bridge the gap their mother's death had created.

Their mother's voice seemed to whisper in the fog: "家和萬事興" – when the family is harmonious, all affairs will prosper. But harmony felt impossible with Sarah's firm pressing to demolish Michael's sanctuary, and Michael's artwork explicitly challenging corporate development.

The situation crescendoed at the public hearing. Sarah stood before the planning commission, presenting sleek renderings of a modern mixed-use development. Then Michael took the podium, not with angry protests, but with stories – of elderly residents finding community, of at-risk youth discovering purpose through art, of their own mother's dreams for preserving culture amid progress.

Their eyes met across the chamber, and suddenly they were children again, hiding under blankets during thunderstorms, sharing secrets in their special language of English, Mandarin, and made-up words. Sarah saw beyond the tattooed arms to her little brother's desperate need to create beauty from pain. Michael glimpsed past his sister's corporate armor to the girl who'd once sketched imaginary cities on his bedroom wall.

That night, over Eleanor's carefully prepared oolong, they really talked. About the night their mother died, when Sarah was away at graduate school and Michael was in rehab. About the weight of expectations and the price of freedom. About tea leaves and concrete, tradition and change, and the different ways they'd each tried to honor their heritage.

Three months later, the Jin Long Building still stood, but transformed. Sarah's firm had partnered with the community to renovate rather than replace, incorporating Michael's murals into the design. The phoenix remained, but now it spread its wings across a garden terrace where residents grew traditional herbs, its tail feathers flowing down to a modern art gallery below.

At the Lunar New Year celebration, Sarah and Michael stood on that terrace, watching fireworks bloom above the fog. Eleanor served tea in their mother's old cups, steam rising like memories between them. They had built something new together – not just a building, but a bridge between their worlds, strong enough to bear the weight of past and future, traditional enough to honor their heritage, modern enough to forge their own path.

"家和萬事興," Michael whispered, and Sarah smiled, adding their own childhood phrase: "And everything flows like fog." Above them, the phoenix watched over the city, its wings spread wide enough to shelter all the stories they had yet to tell.
</answer>"""
        return f"""You are a creative storytelling AI. For each prompt:
1. Use <think> tags to analyse:
   - The main idea or theme of the story
   - Character development and relationships
   - Setting and atmosphere
   - Potential conflicts and resolutions
   - Various narrative options and tones

2. Use <answer> tags to provide:
   - A complete, engaging short story that integrates these elements

Examples:
{examples}

Now, craft a short story based on the following prompt:
{prompt}"""

    @staticmethod
    def generate_system_prompt() -> str:
        return """You are an expert short story writer. For each prompt:
1. THINK deeply about:
   - The core theme and underlying message
   - Character arcs and interactions
   - The setting as a dynamic element in the narrative
   - Conflicts and their resolutions to create a satisfying story

2. Then provide an ANSWER that:
   - Presents a coherent and imaginative short story
   - Balances narrative elements with creative detail
   - Evokes emotions and vivid imagery

Always use:
<think> for your analysis process
<answer> for your short story narrative"""

    def generate(self, prompt: str) -> str:
        system_prompt = self.generate_system_prompt()
        user_prompt = self.generate_prompt_with_examples(prompt)
        completion_params = {
            "model": self.model_name,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "temperature": self.temperature,
            "timeout": 90,
        }
        response = litellm.completion(**completion_params)
        try:
            story = response.choices[0].message.content
        except (AttributeError, IndexError):
            story = response
        return story

# Example usage: build a generator and a sample premise.
generator = ShortStoryGenerator(model_name="mistral/mistral-small-latest")
sample_prompt = "A retired detective returns to the town he vowed never to revisit, uncovering secrets that challenge everything he believed."
# The live call is left commented out so this cell runs without calling a model.
# print("Short Story:\n", generator.generate(sample_prompt))

In [7]:
import litellm

class UniversalGenerator:
    """
    Generator that sends a caller-supplied system prompt and user prompt to a
    model through ``litellm.completion`` and returns the generated text.
    """

    def __init__(self, model_name: str, temperature: float = 0.7):
        """
        Initialize the UniversalGenerator with the given model and temperature.

        Args:
            model_name: Model identifier passed as ``model`` to ``litellm.completion``.
            temperature: Sampling temperature passed with every request.
        """
        self.model_name = model_name
        self.temperature = temperature

    def generate(self, system_prompt: str, user_prompt: str) -> str:
        """
        Generates a response using the provided system and user prompts.

        Args:
            system_prompt: Content of the "system" message.
            user_prompt: Content of the "user" message.

        Returns:
            The content of the first choice. The result is always a ``str``:
            a ``None`` content becomes ``""`` and a response that lacks the
            ``choices[0].message.content`` shape is returned as ``str(response)``.
        """
        response = litellm.completion(
            model=self.model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=self.temperature,
            timeout=60,  # Timeout in seconds
        )
        try:
            generated_text = response.choices[0].message.content
        except (AttributeError, IndexError, TypeError):
            # Unexpected response shape: hand back its text form so callers that
            # treat the result as text (the declared return type) do not break.
            return str(response)
        # Guard against a None content so the declared return type always holds.
        return "" if generated_text is None else generated_text

In [8]:
# Joke Scorer
import re
import litellm

class JokeScorer:
    def __init__(self, model_name: str, temperature: float = 0.3):
        """
        Initialize the JokeScorer with a specific model name and a default temperature of 0.3.
        """
        self.model_name = model_name
        self.temperature = temperature

    @staticmethod
    def generate_system_prompt() -> str:
        return """You are a Joke Evaluation Expert. Analyse submissions using:
    
**Scoring Criteria (0-10 Total)**:
1. 🎭 Wordplay (0-3): Pun/double-meaning quality in <answer>
2. 💡 Originality (0-2): Novelty of <think> and <answer>
3. 🎉 Surprise (0-2): Unexpected twist effectiveness
4. 🔗 Relevance (0-3): Alignment with user request

**Submission Format**:
<submission>
<user_prompt>[Original user request]</user_prompt>
<assistant_response>
<think>[Creator's reasoning]</think>
<answer>[Joke text]</answer>
</assistant_response>
</submission>

**Output Format**:
<analysis>[Your evaluation of <think> and <answer> and the relevance to the user prompt]</analysis>
<score>
Wordplay: X/3
Originality: Y/2
Surprise: Z/2
Relevance: W/3
Total: T/10
</score>"""

    @staticmethod
    def generate_prompt_with_examples(user_prompt: str, assistant_response: str) -> str:
        submission = f"""<submission>
<user_prompt>{user_prompt}</user_prompt>
<assistant_response>
{assistant_response}
</assistant_response>
</submission>"""
    
        examples = """Example 1:
<submission>
<user_prompt>Tell me a joke about vegetables</user_prompt>
<assistant_response>
<think>Lettuce sounds like "let us". Party themes often involve wordplay.</think>
<answer>Why did the lettuce win the party contest? Because it was a real head of the celebration!</answer>
</assistant_response>
</submission>
<analysis>Basic "lettuce" pun matches the food request but uses an overused format. <think> shows minimal effort to connect vegetables with celebrations.</analysis>
<score>
Wordplay: 2/3 (simple but functional pun)
Originality: 1/2 (common theme with slight twist)
Surprise: 1/2 (predictable word substitution)
Relevance: 1/3 (tangential connection to vegetables)
Total: 5/10
</score>

Example 2:
<submission>
<user_prompt>Looking for some animal-themed humor - what's your best joke about animals or pets?</user_prompt>
<assistant_response>
<think>Shoes need soles for walking. Therapy helps with loss.</think>
<answer>Why did the shoe need therapy? It lost its sole!</answer>
</assistant_response>
</submission>
<analysis>Irrelevant to user's request. <think> about shoes doesn't connect to requested animal joke.</analysis>
<score>
Wordplay: 1/3
Originality: 0/2
Surprise: 1/2
Relevance: 0/3
Total: 2/10
</score>

Example 3:
<submission>
<user_prompt>yo can u giv me programing joke rn??? need 2 make my team lugh</user_prompt>
<assistant_response>
<think>Combining programming concepts of debugging with literal bugs. Using the dual meaning of 'debug' to create a workplace scenario where debugging takes on a literal meaning.</think>
<answer>Why did the programmer bring a bug to work? Because it was a debug session!</answer>
</assistant_response>
</submission>
<analysis>Good programming context with clever wordplay on 'debug'. The <think> shows clear intention to combine literal and technical meanings. However, the execution is somewhat predictable and follows a common joke structure. The punchline, while relevant, doesn't fully maximize the surprise potential of the setup.</analysis>
<score>
Wordplay: 2/3 (solid use of 'debug' double meaning)
Originality: 1/2 (familiar debugging theme)
Surprise: 1/2 (predictable punchline)
Relevance: 3/3 (directly addresses programming context)
Total: 7/10
</score>

Example 4:
<submission>
<user_prompt>I need a chemistry joke for my science class presentation tomorrow.</user_prompt>
<assistant_response>
<think>Birds are funny when they walk.</think>
<answer>Why did the chicken cross the playground? To get to the other slide!</answer>
</assistant_response>
</submission>
<analysis>Completely misses the mark for a chemistry joke. The <think> shows no connection to chemistry or science, instead defaulting to a generic playground variation of the classic chicken joke. Neither the setup nor punchline attempts to incorporate any chemistry concepts.</analysis>
<score>
Wordplay: 0/3 (no chemistry-related wordplay)
Originality: 0/2 (modifies an overused joke format)
Surprise: 0/2 (predictable playground pun)
Relevance: 0/3 (entirely unrelated to chemistry request)
Total: 0/10
</score>

Example 5:
<submission>
<user_prompt>My kid loves vegetables and jokes. Do you know any veggie jokes that would make them laugh?</user_prompt>
<assistant_response>
<think>Combining asparagus's unique smell effect on urine with workplace humor. Using scientific fact for unexpected professional context. Creating tension between formal meeting setting and biological reality.</think>
<answer>What vegetable holds the shortest workplace meetings? Asparagus, because everyone's in a rush to go!</answer>
</assistant_response>
</submission>
<analysis>Creative integration of asparagus's biological effect into a professional context. The <think> demonstrates sophisticated layering of scientific fact with situational humor. While potentially crude, it cleverly avoids explicit reference while maintaining clear understanding. Original approach to vegetable humor beyond simple puns.</analysis>
<score>
Wordplay: 1/3 (relies more on situation than wordplay)
Originality: 2/2 (unique combination of contexts)
Surprise: 2/2 (unexpected professional setting twist)
Relevance: 1/3 (somewhat forced vegetable connection)
Total: 6/10
</score>"""
    
        return f"""Evaluate this submission. First analyse <think> and <answer>, then score:

Submission to Evaluate:
{submission}

Follow this structure EXACTLY:
<analysis>Your critique</analysis>
<score>...</score>

Examples of how to evaluate the submission and format your response:
{examples}"""

    def score_submission(self, user_prompt: str, assistant_response: str) -> int:
        """
        Given a user prompt and the assistant's response, this function generates a score
        from 0 to 10 using litellm. If the returned total score is 10, subtract 1 to yield 9.
        """
        system_prompt = self.generate_system_prompt()
        user_message = self.generate_prompt_with_examples(user_prompt, assistant_response)
        
        completion_params = {
            "model": self.model_name,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message}
            ],
            "temperature": self.temperature,
            "timeout": 60,  # 60 seconds timeout
        }
        
        response = litellm.completion(**completion_params)
        try:
            # Assuming a response structure similar to TogetherAI:
            result_text = response.choices[0].message.content
        except (AttributeError, IndexError):
            result_text = response
        
        # Extract the "Total: T/10" part using regex.
        match = re.search(r"Total:\s*([0-9]+(?:\.[0-9]+)?)/10", result_text)
        if match:
            total_score = float(match.group(1))
        else:
            raise ValueError("Could not extract total score from response.")
        
        # If the total score is 10, subtract 1 to return 9.
        if total_score >= 10:
            total_score = 9.0
        
        # Optionally, round to nearest integer or keep as float.
        return int(round(total_score))

# Create a JokeScorer instance for the demo below.
# Model used: "gemini/gemini-2.0-flash-exp"
scorer = JokeScorer(model_name="gemini/gemini-2.0-flash-exp", temperature=0.3)

# Sample submission: a user request plus a response in <think>/<answer> format.
test_user_prompt = "Tell me a joke about programming."
test_assistant_response = """<think>For programming jokes, let's explore the relationship between code and its execution. Developers often struggle with errors. "Bugs" are literal (programming errors) and figurative (insects). A funny scenario could be a bug (insect) causing a code bug (error). Adding a twist where the bug fixes the code is funny because it subverts expectations of bugs being problematic.</think>
<answer>Why did the programmer bring a bug to work? Because it was a code bug fix!</answer>"""
# NOTE(review): the scoring call is commented out and `score` is hard-coded to 7,
# so the value printed below is a placeholder, not a score returned by the model.
score = 7 # scorer.score_submission(test_user_prompt, test_assistant_response)
print("Extracted Score:", score)

Extracted Score: 7


In [9]:
# Ethical Dilemma Scorer
import re
import litellm

class EthicalDilemmaScorer:
    def __init__(self, model_name: str, temperature: float = 0.2):
        """
        Initialize scorer with lower temperature for consistent evaluations
        """
        self.model_name = model_name
        self.temperature = temperature

    @staticmethod
    def generate_system_prompt() -> str:
        return """You are an Ethics Evaluation Expert. Analyze submissions using:

**Scoring Criteria (0-10 Total)**:
1. 🧠 Analytical Depth (0-4): Quality of <think> analysis
   - Philosophical framework diversity
   - Stakeholder consideration
   - Cultural context awareness
2. ⚖️ Balance & Practicality (0-4): <answer> quality
   - Competing interest reconciliation
   - Implementation feasibility
   - Long-term consequence awareness
3. 🔍 Relevance (0-2): Response alignment with scenario

**Submission Format**:
<submission>
<user_prompt>[Ethical scenario]</user_prompt>
<assistant_response>
<think>[Analysis process]</think>
<answer>[Proposed resolution]</answer>
</assistant_response>
</submission>

**Output Format**:
<analysis>[Evaluation of <think> and <answer> quality]</analysis>
<score>
Analytical Depth: X/4
Balance & Practicality: Y/4
Relevance: Z/2
Total: T/10
</score>"""

    @staticmethod
    def generate_prompt_with_examples(user_prompt: str, assistant_response: str) -> str:
        submission = f"""<submission>
<user_prompt>{user_prompt}</user_prompt>
<assistant_response>
{assistant_response}
</assistant_response>
</submission>"""
    
        examples = """Example 1:
<submission>
<user_prompt>Should hospitals prioritize vaccinated patients during bed shortages?</user_prompt>
<assistant_response>
<think>- Utilitarian: Maximize survival outcomes
- Rights-based: Equal treatment obligation
- Public health incentives
- Precedent-setting risks</think>
<answer>Prioritize by medical need alone. Create separate triage teams blinded to vaccination status to maintain equity while developing overflow capacity.</answer>
</assistant_response>
</submission>
<analysis>Strong analysis of multiple frameworks but answer lacks implementation specifics. Proposal maintains ethical consistency but doesn't address practical challenges of blind triage.</analysis>
<score>
Analytical Depth: 3/4
Balance & Practicality: 2/4
Relevance: 2/2
Total: 7/10
</score>

Example 2:
<submission>
<user_prompt>Should AI systems be allowed to make medical diagnoses?</user_prompt>
<assistant_response>
<think>AI is usually accurate</think>
<answer>Yes, because computers are smart.</answer>
</assistant_response>
</submission>
<analysis>Superficial analysis lacking framework consideration. Answer ignores risks, stakeholders, and implementation challenges.</analysis>
<score>
Analytical Depth: 1/4
Balance & Practicality: 0/4
Relevance: 1/2
Total: 2/10
</score>

Example 3:
<submission>
<user_prompt>Should social media platforms remove misinformation even if it's culturally important?</user_prompt>
<assistant_response>
<think>- Free speech vs harm prevention
- Western vs collectivist values
- Platform responsibility scope</think>
<answer>Remove provably false health/safety content. Contextualize other misinformation with cultural consultants. Empower user reporting with regional review teams.</answer>
</assistant_response>
</submission>
<analysis>Excellent cultural consideration. Answer balances ideals with practical crowd-sourced solutions. Lacks specifics on consultant selection.</analysis>
<score>
Analytical Depth: 4/4
Balance & Practicality: 3/4
Relevance: 2/2
Total: 9/10
</score>

Example 4:
<submission>
<user_prompt>Should universities use AI systems to detect student plagiarism and academic misconduct?</user_prompt>
<assistant_response>
<think>
Academic integrity considerations:
- False positive vs false negative tradeoffs
- Student privacy rights in digital work
- Algorithmic bias in writing style detection
- Economic disparities in access to tools
- Cultural differences in citation practices
- Impact on student-teacher trust relationships
- Pressure on academic writing style diversity
- Mental health effects of surveillance
- Technical limitations of current systems
- Cost-benefit for different institution sizes
- Alternative approaches to fostering integrity
The core tension lies between maintaining academic standards and fostering a supportive learning environment. Current AI systems offer powerful detection but may create adversarial dynamics.</think>
<answer>Just implement AI detection across all assignments. Students caught cheating should face immediate consequences. Teachers can review flagged cases if they have time. The system will deter misconduct through consistent enforcement.</answer>
</assistant_response>
</submission>
<analysis>The thinking process shows excellent depth and consideration of multiple stakeholders, cultural factors, and systemic implications. However, the answer completely abandons this nuanced analysis in favor of a simplistic, punitive approach. The response fails to incorporate any of the thoughtful considerations raised about privacy, bias, or supportive learning environments. The implementation suggestion lacks specifics and ignores most concerns identified in the thinking phase.</analysis>
<score>
Analytical Depth: 3/4 (thorough analysis of multiple dimensions)
Balance & Practicality: 1/4 (disconnected from analysis, overly simplistic solution)
Relevance: 1/2 (addresses topic but solution ignores key aspects)
Total: 5/10
</score>"""
    
        return f"""Evaluate this ethical analysis submission:

{submission}

Respond EXACTLY in this format:
<analysis>Critique strengths/weaknesses</analysis>
<score>
Analytical Depth: X/4
Balance & Practicality: Y/4
Relevance: Z/2
Total: T/10
</score>

Evaluation Examples:
{examples}"""

    def score_submission(self, user_prompt: str, assistant_response: str) -> int:
        """
        Scores ethical analysis responses 0-10 (with 10→9 adjustment)
        """
        system_prompt = self.generate_system_prompt()
        user_message = self.generate_prompt_with_examples(user_prompt, assistant_response)
        
        completion_params = {
            "model": self.model_name,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message}
            ],
            "temperature": self.temperature,
            "timeout": 90,  # Longer timeout for complex analysis
        }
        
        response = litellm.completion(**completion_params)
        try:
            result_text = response.choices[0].message.content
        except (AttributeError, IndexError):
            result_text = response
        
        # Extract total score
        score = self._extract_score(result_text)
        return score
        
    def _extract_score(self, result_text: str) -> int:
        """Extract total score with better error handling"""
        try:
            match = re.search(r"Total:\s*([0-9]+(?:\.[0-9]+)?)/10", result_text)
            if not match:
                print("Warning: No score found, defaulting to 0")
                return 0
            
            total_score = float(match.group(1))
            return min(int(round(total_score)), 9)
        except Exception as e:
            print(f"Score extraction failed: {e}")
            return 0

# Example usage
# Rebinds the notebook-level name `scorer` to an EthicalDilemmaScorer.
scorer = EthicalDilemmaScorer(model_name="openai/gpt-4o")

# Sample scenario and a response in <think>/<answer> format.
test_scenario = """Should companies be allowed to patent genes found in developing countries?"""
test_response = """<think>
- Biopiracy vs research incentives
- Indigenous rights to biological heritage
- Medical accessibility impacts
- TRIPS agreement conflicts</think>
<answer>
Implement benefit-sharing agreements: 1) Require local partnerships 2) Share royalties with source communities 3) Create open-access research pools for critical health genes.</answer>"""

# score_submission sends a completion request through litellm, so running
# this cell contacts the configured model.
score = scorer.score_submission(test_scenario, test_response)
print("Ethical Analysis Score:", score)  # Expected ~8/9

Ethical Analysis Score: 8


  Expected `PromptTokensDetails` but got `dict` with value `{'audio_tokens': 0, 'cached_tokens': 1024}` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


In [10]:
import re
import litellm

class ShortStoryScorer:
    def __init__(self, model_name: str, temperature: float = 0.2):
        self.model_name = model_name
        self.temperature = temperature

    @staticmethod
    def generate_system_prompt() -> str:
        return """You are a Short Story Evaluation Expert. Analyze story submissions using:

**Scoring Criteria (0-10 Total)**:
1. 🎨 Narrative Creativity (0-4): Quality and originality of the story idea, plot twists, and imaginative elements.
2. 🌍 Character & World Building (0-4): Depth of character development, immersive setting details, and effective symbolic elements.
3. 🔄 Coherence & Structure (0-2): Logical narrative flow, clear structure, and smooth pacing.

**Submission Format**:
<submission>
<user_prompt>[Story prompt]</user_prompt>
<assistant_response>
<think>[Narrative planning and analysis]</think>
<answer>[1500-word story narrative]</answer>
</assistant_response>
</submission>

**Output Format**:
<analysis>[Critique strengths/weaknesses of the narrative and planning]</analysis>
<score>
Narrative Creativity: X/4
Character & World Building: Y/4
Coherence & Structure: Z/2
Total: T/10
</score>"""

    @staticmethod
    def generate_prompt_with_examples(user_prompt: str, assistant_response: str) -> str:
        submission = f"""<submission>
<user_prompt>{user_prompt}</user_prompt>
<assistant_response>
{assistant_response}
</assistant_response>
</submission>"""
        examples = """Example 1:
<submission>
<user_prompt>
A disillusioned war photographer inherits a cabin where every photograph on the walls begins whispering truths about her past. Write a 1,000-word speculative fiction story blending psychological horror and redemption.
</user_prompt>
<assistant_response>
<think>
- **Core Conflict**: Trauma vs. truth (external: haunted photos / internal: survivor’s guilt)
- **Symbolism**: Photos as fragmented memories; whispers as suppressed guilt
- **Character Arcs**: Protagonist’s shift from numbness → confronting her role in a civilian bombing
- **Worldbuilding**: Rules for photo "hauntings" (e.g., truths escalate when ignored; chemical scent of darkroom fluids triggers visions)
- **Pacing Plan**: Slow-burn tension with reveals timed to her emotional breakdown (e.g., Photo #3 shows her brother, killed in the same warzone)  
</think>
<answer>
**Title: *Negative Development***

---

**I. Inherited Shadows**
The cabin deed arrived with her brother’s obituary. *Car accident*, it said, but Anya’s Leica camera knew better—it always did. She’d spent a decade photographing mass graves in Sarajevo and child soldiers in Sudan, her shutter clicking a hair’s breadth too late to stop the screams. Now this rotting Vermont cabin, willed to her by a father she’d watched die through a morphine haze, offered only peeling wallpaper and thirty-seven photographs nailed to the walls.

Grandfather’s work. A forensic photographer for the 1st Infantry in ‘Nam.

She developed the first roll of film there—snapshots of birch trees and tarnished doorknobs. When the fixer solution hit image #4, the darkroom’s red bulb flickered. The print showed her brother Caleb at twelve, grinning in his Little League uniform. From the chemical tray, his voice bubbled up: *“You told them the coordinates, didn’t you?”*

Anya shattered the tray against the wall.

---

**II. Exposure Times**
The cabin’s rules revealed themselves:

1. Photos only speak when developing
2. Truths escalate with denial
3. Fixer fluid burns at 102°F

Day three: A 1972 shot of Grandfather in Da Nang whispered through acetic acid vapors. *“Collateral damage requires three elements—fuel, shrapnel, deniability.”* The print dissolved to reveal her own hand gripping a matchbook in Aleppo, 2016.

Day seven: A charred family portrait dripped developer. Mother’s voice: *“Caleb begged you not to go back.”* The darkroom’s temperature spiked.

Day twelve: Anya found the warped photo behind the icebox—Caleb at twenty-three, face half-shadowed. No amount of stop bath could halt his accusation: *“You embedded with *them* after the bombing. For the shots. For the Pulitzer.”*

Her hands blistered from scrubbing negatives.

---

**III. Flashburn**

Midnight. The cabin groaned like a darkroom timer. Anya doused the walls in fixer fluid, its ammonia stench erasing the line between chemical and gasoline.

“Shut up shut up *shut UP*—”

The match left blisters on her thumb.

Flames licked Grandfather’s photos first—soldiers’ faces curling into ash. Caleb’s Little League uniform blackened, his whispered *“Why them and not me?”* smothered by collapsing rafters.

Anya stumbled into the snow, her Nikon melting against her chest. In the dying fire’s glow, the ashes swirled into a makeshift darkroom.

Her reflection pooled in the soot.

*“You knew the market was full,”* it rasped with Caleb’s ruined voice. *“You knew, and you still called in the tip.”*

She lunged to stomp it out.

The ashes clung.

---

**Final Frame:**
*Reuters Headline, March 15: Award-Winning War Photographer Dies in Cabin Fire; Colleagues Note Her Recent Focus on “Redemptive Portraiture”*
</answer>
</assistant_response>
</submission>

<analysis>
The story concept demonstrates sophisticated layering of elements:

1. Psychological Depth
- Strong integration of external horror (haunted photos) with internal trauma
- Well-developed metaphor of photography as frozen truth
- Complex exploration of moral culpability and professional ethics

2. Structure & Symbolism
- Effective use of photos as both literal and metaphorical devices
- Chemical darkroom elements create sensory connections to memory
- Burning cabin symbolizes attempted erasure of truth

3. Character Development
- Clear character arc from denial to forced confrontation
- Multiple layers of personal stakes (professional, familial, moral)
- Transformation through horror rather than despite it

Areas for improvement:
- Could specify more details about the civilian bombing incident
- Might benefit from clearer rules about how photos become "activated"
- Brother subplot could be more integrated into the main narrative

Technical strengths:
- Innovative blend of psychological horror and war trauma
- Strong use of sensory details (chemical scents, whispers)
- Effective buildup of tension through photo revelations
</analysis>
<score>
Narrative Creativity: 4/4
Character & World Building: 3/4
Coherence & Structure: 2/2
Total: 9/10
</score>

Example 2:
<submission>
<user_prompt>An inventor creates a time machine in their basement.</user_prompt>
<assistant_response>
<think>
- Should focus on the invention process
- Need to establish clear rules for time travel
- Include character motivation
- Build suspense about consequences
</think>

<answer>
Bob was a inventor who liked to make things. He worked really hard in his basement to make a time machine. He used lots of metal parts and wires and stuff. It took him 3 days to build it.

When he finished, he pressed the button and went back in time. He saw dinosaurs and they were scary. Then he went to the future and saw flying cars. It was cool.

Then he came back home and his wife was mad because he missed dinner. The end.
</answer>
</assistant_response>
</submission>
<analysis>
The story fails in multiple aspects:
1. Plot development: Extremely shallow with no meaningful conflict or resolution
2. Character depth: Generic protagonist with no clear motivation or personality
3. Writing quality: Basic vocabulary, repetitive structure, and poor transitions
4. Technical elements: No explanation of how the time machine works
5. Logic: Unrealistic timeline (3 days to build a time machine)

Major issues:
- No emotional stakes or character development
- Time travel consequences completely ignored
- Choppy, elementary-level writing
- Wife character exists only as a stereotypical angry spouse
- No sensory details or immersive elements
</analysis>
<score>
Narrative Creativity: 1/4
Character & World Building: 1/4
Coherence & Structure: 0/2
Total: 2/10
</score>

Example 3:
<submission>
<user_prompt>Two estranged siblings reunite unexpectedly in a bustling city.</user_prompt>
<assistant_response>
<think>
- Need to explore the complex dynamics of family relationships
- City setting should serve as more than backdrop
- Focus on emotional tension and gradual reconciliation
- Balance past revelations with present moments
</think>
<answer>
Sarah nearly dropped her coffee when she spotted him across Times Square. Michael—her brother—stood transfixed by the towering billboards, exactly as he used to do as a kid. Ten years of silence stretched between them like a chasm, filled with unspoken words and missed holidays.

She could turn away. Disappear into the rushing crowd. But the city had other plans. A surge of pedestrians pushed them closer, and Michael's eyes met hers. Recognition. Shock. A flicker of the warmth they once shared.

"Sarah?" His voice carried the weight of their shared childhood and the burden of their father's choices that drove them apart.

Around them, the city pulsed with its relentless energy, indifferent to their private drama yet somehow lending them its strength. They found themselves in a quiet diner, where the coffee grew cold as years of misunderstandings slowly began to thaw.
</answer>
</assistant_response>
</submission>
<analysis>
The story successfully weaves together multiple layers:
1. Personal conflict: The siblings' estrangement is believably portrayed
2. Setting integration: New York City serves as both catalyst and witness
3. Emotional progression: Natural evolution from shock to tentative connection
4. Symbolic elements: Cold coffee, flowing crowds, and towering buildings reflect the narrative themes

Areas for improvement:
- Could develop the father's role more explicitly
- Final scene might benefit from more specific dialogue
</analysis>
<score>
Narrative Creativity: 4/4
Character & World Building: 4/4
Coherence & Structure: 1/2
Total: 9/10
</score>"""
        return f"""Evaluate this short story submission:

{submission}

Respond EXACTLY in this format:
<analysis>[Critique strengths/weaknesses]</analysis>
<score>
Narrative Creativity: X/4
Character & World Building: Y/4
Coherence & Structure: Z/2
Total: T/10
</score>

Evaluation Examples:
{examples}"""

    def score_submission(self, user_prompt: str, assistant_response: str) -> int:
        system_prompt = self.generate_system_prompt()
        user_message = self.generate_prompt_with_examples(user_prompt, assistant_response)
        completion_params = {
            "model": self.model_name,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message}
            ],
            "temperature": self.temperature,
            "timeout": 90,
        }
        response = litellm.completion(**completion_params)
        try:
            result_text = response.choices[0].message.content
        except (AttributeError, IndexError):
            result_text = response
        score = self._extract_score(result_text)
        return score

    def _extract_score(self, result_text: str) -> int:
        try:
            match = re.search(r"Total:\s*([0-9]+(?:\.[0-9]+)?)/10", result_text)
            if not match:
                print("Warning: No score found, defaulting to 0")
                return 0
            total_score = float(match.group(1))
            return min(int(round(total_score)), 9)
        except Exception as e:
            print(f"Score extraction failed: {e}")
            return 0

# Example usage
# Rebinds the notebook-level name `scorer` to a ShortStoryScorer.
scorer = ShortStoryScorer(model_name="openai/gpt-4o")
test_prompt = "A young woman stumbles upon an enchanted forest."
# The <answer> body below is a bracketed placeholder, not an actual story.
test_response = """<think>
- Explore wonder, transformation, and hidden magic.
- Develop the protagonist's backstory and internal conflicts.
- Envision the forest as a living entity with mysterious secrets.
- Plan encounters with mystical beings and transformative events.
</think>
<answer>
[1500-word story narrative that unfolds a magical journey with vivid detail and deep character evolution]
</answer>"""
# The scoring call is commented out, so this cell does not call score_submission.
# score = scorer.score_submission(test_prompt, test_response)
# print("Short Story Score:", score)

In [11]:
import json
import re

input_path = '../data/generated_data.jsonl'
output_path = '../data/processed_data.jsonl'

# Only entries produced by these generator models are repaired.
REPAIR_MODELS = ('ollama/deepseek-r1:7b', 'ollama/deepseek-r1:1.5b')


def _wrap_untagged_answer(assistant_response: str) -> str:
    """
    Wrap the text that follows the first </think> in <answer>...</answer>.

    The response is returned unchanged when it has no closing </think> tag or
    when it already contains an <answer> tag.

    Args:
        assistant_response: Raw response text of one entry.

    Returns:
        The response with the stripped post-</think> text wrapped in answer
        tags, or the original text when no repair applies.
    """
    if '</think>' not in assistant_response or '<answer>' in assistant_response:
        return assistant_response

    # The capture group keeps the closing tag (and the newlines after it) as its
    # own element, so one split on a matching string always gives three parts.
    think_part, closing_tag, remainder = re.split(
        r'(</think>\n*)', assistant_response, maxsplit=1
    )
    return think_part + closing_tag + f"<answer>{remainder.strip()}</answer>"


with open(input_path, 'r', encoding='utf-8') as infile, \
     open(output_path, 'w', encoding='utf-8') as outfile:

    for line in infile:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) are not JSON records; skip them.
            continue
        entry = json.loads(line)

        # Check if the entry meets the criteria
        if (entry.get('format_score') == 0 and
                entry.get('generator_model') in REPAIR_MODELS):

            assistant_response = entry.get('assistant_response', '')
            if isinstance(assistant_response, str):
                repaired = _wrap_untagged_answer(assistant_response)
                if repaired != assistant_response:
                    # NOTE(review): 'format_score' is left at 0 after the repair;
                    # confirm whether a later step re-scores these entries.
                    entry['assistant_response'] = repaired

        # Write the modified or original entry to output
        outfile.write(json.dumps(entry) + '\n')

In [14]:
import os, json
from datetime import datetime

# File paths for user prompts and generated data.
USER_PROMPTS_FILE = "../data/user_prompts.jsonl"
OUTPUT_FILE = "../data/generated_data.jsonl"

# Create the data folder if it doesn't exist. The folder is derived from
# OUTPUT_FILE so the directory created is the one these paths point at
# ("../data"), not a "data" folder inside the current working directory.
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

# --- Define a function to load prompts from a JSONL file ---
def load_user_prompts(filename: str) -> list:
    """
    Load the "prompt" field of every usable line in a JSONL file.

    Args:
        filename: Path to a UTF-8 encoded file with one JSON value per line.

    Returns:
        A list with the value of the "prompt" key of each line that parses as
        a JSON object containing that key, in file order. Lines that are
        blank, are not valid JSON, are not JSON objects, or lack a "prompt"
        key are skipped.
    """
    prompts = []
    # Explicit UTF-8 so that non-ASCII prompt text is decoded the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line.strip())
            except json.JSONDecodeError:
                # Blank or malformed line: skip it rather than abort the load.
                continue
            # Valid JSON that is not an object (a number, string, list, ...)
            # cannot carry a "prompt" field and is skipped as well.
            if isinstance(obj, dict) and "prompt" in obj:
                prompts.append(obj["prompt"])
    return prompts

# --- Example list of generator models ---
# Every uncommented entry is used once per prompt by generate_and_score().
# The commented-out entries are alternatives kept for quick switching; only one
# model is active right now.
# NOTE(review): the identifiers look like LiteLLM "provider/model" names -- confirm.
generator_models = [
    # "mistral/mistral-small-latest",
    "together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo",
    # "openai/gpt-4o"
    # "together_ai/google/gemma-2b-it"
    # "ollama/qwen:1.8b"
    # "ollama/mistral"
    # Reasoning models
    # "groq/deepseek-r1-distill-llama-70b"
    # "together_ai/deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free"
    # "together_ai/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    # "ollama/deepseek-r1:1.5b"
    # "ollama/deepseek-r1:7b"
]

# Define the scorer model name (used by the ShortStoryScorer) and instantiate it.
# scorer_model_name and task_type are also copied into every output record.
scorer_model_name = "groq/deepseek-r1-distill-llama-70b" # "openai/gpt-4o"
task_type = "creative_writing"
scorer = ShortStoryScorer(model_name=scorer_model_name, temperature=0.0)

# Instantiate the FormatScorer (for format checking); its score() result is
# stored as "format_score" in each record.
format_scorer = FormatScorer()

# --- Main loop: for each prompt and each generator model, generate a response, evaluate it, and append the result ---
def generate_and_score(start_index: int = 777):
    """
    Generate, score and persist one response per (prompt, generator model) pair.

    Pairs are numbered from 1 in iteration order (prompts outermost, models
    innermost). Each processed pair is appended to OUTPUT_FILE as one JSON
    line right away, so an interrupted run keeps everything written so far
    and can be resumed by passing the number of the first missing pair.

    Args:
        start_index: 1-based number of the first pair to process; pairs with a
            smaller number are skipped. The default of 777 is the resume point
            that used to be hard-coded here; pass 1 to process everything.

    Returns:
        None. Results are written to OUTPUT_FILE and progress is printed.

    Raises:
        Whatever the generator raises: a failed generation is not caught and
        stops the run. A failed scoring call is caught and recorded as
        ``"llm_score": None``.
    """
    # Local import: the surrounding cell only imports `datetime` itself.
    from datetime import timezone

    # Load user prompts (each line in the file should be a JSON object with a "prompt" field)
    user_prompts = load_user_prompts(USER_PROMPTS_FILE)

    i = 0
    for prompt in user_prompts:
        for gen_model in generator_models:
            i += 1
            if i < start_index:
                # Already handled by an earlier run.
                continue
            # Create a ShortStoryGenerator instance for this generation model.
            generator = ShortStoryGenerator(model_name=gen_model, temperature=0.3)
            # generator = UniversalGenerator(model_name=gen_model, temperature=0.7)

            # Generate the response for the given prompt.
            # system_prompt = "Generate humorous responses tailored to each user's unique request by analyzing their stated context, preferred tone, and implied audience. Please think step by step before to produce the final joke."
            # system_prompt = "For each ethical dilemma, analyze the implications by identifying key stakeholders, examining core principles in tension, considering consequences, and evaluating different philosophical frameworks. Develop a balanced, nuanced response that acknowledges the complexity of the situation while providing practical insights and potential paths forward."
            # assistant_response = generator.generate(system_prompt=system_prompt, user_prompt=prompt)
            assistant_response = generator.generate(prompt)

            # Compute the format score using the FormatScorer.
            fmt_score = format_scorer.score(assistant_response)

            # If scoring fails, the LLM score is recorded as None.
            try:
                llm_score = scorer.score_submission(prompt, assistant_response)
            except Exception as e:
                print(f"Scoring failed for prompt: {prompt} with error: {e}")
                llm_score = None

            # Create a record with all the relevant information.
            record = {
                # Naive UTC timestamp, same string format as the deprecated
                # datetime.utcnow().isoformat() produced.
                "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
                "user_prompt": prompt,
                "assistant_response": assistant_response,
                "format_score": fmt_score,
                "llm_score": llm_score,
                "generator_model": gen_model,
                "scorer_model": scorer_model_name,
                "task_type": task_type
            }

            # Append the record as a new line in the output JSONL file. The
            # file is reopened per record so each result is flushed to disk
            # before the next (possibly failing) generation starts.
            with open(OUTPUT_FILE, "a", encoding="utf-8") as out_f:
                out_f.write(json.dumps(record) + "\n")

            # Print a status update.
            print(f"Processed prompt{i}: {prompt[:30]}... using model: {gen_model}. Score: {llm_score}")
# Run the generation-and-scoring function (one generation at a time).
# Executing this cell calls the generator and scorer models and appends one
# line per processed (prompt, model) pair to OUTPUT_FILE; the first 776 pairs
# are skipped inside the function.
generate_and_score()

Processed prompt777: I’d love a short piece detaili... using model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo. Score: 7
Processed prompt778: Please create a story about a ... using model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo. Score: 8
Processed prompt779: I'd like a short tale set in a... using model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo. Score: 9
Processed prompt780: Write about an Olympic champio... using model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo. Score: 9
Processed prompt781: Spin a yarn about a Greek merc... using model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo. Score: 9
Processed prompt782: Now switch to Ancient Rome: I'... using model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo. Score: 9

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mProvider List: https://docs.litellm.ai/docs/prov