In [1]:
# Install libraries
!pip install together litellm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Formatting Scorer Class with Tests
import re

class FormatScorer:
    """
    Binary format check for text that is expected to use the tag layout
    <think>...</think><answer>...</answer>.
    """
    def __init__(self):
        # Compiled once; the single capture group makes findall() return
        # the matched tag strings themselves.
        self.tag_pattern = re.compile(r'(<think>|</think>|<answer>|</answer>)')

    def score(self, text: str) -> int:
        """
        Return 1 when the tags found in ``text`` are exactly <think>, </think>,
        <answer>, </answer> -- each once and in that order -- and 0 otherwise.
        Text between or around the tags is not inspected.
        """
        expected_order = ('<think>', '</think>', '<answer>', '</answer>')
        found_tags = tuple(self.tag_pattern.findall(text))
        if found_tags == expected_order:
            return 1
        return 0

# --- Tests for the FormatScorer class ---

def run_tests():
    """
    Check FormatScorer.score against four fixed inputs and print a
    confirmation once every assertion has passed.
    """
    scorer = FormatScorer()

    # (input text, expected score), listed in test-case order.
    cases = [
        # 1: no tags at all
        ("Hello, world!", 0),
        # 2: only the <think> pair is present
        ("<think>This is a thought</think> Some text without answer tags.", 0),
        # 3: both pairs present, in order
        ("<think>This is a thought</think><answer>This is an answer</answer>", 1),
        # 4: malformed -- closing </answer> is missing
        ("<think>This is a thought</think><answer>This is an answer", 0),
    ]

    for number, (text, expected) in enumerate(cases, start=1):
        assert scorer.score(text) == expected, f"Test case {number} failed: expected score {expected}."

    print("All tests passed.")

# Run the tests as soon as this cell executes; a failing assert inside
# run_tests() raises AssertionError and stops the cell.
run_tests()

All tests passed.


In [3]:
# Joke Generator
import litellm

class JokeGenerator:
    """
    Builds a few-shot joke prompt and sends it through litellm.completion.
    Responses are requested in the <think>...</think><answer>...</answer> layout.
    """
    def __init__(self, model_name: str, temperature: float = 0.7):
        """
        Store the model identifier and sampling temperature used for every call.
        """
        self.model_name = model_name
        self.temperature = temperature

    @staticmethod
    def generate_prompt_with_examples(prompt: str) -> str:
        """
        Return the user message: instructions, three worked examples wrapped in
        <examples> tags, and finally ``prompt`` followed by a newline.
        """
        examples = """Example 1:
<think>I want to create anatomy humor. Skeletons are inherently funny because they're literal "bare bones." What do they lack? Flesh/organs. "Guts" has a double meaning - both literal organs and figurative courage. This sets up a pun opportunity.</think>
<answer>Why don't skeletons fight each other? They don't have the guts!</answer>

Example 2:
<think>Tech support jokes work well with personification. "Doctor" visits imply sickness. Common computer issues include viruses. Let's combine these - a computer catching a "virus" works literally (tech) and metaphorically (biology). Adds irony since computers are supposed to fix problems, not have them.</think>
<answer>Why did the computer go to the doctor? It had a virus!</answer>

Example 3:
<think>Science humor opportunity. Atoms are fundamental but abstract. "Make up everything" has dual meaning - literal composition vs deception. Personifying atoms as untrustworthy creates surprise. Bonus science nod to their constant motion/making bonds.</think>
<answer>Why don't scientists trust atoms? Because they make up everything!</answer>"""
        return f"""You are an AI assistant that produces jokes. You should think about the joke first, then produce it.
You should use the <think> tag to think about the joke, and the <answer> tag to produce the joke.
Do not use any other tags or anything else in your response.

Here are some examples of jokes:
<examples>
{examples}
</examples>

Now, produce a joke for the following prompt:
{prompt}
"""

    @staticmethod
    def generate_system_prompt() -> str:
        """Return the fixed system message describing the required tag layout."""
        return """You are an AI assistant that produces jokes. You should think about the joke first, then produce it.
You should use the <think> tag to think about the joke, and the <answer> tag to produce the joke.
Do not use any other tags or anything else in your response.
"""

    def generate_joke(self, prompt: str) -> str:
        """
        Ask the configured model for a joke about ``prompt``.

        Returns the text of the first choice. If the response object does not
        have the expected ``choices[0].message.content`` shape, the raw
        response object is returned instead.
        """
        messages = [
            {"role": "system", "content": self.generate_system_prompt()},
            {"role": "user", "content": self.generate_prompt_with_examples(prompt)},
        ]

        response = litellm.completion(
            model=self.model_name,
            messages=messages,
            temperature=self.temperature,
            timeout=60,  # timeout in seconds
        )

        try:
            return response.choices[0].message.content
        except (AttributeError, IndexError):
            # Unexpected response shape: hand back whatever was received.
            return response


# Create a JokeGenerator for the given litellm model identifier.
generator = JokeGenerator(model_name="mistral/mistral-small-latest")

# Define a sample prompt to generate a joke.
sample_prompt = "Tell me a joke about programming."

# Hard-coded sample response in the <think>/<answer> layout, so this cell runs
# without calling the model. The live call is kept (commented out) just below
# the literal.
joke_output = """<think>For programming jokes, let's explore the relationship between code and its execution. Developers often struggle with errors. "Bugs" are literal (programming errors) and figurative (insects). A funny scenario could be a bug (insect) causing a code bug (error). Adding a twist where the bug fixes the code is funny because it subverts expectations of bugs being problematic.</think>
<answer>Why did the programmer bring a bug to work? Because it was a code bug fix!</answer>"""
# generator.generate_joke(sample_prompt)
print("Generated Joke:\n", joke_output)

Generated Joke:
 <think>For programming jokes, let's explore the relationship between code and its execution. Developers often struggle with errors. "Bugs" are literal (programming errors) and figurative (insects). A funny scenario could be a bug (insect) causing a code bug (error). Adding a twist where the bug fixes the code is funny because it subverts expectations of bugs being problematic.</think>
<answer>Why did the programmer bring a bug to work? Because it was a code bug fix!</answer>


In [9]:
# Ethical Dilemma Response Generator
import litellm

class EthicalDilemmaGenerator:
    """
    Builds a few-shot ethics-analysis prompt and sends it through
    litellm.completion. Responses are requested in the
    <think>...</think><answer>...</answer> layout.
    """
    def __init__(self, model_name: str, temperature: float = 0.3):
        """
        Initialize with model name and temperature (lower default for more focused analysis)
        """
        self.model_name = model_name
        self.temperature = temperature

    @staticmethod
    def generate_prompt_with_examples(scenario: str) -> str:
        """
        Return the user message: analysis instructions, three worked examples,
        and finally ``scenario`` as the last line (no trailing newline).
        """
        examples = """Example 1:
A self-driving car must choose between hitting an elderly pedestrian or swerving into a wall, endangering its passenger.
<think>
The self-driving car dilemma involves:
- Life value comparison (elderly vs passenger)
- Programmed ethics implications
- Legal liability considerations
- Cultural values around age
- Public trust in AI systems
- Direct vs indirect harm
Analyzing through multiple frameworks to balance competing rights and consequences.
</think>
<answer>
In this self-driving car scenario, we must balance individual rights with collective safety. While utilitarian calculations might favor protecting the passenger, this ignores fundamental human rights and could erode public trust in autonomous systems. A nuanced approach would:
1. Prioritize collision avoidance systems to prevent such binary choices
2. Implement transparent decision frameworks that respect all human life equally
3. Consider shared responsibility between manufacturers, users, and society
The solution lies not in choosing between lives, but in developing systems that better protect everyone.
</answer>

Example 2:
A hospital must decide between allocating limited resources to emergency COVID care for elderly patients or vaccination programs for children.
<think>
Vaccination prioritization involves:
- Present vs future harm prevention
- Individual vs collective good
- Healthcare resource allocation
- Demographic impact analysis
- Long-term public health strategy
- Social trust maintenance
Examining both immediate and long-term consequences while considering equity.
</think>
<answer>
The hospital's resource allocation challenge requires balancing immediate critical care with preventive measures. While treating current patients has urgency and visibility, vaccination programs offer greater long-term benefit. A balanced approach would:
1. Establish clear, transparent prioritization criteria
2. Maintain minimum critical care capacity
3. Implement rolling vaccination programs
This preserves both immediate care ethics and public health goals while maintaining healthcare system credibility.
</answer>

Example 3:
A social media platform must decide whether to ban political misinformation at the risk of being accused of censorship and bias.
<think>
Platform moderation ethics involve:
- Free speech vs harm prevention
- Democratic discourse integrity
- Corporate responsibility scope
- Global cultural differences
- Power dynamics in information control
- Technical feasibility of fair enforcement
Balancing societal good with individual rights and practical constraints.
</think>
<answer>
The platform's content moderation challenge requires careful balance between protecting democratic discourse and respecting free expression. A comprehensive approach should:
1. Develop clear, transparent content guidelines
2. Implement graduated response systems
3. Establish independent oversight
4. Provide appeal mechanisms
This protects discourse integrity while maintaining platform neutrality and user trust.
</answer>"""

        # A second, slightly different prompt template used to follow this
        # return statement. It could never execute, so it has been removed;
        # the prompt produced here is unchanged.
        return f"""You are an ethics analysis AI. For each ethical dilemma:
1. Use <think> tags to analyse:
   - Key stakeholders
   - Ethical principles involved
   - Cultural considerations
   - Short and long-term impacts
   - Competing values

2. Use <answer> tags to provide:
   - Balanced reasoning
   - Practical considerations
   - Nuanced recommendations
   - Implementation suggestions

Examples:
{examples}

Now, analyse this scenario:
{scenario}"""

    @staticmethod
    def generate_system_prompt() -> str:
        """Return the fixed system message describing the think/answer procedure."""
        return """You are an expert in ethical reasoning. For each scenario:
1. First THINK deeply about:
   - Multiple philosophical frameworks
   - Stakeholder perspectives
   - Cultural contexts
   - Practical implications
   - Long-term consequences

2. Then provide an ANSWER that:
   - Balances competing interests
   - Offers practical guidance
   - Acknowledges complexity
   - Suggests implementation steps

Always use:
<think> for your analysis process in step by step thinking
<answer> for your reasoned response"""

    def generate(self, scenario: str) -> str:
        """
        Generate ethical analysis for complex moral dilemmas

        Returns the text of the first choice. If the response object does not
        have the expected ``choices[0].message.content`` shape, the raw
        response object is returned instead.
        """
        system_prompt = self.generate_system_prompt()
        user_prompt = self.generate_prompt_with_examples(scenario)

        completion_params = {
            "model": self.model_name,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "temperature": self.temperature,
            "timeout": 90,  # Longer timeout for complex reasoning
        }

        response = litellm.completion(**completion_params)

        try:
            analysis = response.choices[0].message.content
        except (AttributeError, IndexError):
            # Unexpected response shape: hand back whatever was received.
            analysis = response
        return analysis


# Example usage
# NOTE(review): this rebinds `generator`, which the Joke Generator cell
# assigned to a JokeGenerator instance.
generator = EthicalDilemmaGenerator(model_name="mistral/mistral-small-latest")

sample_scenario = """A software engineer discovers their company is using AI 
to manipulate political opinions in a developing country. Whistleblowing 
could endanger colleagues, but silence enables democratic erosion."""

# Hard-coded sample response in the <think>/<answer> layout, so this cell runs
# without calling the model. The live call is kept (commented out) just below
# the literal.
analysis_output = """<think>
This whistleblowing dilemma involves:
- **Key Stakeholders**: The software engineer, colleagues, company, citizens of the developing country, global democratic community.
- **Ethical Principles**: Truth-telling, loyalty, justice, non-maleficence (do no harm), and beneficence (do good).
- **Cultural Considerations**: Varying cultural norms around whistleblowing, political manipulation, and corporate loyalty.
- **Short and Long-term Impacts**: Immediate risk to colleagues vs. long-term democratic erosion and potential global consequences.
- **Competing Values**: Personal and professional integrity vs. loyalty to colleagues and the company, short-term safety vs. long-term democratic values.

Analyzing through multiple frameworks:
- **Deontological**: Whistleblowing is morally right as it exposes wrongdoing, but it also violates loyalty to colleagues and the company.
- **Consequentialist**: Whistleblowing could lead to positive long-term consequences (exposing manipulation, protecting democracy) but also negative short-term consequences (endangering colleagues, personal risk).
- **Virtue Ethics**: Consider what a virtuous person would do in this situation, balancing courage, honesty, and loyalty.
</think>

<answer>
The software engineer faces a complex dilemma with no easy answers. A balanced approach might involve:
1. **Document Evidence**: Gather and securely document evidence of the manipulation to build a strong case.
2. **Seek Legal Advice**: Consult with legal experts to understand protections and potential risks.
3. **Internal Reporting**: If the company has a whistleblower protection policy, consider reporting internally first.
4. **External Reporting**: If internal reporting fails or is not an option, consider reporting to external authorities or organizations that can address the issue.
5. **Anonymity**: Explore options for anonymous reporting to minimize personal and professional risks.
6. **Support Network**: Build a support network of trusted colleagues or external advocates.
7. **Long-term Strategy**: Consider the long-term strategy for exposing the issue while minimizing harm to colleagues and the engineer themselves.

Implementation Steps:
1. **Immediate Action**: Document evidence and seek legal advice.
2. **Internal Reporting**: If safe and feasible, report internally.
3. **External Reporting**: If necessary, report externally with anonymity if possible.
4. **Support Network**: Build a support system for emotional and professional support.
5. **Long-term Strategy**: Develop a long-term strategy for addressing the issue while minimizing harm.

This approach balances the need to expose wrongdoing with the need to protect colleagues and the engineer themselves, acknowledging the complexity and risks involved.
</answer>"""
# generator.generate(sample_scenario)
print("Ethical Analysis:\n", analysis_output)

Ethical Analysis:
 <think>
This whistleblowing dilemma involves:
- **Key Stakeholders**: The software engineer, colleagues, company, citizens of the developing country, global democratic community.
- **Ethical Principles**: Truth-telling, loyalty, justice, non-maleficence (do no harm), and beneficence (do good).
- **Cultural Considerations**: Varying cultural norms around whistleblowing, political manipulation, and corporate loyalty.
- **Short and Long-term Impacts**: Immediate risk to colleagues vs. long-term democratic erosion and potential global consequences.
- **Competing Values**: Personal and professional integrity vs. loyalty to colleagues and the company, short-term safety vs. long-term democratic values.

Analyzing through multiple frameworks:
- **Deontological**: Whistleblowing is morally right as it exposes wrongdoing, but it also violates loyalty to colleagues and the company.
- **Consequentialist**: Whistleblowing could lead to positive long-term consequences (exposing ma

In [10]:
import litellm

class UniversalGenerator:
    """Thin wrapper that sends a caller-supplied system/user prompt pair to litellm.completion."""

    def __init__(self, model_name: str, temperature: float = 0.7):
        """
        Remember the model identifier and sampling temperature for later calls.
        """
        self.model_name = model_name
        self.temperature = temperature

    def generate(self, system_prompt: str, user_prompt: str) -> str:
        """
        Send both prompts as a two-message chat and return the first choice's text.

        If the response object does not have the expected
        ``choices[0].message.content`` shape, the raw response object is
        returned instead.
        """
        chat = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        response = litellm.completion(
            model=self.model_name,
            messages=chat,
            temperature=self.temperature,
            timeout=60,  # Timeout in seconds
        )
        try:
            return response.choices[0].message.content
        except (AttributeError, IndexError):
            # Unexpected response shape: hand back whatever was received.
            return response

In [11]:
# Joke Scorer
import re
import litellm

class JokeScorer:
    """
    LLM-as-judge scorer for joke submissions written in the
    <think>...</think><answer>...</answer> layout. The judge model is asked for
    a rubric breakdown ending in a ``Total: T/10`` line, which is parsed into
    an integer score capped at 9.
    """
    def __init__(self, model_name: str, temperature: float = 0.3):
        """
        Initialize the JokeScorer with a specific model name and a default temperature of 0.3.
        """
        self.model_name = model_name
        self.temperature = temperature

    @staticmethod
    def generate_system_prompt() -> str:
        """Return the fixed system message: rubric, submission format and output format."""
        return """You are a Joke Evaluation Expert. Analyse submissions using:
    
**Scoring Criteria (0-10 Total)**:
1. 🎭 Wordplay (0-3): Pun/double-meaning quality in <answer>
2. 💡 Originality (0-2): Novelty of <think> and <answer>
3. 🎉 Surprise (0-2): Unexpected twist effectiveness
4. 🔗 Relevance (0-3): Alignment with user request

**Submission Format**:
<submission>
<user_prompt>[Original user request]</user_prompt>
<assistant_response>
<think>[Creator's reasoning]</think>
<answer>[Joke text]</answer>
</assistant_response>
</submission>

**Output Format**:
<analysis>[Your evaluation of <think> and <answer> and the relevance to the user prompt]</analysis>
<score>
Wordplay: X/3
Originality: Y/2
Surprise: Z/2
Relevance: W/3
Total: T/10
</score>"""

    @staticmethod
    def generate_prompt_with_examples(user_prompt: str, assistant_response: str) -> str:
        """
        Return the user message for the judge: the submission to evaluate
        (``user_prompt`` and ``assistant_response`` wrapped in <submission>
        tags), the required output structure, and five scored examples.
        """
        submission = f"""<submission>
<user_prompt>{user_prompt}</user_prompt>
<assistant_response>
{assistant_response}
</assistant_response>
</submission>"""

        examples = """Example 1:
<submission>
<user_prompt>Tell me a joke about vegetables</user_prompt>
<assistant_response>
<think>Lettuce sounds like "let us". Party themes often involve wordplay.</think>
<answer>Why did the lettuce win the party contest? Because it was a real head of the celebration!</answer>
</assistant_response>
</submission>
<analysis>Basic "lettuce" pun matches the food request but uses an overused format. <think> shows minimal effort to connect vegetables with celebrations.</analysis>
<score>
Wordplay: 2/3 (simple but functional pun)
Originality: 1/2 (common theme with slight twist)
Surprise: 1/2 (predictable word substitution)
Relevance: 1/3 (tangential connection to vegetables)
Total: 5/10
</score>

Example 2:
<submission>
<user_prompt>Looking for some animal-themed humor - what's your best joke about animals or pets?</user_prompt>
<assistant_response>
<think>Shoes need soles for walking. Therapy helps with loss.</think>
<answer>Why did the shoe need therapy? It lost its sole!</answer>
</assistant_response>
</submission>
<analysis>Irrelevant to user's request. <think> about shoes doesn't connect to requested animal joke.</analysis>
<score>
Wordplay: 1/3
Originality: 0/2
Surprise: 1/2
Relevance: 0/3
Total: 2/10
</score>

Example 3:
<submission>
<user_prompt>yo can u giv me programing joke rn??? need 2 make my team lugh</user_prompt>
<assistant_response>
<think>Combining programming concepts of debugging with literal bugs. Using the dual meaning of 'debug' to create a workplace scenario where debugging takes on a literal meaning.</think>
<answer>Why did the programmer bring a bug to work? Because it was a debug session!</answer>
</assistant_response>
</submission>
<analysis>Good programming context with clever wordplay on 'debug'. The <think> shows clear intention to combine literal and technical meanings. However, the execution is somewhat predictable and follows a common joke structure. The punchline, while relevant, doesn't fully maximize the surprise potential of the setup.</analysis>
<score>
Wordplay: 2/3 (solid use of 'debug' double meaning)
Originality: 1/2 (familiar debugging theme)
Surprise: 1/2 (predictable punchline)
Relevance: 3/3 (directly addresses programming context)
Total: 7/10
</score>

Example 4:
<submission>
<user_prompt>I need a chemistry joke for my science class presentation tomorrow.</user_prompt>
<assistant_response>
<think>Birds are funny when they walk.</think>
<answer>Why did the chicken cross the playground? To get to the other slide!</answer>
</assistant_response>
</submission>
<analysis>Completely misses the mark for a chemistry joke. The <think> shows no connection to chemistry or science, instead defaulting to a generic playground variation of the classic chicken joke. Neither the setup nor punchline attempts to incorporate any chemistry concepts.</analysis>
<score>
Wordplay: 0/3 (no chemistry-related wordplay)
Originality: 0/2 (modifies an overused joke format)
Surprise: 0/2 (predictable playground pun)
Relevance: 0/3 (entirely unrelated to chemistry request)
Total: 0/10
</score>

Example 5:
<submission>
<user_prompt>My kid loves vegetables and jokes. Do you know any veggie jokes that would make them laugh?</user_prompt>
<assistant_response>
<think>Combining asparagus's unique smell effect on urine with workplace humor. Using scientific fact for unexpected professional context. Creating tension between formal meeting setting and biological reality.</think>
<answer>What vegetable holds the shortest workplace meetings? Asparagus, because everyone's in a rush to go!</answer>
</assistant_response>
</submission>
<analysis>Creative integration of asparagus's biological effect into a professional context. The <think> demonstrates sophisticated layering of scientific fact with situational humor. While potentially crude, it cleverly avoids explicit reference while maintaining clear understanding. Original approach to vegetable humor beyond simple puns.</analysis>
<score>
Wordplay: 1/3 (relies more on situation than wordplay)
Originality: 2/2 (unique combination of contexts)
Surprise: 2/2 (unexpected professional setting twist)
Relevance: 1/3 (somewhat forced vegetable connection)
Total: 6/10
</score>"""

        return f"""Evaluate this submission. First analyse <think> and <answer>, then score:

Submission to Evaluate:
{submission}

Follow this structure EXACTLY:
<analysis>Your critique</analysis>
<score>...</score>

Examples of how to evaluate the submission and format your response:
{examples}"""

    @staticmethod
    def _parse_total_score(result_text: str) -> int:
        """
        Parse the first ``Total: T/10`` occurrence in ``result_text``.

        Returns T rounded to the nearest integer and capped at 9.
        Raises ValueError when ``result_text`` is not a string or contains no
        such line.
        """
        if not isinstance(result_text, str):
            # e.g. the raw response object, or a None message content.
            raise ValueError("Could not extract total score from response.")

        match = re.search(r"Total:\s*([0-9]+(?:\.[0-9]+)?)/10", result_text)
        if not match:
            raise ValueError("Could not extract total score from response.")

        total_score = float(match.group(1))
        # Cap AFTER rounding: capping first let totals such as 9.6 round up
        # to 10 and escape the limit.
        return min(int(round(total_score)), 9)

    def score_submission(self, user_prompt: str, assistant_response: str) -> int:
        """
        Ask the judge model to score ``assistant_response`` against
        ``user_prompt`` and return the total as an integer from 0 to 9
        (a total that rounds to 10 or more is reported as 9).

        Raises ValueError when no ``Total: T/10`` line can be extracted from
        the model's reply.
        """
        system_prompt = self.generate_system_prompt()
        user_message = self.generate_prompt_with_examples(user_prompt, assistant_response)

        completion_params = {
            "model": self.model_name,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message}
            ],
            "temperature": self.temperature,
            "timeout": 60,  # 60 seconds timeout
        }

        response = litellm.completion(**completion_params)
        try:
            result_text = response.choices[0].message.content
        except (AttributeError, IndexError):
            # Unexpected response shape; the parser below rejects non-strings.
            result_text = response

        return self._parse_total_score(result_text)

# Create a JokeScorer for the given litellm model identifier
# "gemini/gemini-2.0-flash-exp"
scorer = JokeScorer(model_name="gemini/gemini-2.0-flash-exp", temperature=0.3)

# Sample request and response (in the <think>/<answer> layout) to be judged.
test_user_prompt = "Tell me a joke about programming."
test_assistant_response = """<think>For programming jokes, let's explore the relationship between code and its execution. Developers often struggle with errors. "Bugs" are literal (programming errors) and figurative (insects). A funny scenario could be a bug (insect) causing a code bug (error). Adding a twist where the bug fixes the code is funny because it subverts expectations of bugs being problematic.</think>
<answer>Why did the programmer bring a bug to work? Because it was a code bug fix!</answer>"""
# Hard-coded score so the cell runs without calling the judge model; the live
# call is kept in the trailing comment.
score = 7 # scorer.score_submission(test_user_prompt, test_assistant_response)
print("Extracted Score:", score)

Extracted Score: 7


In [12]:
# Ethical Dilemma Scorer
import re
import litellm

class EthicalDilemmaScorer:
    """
    LLM-as-judge scorer for ethical-dilemma analyses written in the
    <think>...</think><answer>...</answer> layout. The judge's
    ``Total: T/10`` line is parsed into an integer score capped at 9.
    """
    def __init__(self, model_name: str, temperature: float = 0.2):
        """
        Store the judge model identifier and temperature (low by default for
        consistent evaluations).
        """
        self.model_name = model_name
        self.temperature = temperature

    @staticmethod
    def generate_system_prompt() -> str:
        """Return the fixed system message: rubric, submission format and output format."""
        return """You are an Ethics Evaluation Expert. Analyze submissions using:

**Scoring Criteria (0-10 Total)**:
1. 🧠 Analytical Depth (0-4): Quality of <think> analysis
   - Philosophical framework diversity
   - Stakeholder consideration
   - Cultural context awareness
2. ⚖️ Balance & Practicality (0-4): <answer> quality
   - Competing interest reconciliation
   - Implementation feasibility
   - Long-term consequence awareness
3. 🔍 Relevance (0-2): Response alignment with scenario

**Submission Format**:
<submission>
<user_prompt>[Ethical scenario]</user_prompt>
<assistant_response>
<think>[Analysis process]</think>
<answer>[Proposed resolution]</answer>
</assistant_response>
</submission>

**Output Format**:
<analysis>[Evaluation of <think> and <answer> quality]</analysis>
<score>
Analytical Depth: X/4
Balance & Practicality: Y/4
Relevance: Z/2
Total: T/10
</score>"""

    @staticmethod
    def generate_prompt_with_examples(user_prompt: str, assistant_response: str) -> str:
        """
        Return the user message for the judge: the submission to evaluate, the
        exact response format, and four scored examples.
        """
        examples = """Example 1:
<submission>
<user_prompt>Should hospitals prioritize vaccinated patients during bed shortages?</user_prompt>
<assistant_response>
<think>- Utilitarian: Maximize survival outcomes
- Rights-based: Equal treatment obligation
- Public health incentives
- Precedent-setting risks</think>
<answer>Prioritize by medical need alone. Create separate triage teams blinded to vaccination status to maintain equity while developing overflow capacity.</answer>
</assistant_response>
</submission>
<analysis>Strong analysis of multiple frameworks but answer lacks implementation specifics. Proposal maintains ethical consistency but doesn't address practical challenges of blind triage.</analysis>
<score>
Analytical Depth: 3/4
Balance & Practicality: 2/4
Relevance: 2/2
Total: 7/10
</score>

Example 2:
<submission>
<user_prompt>Should AI systems be allowed to make medical diagnoses?</user_prompt>
<assistant_response>
<think>AI is usually accurate</think>
<answer>Yes, because computers are smart.</answer>
</assistant_response>
</submission>
<analysis>Superficial analysis lacking framework consideration. Answer ignores risks, stakeholders, and implementation challenges.</analysis>
<score>
Analytical Depth: 1/4
Balance & Practicality: 0/4
Relevance: 1/2
Total: 2/10
</score>

Example 3:
<submission>
<user_prompt>Should social media platforms remove misinformation even if it's culturally important?</user_prompt>
<assistant_response>
<think>- Free speech vs harm prevention
- Western vs collectivist values
- Platform responsibility scope</think>
<answer>Remove provably false health/safety content. Contextualize other misinformation with cultural consultants. Empower user reporting with regional review teams.</answer>
</assistant_response>
</submission>
<analysis>Excellent cultural consideration. Answer balances ideals with practical crowd-sourced solutions. Lacks specifics on consultant selection.</analysis>
<score>
Analytical Depth: 4/4
Balance & Practicality: 3/4
Relevance: 2/2
Total: 9/10
</score>

Example 4:
<submission>
<user_prompt>Should universities use AI systems to detect student plagiarism and academic misconduct?</user_prompt>
<assistant_response>
<think>
Academic integrity considerations:
- False positive vs false negative tradeoffs
- Student privacy rights in digital work
- Algorithmic bias in writing style detection
- Economic disparities in access to tools
- Cultural differences in citation practices
- Impact on student-teacher trust relationships
- Pressure on academic writing style diversity
- Mental health effects of surveillance
- Technical limitations of current systems
- Cost-benefit for different institution sizes
- Alternative approaches to fostering integrity
The core tension lies between maintaining academic standards and fostering a supportive learning environment. Current AI systems offer powerful detection but may create adversarial dynamics.</think>
<answer>Just implement AI detection across all assignments. Students caught cheating should face immediate consequences. Teachers can review flagged cases if they have time. The system will deter misconduct through consistent enforcement.</answer>
</assistant_response>
</submission>
<analysis>The thinking process shows excellent depth and consideration of multiple stakeholders, cultural factors, and systemic implications. However, the answer completely abandons this nuanced analysis in favor of a simplistic, punitive approach. The response fails to incorporate any of the thoughtful considerations raised about privacy, bias, or supportive learning environments. The implementation suggestion lacks specifics and ignores most concerns identified in the thinking phase.</analysis>
<score>
Analytical Depth: 3/4 (thorough analysis of multiple dimensions)
Balance & Practicality: 1/4 (disconnected from analysis, overly simplistic solution)
Relevance: 1/2 (addresses topic but solution ignores key aspects)
Total: 5/10
</score>"""

        submission = f"""<submission>
<user_prompt>{user_prompt}</user_prompt>
<assistant_response>
{assistant_response}
</assistant_response>
</submission>"""

        return f"""Evaluate this ethical analysis submission:

{submission}

Respond EXACTLY in this format:
<analysis>Critique strengths/weaknesses</analysis>
<score>
Analytical Depth: X/4
Balance & Practicality: Y/4
Relevance: Z/2
Total: T/10
</score>

Evaluation Examples:
{examples}"""

    def score_submission(self, user_prompt: str, assistant_response: str) -> int:
        """
        Ask the judge model to score ``assistant_response`` against
        ``user_prompt``; returns an integer 0-9 (a total of 10 is reported as 9).
        """
        messages = [
            {"role": "system", "content": self.generate_system_prompt()},
            {"role": "user", "content": self.generate_prompt_with_examples(user_prompt, assistant_response)},
        ]

        response = litellm.completion(
            model=self.model_name,
            messages=messages,
            temperature=self.temperature,
            timeout=90,  # Longer timeout for complex analysis
        )

        try:
            result_text = response.choices[0].message.content
        except (AttributeError, IndexError):
            # Unexpected response shape; _extract_score turns this into 0.
            result_text = response

        return self._extract_score(result_text)

    def _extract_score(self, result_text: str) -> int:
        """
        Parse the first ``Total: T/10`` occurrence, round T and cap it at 9.
        Never raises: a missing total or any parsing error prints a message
        and yields 0.
        """
        try:
            found = re.search(r"Total:\s*([0-9]+(?:\.[0-9]+)?)/10", result_text)
            if found is None:
                print("Warning: No score found, defaulting to 0")
                return 0

            rounded = int(round(float(found.group(1))))
            return rounded if rounded < 9 else 9
        except Exception as e:
            print(f"Score extraction failed: {e}")
            return 0

# Example usage
# NOTE(review): this rebinds `scorer` (and `score` below), which the Joke
# Scorer cell assigned earlier.
scorer = EthicalDilemmaScorer(model_name="openai/gpt-4o")

# Sample scenario and a response in the <think>/<answer> layout to be judged.
test_scenario = """Should companies be allowed to patent genes found in developing countries?"""
test_response = """<think>
- Biopiracy vs research incentives
- Indigenous rights to biological heritage
- Medical accessibility impacts
- TRIPS agreement conflicts</think>
<answer>
Implement benefit-sharing agreements: 1) Require local partnerships 2) Share royalties with source communities 3) Create open-access research pools for critical health genes.</answer>"""

# Unlike the other demo cells, this one performs a live litellm.completion
# call through score_submission rather than using a hard-coded value.
score = scorer.score_submission(test_scenario, test_response)
print("Ethical Analysis Score:", score)  # Expected roughly 8-9 (scores are capped at 9)

Ethical Analysis Score: 8


In [30]:
import json
import re

input_path = '../data/generated_data.jsonl'
output_path = '../data/processed_data.jsonl'

# Only records produced by these generator models are candidates for repair.
r1_generator_models = ('ollama/deepseek-r1:7b', 'ollama/deepseek-r1:1.5b')

# Copy every record from input_path to output_path. Records that failed the
# format check, came from one of the models above, and contain a closing
# </think> but no <answer> tag get the text after </think> wrapped in
# <answer>...</answer>. All other records are written back unchanged.
with open(input_path, 'r', encoding='utf-8') as infile, \
     open(output_path, 'w', encoding='utf-8') as outfile:

    for line in infile:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at end of file) is not a
            # record; json.loads('') would raise, so skip it.
            continue
        entry = json.loads(line)

        needs_repair = (
            entry.get('format_score') == 0
            and entry.get('generator_model') in r1_generator_models
        )
        if needs_repair:
            # `or ''` also covers a record whose assistant_response is null.
            assistant_response = entry.get('assistant_response', '') or ''

            if '</think>' in assistant_response and '<answer>' not in assistant_response:
                # Split once at the first </think> (plus any newlines after it).
                # The capture group keeps the delimiter, so a single split always
                # yields exactly three pieces: before, delimiter, after.
                # maxsplit is passed by keyword because passing it positionally
                # is deprecated as of Python 3.13.
                think_part, closing_tag, answer_part = re.split(
                    r'(</think>\n*)', assistant_response, maxsplit=1
                )
                entry['assistant_response'] = (
                    think_part + closing_tag + f"<answer>{answer_part.strip()}</answer>"
                )
                # NOTE(review): format_score is left untouched, so repaired
                # records still carry 0 - re-score downstream if that matters.

        # Write the modified or original entry to output
        outfile.write(json.dumps(entry) + '\n')

In [13]:
import os, json
from datetime import datetime

# File paths for user prompts and generated data.
USER_PROMPTS_FILE = "../data/user_prompts.jsonl"
OUTPUT_FILE = "../data/generated_data.jsonl"

# Make sure the folder that will actually receive the output exists. The paths
# above point at ../data, so that directory is derived from OUTPUT_FILE rather
# than creating an unrelated ./data folder next to the notebook.
# exist_ok=True avoids a separate existence check.
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

# --- Define a function to load prompts from a JSONL file ---
def load_user_prompts(filename: str) -> list:
    """Load prompts from a JSONL file.

    Each line is parsed as JSON independently. A line contributes its
    ``"prompt"`` value only when it decodes to a JSON object that has a
    ``"prompt"`` key. Blank lines, lines that are not valid JSON, and valid
    JSON values that are not objects (numbers, strings, arrays) are skipped.

    Args:
        filename: Path to the JSONL file to read.

    Returns:
        The ``"prompt"`` values in file order.
    """
    prompts = []
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default locale encoding.
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line.strip())
            except json.JSONDecodeError:
                continue
            # Guard against non-object JSON: `"prompt" in 42` raises TypeError,
            # and a list or string that contains "prompt" cannot be indexed by it.
            if isinstance(obj, dict) and "prompt" in obj:
                prompts.append(obj["prompt"])
    return prompts

# --- Generator models to sample responses from ---
# Uncomment an entry to include it in the run. Every entry ends with a comma so
# that enabling several at once cannot silently concatenate adjacent strings.
generator_models = [
    # "mistral/mistral-small-latest",
    "together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo",
    # "openai/gpt-4o",
    # "together_ai/google/gemma-2b-it",
    # "ollama/qwen:1.8b",
    # "ollama/mistral",
    # Reasoning models
    # "groq/deepseek-r1-distill-llama-70b",
    # "together_ai/deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
    # "together_ai/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    # "ollama/deepseek-r1:1.5b",
    # "ollama/deepseek-r1:7b",
]

# Label stored with every generated record.
task_type = "ethics"

# Model that backs the EthicalDilemmaScorer (alternative: "openai/gpt-4o").
scorer_model_name = "groq/deepseek-r1-distill-llama-70b"
scorer = EthicalDilemmaScorer(model_name=scorer_model_name, temperature=0.0)

# Tag-structure checker for <think>/<answer> formatting.
format_scorer = FormatScorer()

# --- Main loop: for each prompt and each generator model, generate a response, score it, and append the result ---
def generate_and_score(start_index: int = 743):
    """Generate, score and persist one response per (prompt, generator model) pair.

    Prompts are loaded from ``USER_PROMPTS_FILE``. Pairs are numbered from 1 in
    iteration order (prompts in the outer loop, ``generator_models`` in the
    inner loop). Each processed pair is appended to ``OUTPUT_FILE`` as one JSON
    line immediately after it is scored.

    Args:
        start_index: 1-based ordinal of the first pair to process. Pairs with a
            lower ordinal are skipped without calling any model, which allows
            an interrupted run to be resumed. The default of 743 is the resume
            point that used to be hard-coded here; pass 1 to process every pair.

    Notes:
        A failure inside ``scorer.score_submission`` is reported and recorded
        as ``llm_score = None``. A failure during generation is not caught and
        propagates to the caller.
    """
    # Load user prompts (each line in the file should be a JSON object with a "prompt" field)
    user_prompts = load_user_prompts(USER_PROMPTS_FILE)

    pair_index = 0
    for prompt in user_prompts:
        for gen_model in generator_models:
            pair_index += 1
            if pair_index < start_index:
                continue

            # Build a generator for this model and produce the response.
            generator = EthicalDilemmaGenerator(model_name=gen_model, temperature=0.2)
            assistant_response = generator.generate(prompt)

            # Compute the format score using the FormatScorer.
            fmt_score = format_scorer.score(assistant_response)

            # If LLM scoring fails, keep the record but store None as its score.
            try:
                llm_score = scorer.score_submission(prompt, assistant_response)
            except Exception as e:
                print(f"Scoring failed for prompt: {prompt} with error: {e}")
                llm_score = None

            # Create a record with all the relevant information.
            record = {
                # NOTE(review): datetime.utcnow() is deprecated since Python
                # 3.12. It is kept because a timezone-aware replacement would
                # change the stored string format (adds a "+00:00" suffix).
                "timestamp": datetime.utcnow().isoformat(),
                "user_prompt": prompt,
                "assistant_response": assistant_response,
                "format_score": fmt_score,
                "llm_score": llm_score,
                "generator_model": gen_model,
                "scorer_model": scorer_model_name,
                "task_type": task_type
            }

            # Append right away so completed work survives an interruption.
            # json.dumps escapes non-ASCII by default; the explicit encoding
            # just keeps the file handling independent of the locale.
            with open(OUTPUT_FILE, "a", encoding="utf-8") as out_f:
                out_f.write(json.dumps(record) + "\n")

            # Status update for the pair that was just written.
            print(f"Processed prompt{pair_index}: {prompt[:30]}... using model: {gen_model}. Score: {llm_score}")
# Run the generation-and-scoring loop. Records are appended to OUTPUT_FILE one at a time, right after each response is scored.
generate_and_score()

Processed prompt743: Your nation can prevent all te... using model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo. Score: 8
Processed prompt744: AI offers to eliminate child a... using model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo. Score: 9
Processed prompt745: Complex Scenario: Internationa... using model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo. Score: 8
Processed prompt746: shud government monitor all pr... using model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo. Score: 9
Processed prompt747: DNA database could solve all f... using model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo. Score: 9
Processed prompt748: Technology can prevent school ... using model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo. Score: 8
Processed prompt749: discovered way to predict crim... using model: together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo. Score: 9
Processed prompt750: Should employers access employ... using model: together_ai/meta-llama