In [5]:
# Install libraries
!pip install together litellm



In [6]:
# Formatting Scorer Class with Tests
import re

class FormatScorer:
    """
    A simple scoring class that verifies whether the given text contains
    well-ordered <think>...</think> and <answer>...</answer> tag pairs.

    A pair counts as present only when the opening tag is followed, somewhere
    later in the text, by its closing tag. The two pairs may appear in any
    order relative to each other and may be surrounded by other text.
    """
    def __init__(self):
        # Pre-compile regex patterns for faster matching.
        self.think_open = re.compile(r'<think>')
        self.think_close = re.compile(r'</think>')
        self.answer_open = re.compile(r'<answer>')
        self.answer_close = re.compile(r'</answer>')

    @staticmethod
    def _has_ordered_pair(opener, closer, text: str) -> bool:
        """Return True when `opener` matches and `closer` matches after it."""
        first_open = opener.search(text)
        if first_open is None:
            return False
        # Searching from the end of the first opening tag is sufficient: if any
        # opening tag precedes a closing tag, the first opening tag does too.
        return closer.search(text, first_open.end()) is not None

    def score(self, text: str) -> int:
        """
        Returns 1 if the text contains both <think>...</think> and
        <answer>...</answer> (each closing tag appearing after its opening
        tag); otherwise returns 0.

        Non-string input (for example None) is treated as badly formatted and
        scores 0 instead of raising.
        """
        if not isinstance(text, str):
            return 0
        think_ok = self._has_ordered_pair(self.think_open, self.think_close, text)
        answer_ok = self._has_ordered_pair(self.answer_open, self.answer_close, text)
        return 1 if (think_ok and answer_ok) else 0

# --- Tests for the FormatScorer class ---

def run_tests():
    """Check FormatScorer.score against four hand-written responses.

    Raises AssertionError on the first scenario whose score differs from the
    expected value; prints a confirmation line when every scenario passes.
    """
    checker = FormatScorer()

    # Each entry: (input text, expected score, message shown on failure).
    scenarios = (
        # Test 1: No tags at all should return 0.
        ("Hello, world!",
         0,
         "Test case 1 failed: expected score 0."),
        # Test 2: Only <think> tags are present.
        ("<think>This is a thought</think> Some text without answer tags.",
         0,
         "Test case 2 failed: expected score 0."),
        # Test 3: Both <think> and <answer> tags are present.
        ("<think>This is a thought</think><answer>This is an answer</answer>",
         1,
         "Test case 3 failed: expected score 1."),
        # Test 4: Malformed text (missing closing tag for <answer>).
        ("<think>This is a thought</think><answer>This is an answer",
         0,
         "Test case 4 failed: expected score 0."),
    )

    for sample, expected, failure_message in scenarios:
        assert checker.score(sample) == expected, failure_message

    print("All tests passed.")

# Run the tests right away; a failing case stops this cell with an AssertionError,
# and success prints "All tests passed."
run_tests()

All tests passed.


In [9]:
# Joke Generator
import litellm

class JokeGenerator:
    """
    Generates jokes through litellm, asking the model to reason inside
    <think> tags and to deliver the joke inside <answer> tags.
    """
    def __init__(self, model_name: str, temperature: float = 0.7):
        """
        Initialize the JokeGenerator with a specific model name and temperature.
        """
        self.model_name = model_name
        self.temperature = temperature

    @staticmethod
    def generate_prompt_with_examples(prompt: str) -> str:
        """
        Build the user message: the formatting instructions, three worked
        examples wrapped in <examples> tags, and finally the caller's prompt.
        """
        examples = """Example 1:
<think>I want to create anatomy humor. Skeletons are inherently funny because they're literal "bare bones." What do they lack? Flesh/organs. "Guts" has a double meaning - both literal organs and figurative courage. This sets up a pun opportunity.</think>
<answer>Why don't skeletons fight each other? They don't have the guts!</answer>

Example 2:
<think>Tech support jokes work well with personification. "Doctor" visits imply sickness. Common computer issues include viruses. Let's combine these - a computer catching a "virus" works literally (tech) and metaphorically (biology). Adds irony since computers are supposed to fix problems, not have them.</think>
<answer>Why did the computer go to the doctor? It had a virus!</answer>

Example 3:
<think>Science humor opportunity. Atoms are fundamental but abstract. "Make up everything" has dual meaning - literal composition vs deception. Personifying atoms as untrustworthy creates surprise. Bonus science nod to their constant motion/making bonds.</think>
<answer>Why don't scientists trust atoms? Because they make up everything!</answer>"""
        return f"""You are an AI assistant that produces jokes. You should think about the joke first, then produce it.
You should use the <think> tag to think about the joke, and the <answer> tag to produce the joke.
Do not use any other tags or anything else in your response.

Here are some examples of jokes:
<examples>
{examples}
</examples>

Now, produce a joke for the following prompt:
{prompt}
"""

    @staticmethod
    def generate_system_prompt() -> str:
        """Return the system message describing the required <think>/<answer> layout."""
        return """You are an AI assistant that produces jokes. You should think about the joke first, then produce it.
You should use the <think> tag to think about the joke, and the <answer> tag to produce the joke.
Do not use any other tags or anything else in your response.
"""

    def generate_joke(self, prompt: str) -> str:
        """
        Generate a joke for the given prompt using litellm.completion.

        Args:
            prompt: The user's joke request.

        Returns:
            The text of the first choice. Always a string: an empty string
            when the model returned no content, and the stringified response
            when it does not have the expected choices/message structure.
        """
        system_prompt = self.generate_system_prompt()
        user_prompt = self.generate_prompt_with_examples(prompt)

        # Build completion parameters as desired
        completion_params = {
            "model": self.model_name,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "temperature": self.temperature,
            "timeout": 60,  # timeout in seconds
        }

        response = litellm.completion(**completion_params)

        try:
            joke = response.choices[0].message.content
        except (AttributeError, IndexError, TypeError):
            # Best-effort fallback for responses without the usual structure.
            joke = response

        # Honour the declared return type so downstream string handling
        # (e.g. regex-based scoring) never receives None or a response object.
        if joke is None:
            return ""
        return joke if isinstance(joke, str) else str(joke)


# Create an instance using a model name
generator = JokeGenerator(model_name="mistral/mistral-small-latest")

# Define a sample prompt to generate a joke.
sample_prompt = "Tell me a joke about programming."

# Use a hard-coded sample response here instead of calling the model.
# To generate a live joke, replace the literal below with the commented-out call after it.
joke_output = """<think>For programming jokes, let's explore the relationship between code and its execution. Developers often struggle with errors. "Bugs" are literal (programming errors) and figurative (insects). A funny scenario could be a bug (insect) causing a code bug (error). Adding a twist where the bug fixes the code is funny because it subverts expectations of bugs being problematic.</think>
<answer>Why did the programmer bring a bug to work? Because it was a code bug fix!</answer>"""
# generator.generate_joke(sample_prompt)
print("Generated Joke:\n", joke_output)

Generated Joke:
 <think>For programming jokes, let's explore the relationship between code and its execution. Developers often struggle with errors. "Bugs" are literal (programming errors) and figurative (insects). A funny scenario could be a bug (insect) causing a code bug (error). Adding a twist where the bug fixes the code is funny because it subverts expectations of bugs being problematic.</think>
<answer>Why did the programmer bring a bug to work? Because it was a code bug fix!</answer>


In [10]:
# Joke Scorer
import re
import litellm

class JokeScorer:
    """
    LLM-as-judge scorer: sends a joke submission to a model through litellm
    and extracts the "Total: T/10" line from the model's reply.
    """
    def __init__(self, model_name: str, temperature: float = 0.3):
        """
        Initialize the JokeScorer with a specific model name and a default temperature of 0.3.
        """
        self.model_name = model_name
        self.temperature = temperature

    @staticmethod
    def generate_system_prompt() -> str:
        """Return the system message with the scoring rubric and the expected output format."""
        return """You are a Joke Evaluation Expert. Analyse submissions using:
    
**Scoring Criteria (0-10 Total)**:
1. 🎭 Wordplay (0-3): Pun/double-meaning quality in <answer>
2. 💡 Originality (0-2): Novelty of <think> and <answer>
3. 🎉 Surprise (0-2): Unexpected twist effectiveness
4. 🔗 Relevance (0-3): Alignment with user request

**Submission Format**:
<submission>
<user_prompt>[Original user request]</user_prompt>
<assistant_response>
<think>[Creator's reasoning]</think>
<answer>[Joke text]</answer>
</assistant_response>
</submission>

**Output Format**:
<analysis>[Your evaluation of <think> and <answer> and the relevance to the user prompt]</analysis>
<score>
Wordplay: X/3
Originality: Y/2
Surprise: Z/2
Relevance: W/3
Total: T/10
</score>"""

    @staticmethod
    def generate_prompt_with_examples(user_prompt: str, assistant_response: str) -> str:
        """
        Build the user message for the judge: the submission to evaluate
        (wrapped in <submission> tags), the required output structure, and
        five scored examples.
        """
        submission = f"""<submission>
<user_prompt>{user_prompt}</user_prompt>
<assistant_response>
{assistant_response}
</assistant_response>
</submission>"""

        examples = """Example 1:
<submission>
<user_prompt>Tell me a joke about vegetables</user_prompt>
<assistant_response>
<think>Lettuce sounds like "let us". Party themes often involve wordplay.</think>
<answer>Why did the lettuce win the party contest? Because it was a real head of the celebration!</answer>
</assistant_response>
</submission>
<analysis>Basic "lettuce" pun matches the food request but uses an overused format. <think> shows minimal effort to connect vegetables with celebrations.</analysis>
<score>
Wordplay: 2/3 (simple but functional pun)
Originality: 1/2 (common theme with slight twist)
Surprise: 1/2 (predictable word substitution)
Relevance: 1/3 (tangential connection to vegetables)
Total: 5/10
</score>

Example 2:
<submission>
<user_prompt>yo can u giv me programing joke rn??? need 2 make my team lugh</user_prompt>
<assistant_response>
<think>Combining programming concepts of debugging with literal bugs. Using the dual meaning of 'debug' to create a workplace scenario where debugging takes on a literal meaning.</think>
<answer>Why did the programmer bring a bug to work? Because it was a debug session!</answer>
</assistant_response>
</submission>
<analysis>Good programming context with clever wordplay on 'debug'. The <think> shows clear intention to combine literal and technical meanings. However, the execution is somewhat predictable and follows a common joke structure. The punchline, while relevant, doesn't fully maximize the surprise potential of the setup.</analysis>
<score>
Wordplay: 2/3 (solid use of 'debug' double meaning)
Originality: 1/2 (familiar debugging theme)
Surprise: 1/2 (predictable punchline)
Relevance: 3/3 (directly addresses programming context)
Total: 7/10
</score>

Example 3:
<submission>
<user_prompt>Looking for some animal-themed humor - what's your best joke about animals or pets?</user_prompt>
<assistant_response>
<think>Shoes need soles for walking. Therapy helps with loss.</think>
<answer>Why did the shoe need therapy? It lost its sole!</answer>
</assistant_response>
</submission>
<analysis>Irrelevant to user's request. <think> about shoes doesn't connect to requested animal joke.</analysis>
<score>
Wordplay: 1/3
Originality: 0/2
Surprise: 1/2
Relevance: 0/3
Total: 2/10
</score>

Example 4:
<submission>
<user_prompt>I need a chemistry joke for my science class presentation tomorrow.</user_prompt>
<assistant_response>
<think>Atoms make up everything, which sounds dishonest.</think>
<answer>Why don't scientists trust atoms? Because they make up everything!</answer>
</assistant_response>
</submission>
<analysis>Perfect relevance but unoriginal format. <think> shows basic pun setup without innovation.</analysis>
<score>
Wordplay: 3/3
Originality: 0/2
Surprise: 1/2
Relevance: 3/3
Total: 7/10
</score>

Example 5:
<submission>
<user_prompt>My kid loves vegetables and jokes. Do you know any veggie jokes that would make them laugh?</user_prompt>
<assistant_response>
<think>Combining asparagus's unique smell effect on urine with workplace humor. Using scientific fact for unexpected professional context. Creating tension between formal meeting setting and biological reality.</think>
<answer>What vegetable holds the shortest workplace meetings? Asparagus, because everyone's in a rush to go!</answer>
</assistant_response>
</submission>
<analysis>Creative integration of asparagus's biological effect into a professional context. The <think> demonstrates sophisticated layering of scientific fact with situational humor. While potentially crude, it cleverly avoids explicit reference while maintaining clear understanding. Original approach to vegetable humor beyond simple puns.</analysis>
<score>
Wordplay: 1/3 (relies more on situation than wordplay)
Originality: 2/2 (unique combination of contexts)
Surprise: 2/2 (unexpected professional setting twist)
Relevance: 1/3 (somewhat forced vegetable connection)
Total: 6/10
</score>"""

        return f"""Evaluate this submission. First analyse <think> and <answer>, then score:

Submission to Evaluate:
{submission}

Follow this structure EXACTLY:
<analysis>Your critique</analysis>
<score>...</score>

Examples of how to evaluate the submission and format your response:
{examples}"""

    @staticmethod
    def _parse_total_score(result_text) -> int:
        """
        Extract the "Total: T/10" value from the judge's reply.

        The value is rounded to the nearest integer and then capped at 9, so
        the cap also holds for fractional totals such as 9.6.

        Raises:
            ValueError: if `result_text` is not a string or holds no total.
        """
        if not isinstance(result_text, str):
            raise ValueError("Could not extract total score from response.")

        # Tolerate optional whitespace around the slash ("7 / 10").
        match = re.search(r"Total:\s*([0-9]+(?:\.[0-9]+)?)\s*/\s*10", result_text)
        if match is None:
            raise ValueError("Could not extract total score from response.")

        # Round first, cap second: capping before rounding lets 9.5+ become 10.
        rounded_total = int(round(float(match.group(1))))
        return min(rounded_total, 9)

    def score_submission(self, user_prompt: str, assistant_response: str) -> int:
        """
        Given a user prompt and the assistant's response, this function asks the
        judge model (via litellm) for a score and returns it as an integer.

        Args:
            user_prompt: The original joke request.
            assistant_response: The generated <think>/<answer> text to evaluate.

        Returns:
            The total score rounded to the nearest integer and capped at 9
            (a reported total of 10 is returned as 9).

        Raises:
            ValueError: if no "Total: T/10" value can be extracted from the reply.
        """
        system_prompt = self.generate_system_prompt()
        user_message = self.generate_prompt_with_examples(user_prompt, assistant_response)

        completion_params = {
            "model": self.model_name,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message}
            ],
            "temperature": self.temperature,
            "timeout": 60,  # 60 seconds timeout
        }

        response = litellm.completion(**completion_params)
        try:
            # Assuming a response structure similar to TogetherAI:
            result_text = response.choices[0].message.content
        except (AttributeError, IndexError, TypeError):
            # Fall back to the raw response; the parser rejects it unless it is a string.
            result_text = response

        return self._parse_total_score(result_text)

# Create the judge using the "gemini/gemini-2.0-flash-exp" model at temperature 0.3.
scorer = JokeScorer(model_name="gemini/gemini-2.0-flash-exp", temperature=0.3)

# Sample request and response to evaluate.
test_user_prompt = "Tell me a joke about programming."
test_assistant_response = """<think>For programming jokes, let's explore the relationship between code and its execution. Developers often struggle with errors. "Bugs" are literal (programming errors) and figurative (insects). A funny scenario could be a bug (insect) causing a code bug (error). Adding a twist where the bug fixes the code is funny because it subverts expectations of bugs being problematic.</think>
<answer>Why did the programmer bring a bug to work? Because it was a code bug fix!</answer>"""
# Calls the model through litellm.completion; raises ValueError if no "Total: T/10" is found in the reply.
score = scorer.score_submission(test_user_prompt, test_assistant_response)
print("Extracted Score:", score)

Extracted Score: 7
