In [2]:
# -*- coding: utf-8 -*-
"""
GPT-OSS Humor Scorer
Evaluates joke quality on a scale of 1-10 using GPT-OSS-120B
"""

import json
import logging
import os
import re
from typing import Dict, List, Optional

from openai import OpenAI


class LLMHumorScorer:
    """
    Evaluates joke quality using GPT-OSS-120B with detailed scoring criteria.

    A joke is sent to the model together with a rubric; the model is asked to
    answer with a JSON object holding five criterion scores plus an overall
    score. All scores handed back to callers are floats clamped to 1-10.

    On failure the public methods never raise: ``score_joke`` returns the
    neutral score 5.0 (and logs a warning), ``score_with_details`` returns
    neutral scores plus an ``"error"`` key.
    """

    # Score fields in the order the prompt asks for them; the positional
    # fallback parser relies on this order.
    _SCORE_FIELDS = ('creativity', 'word_integration', 'humor_impact',
                     'structure', 'cleverness', 'overall_score')
    _MIN_SCORE = 1.0
    _MAX_SCORE = 10.0
    _DEFAULT_SCORE = 5.0

    # A standalone number (optionally with decimals) that is not glued to a
    # word or a longer number, and is not the denominator of "x/10".
    _NUMBER_RE = re.compile(r'(?<![\w./])\d+(?:\.\d+)?(?!\.?\w)')

    _logger = logging.getLogger(__name__)

    def __init__(
        self,
        api_key: Optional[str] = None,
        temperature: float = 0.3,
        model: str = "openai/gpt-oss-120b",
        max_retries: int = 2
    ):
        """
        Initialize the GPT-OSS humor scorer.

        Args:
            api_key: NVIDIA API key (if None, reads from NVIDIA_API_KEY env var)
            temperature: Low temperature for consistent scoring (0.2-0.4 recommended)
            model: NVIDIA model to use (default: openai/gpt-oss-120b)
            max_retries: Maximum number of attempts per joke. Values below 1
                are treated as 1 so that the API is always tried at least once.
        """
        self.client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=api_key or os.getenv("NVIDIA_API_KEY")
        )
        self.temperature = temperature
        self.model = model
        self.max_retries = max_retries

        # System prompt for consistent humor evaluation
        self.system_prompt = """You are an expert comedy critic and humor analyst with deep knowledge of comedic theory, wordplay, and joke structure. Your role is to objectively evaluate jokes based on multiple criteria.

Evaluation Criteria (1-10 scale):

1. **Creativity & Originality** (1-10)
   - Is the joke fresh and unexpected?
   - Does it avoid clichés and obvious connections?
   - Is the concept novel and inventive?

2. **Word Integration** (1-10)
   - Are both words incorporated naturally?
   - Do the words feel essential to the joke?
   - Is the connection between words clever?

3. **Humor Impact** (1-10)
   - How funny is the punchline?
   - Does it deliver a genuine laugh or smile?
   - Is the surprise element effective?

4. **Structure & Flow** (1-10)
   - Is the setup clear and concise?
   - Does the punchline land well?
   - Is the timing and pacing good?

5. **Cleverness** (1-10)
   - Does it make you think?
   - Is there wordplay, double meaning, or wit?
   - Does it reward re-reading?

Your overall score should reflect the joke's total comedic quality, weighing all criteria appropriately."""

    def _build_scoring_prompt(self, joke: str, word1: str, word2: str) -> str:
        """
        Constructs the scoring prompt for a single joke.

        Args:
            joke: The joke to evaluate
            word1: First word in the pair
            word2: Second word in the pair

        Returns:
            Formatted prompt string
        """
        prompt = f"""Evaluate this joke that was created using the word pair: "{word1}" and "{word2}"

JOKE TO EVALUATE:
"{joke}"

TASK:
Provide a comprehensive evaluation including:
1. Scores for each criterion (1-10 scale)
2. Brief justification for each score
3. An overall score (1-10 scale, can include decimals like 7.5)

The overall score should be a weighted average reflecting:
- Creativity & Originality: 25%
- Word Integration: 20%
- Humor Impact: 30%
- Structure & Flow: 15%
- Cleverness: 10%

Respond ONLY with a valid JSON object in this exact format:
{{
  "creativity": <score 1-10>,
  "word_integration": <score 1-10>,
  "humor_impact": <score 1-10>,
  "structure": <score 1-10>,
  "cleverness": <score 1-10>,
  "overall_score": <weighted score 1-10, can be decimal>,
  "justification": "Brief explanation of the overall score"
}}

Be critical but fair. Reserve scores above 8.0 for truly exceptional jokes."""

        return prompt

    def _request_scores(self, scoring_prompt: str, verbose: bool = False) -> Dict:
        """
        Perform one chat-completion request and parse its answer.

        Args:
            scoring_prompt: User prompt built by ``_build_scoring_prompt``
            verbose: If True, print the raw model response

        Returns:
            Dictionary with score breakdown

        Raises:
            ValueError: If the model returned no content at all
            Exception: Whatever the API client raises on request failure
        """
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": scoring_prompt}
            ],
            temperature=self.temperature,
            max_tokens=1024,
            stream=False
        )

        response_content = completion.choices[0].message.content

        if verbose:
            print(f"\n📊 Raw scoring response:\n{response_content}\n")

        # An empty/None answer carries no information; raise so the caller's
        # retry loop gets another chance instead of settling on defaults.
        if not response_content:
            raise ValueError("Model returned an empty response")

        return self._parse_scoring_response(response_content)

    def _score_with_retries(
        self,
        joke: str,
        word1: str,
        word2: str,
        verbose: bool = False
    ) -> Dict:
        """
        Request scores for a joke, retrying on any failure.

        Args:
            joke: The joke to evaluate
            word1: First word in the pair
            word2: Second word in the pair
            verbose: If True, print the raw response and per-attempt errors

        Returns:
            Dictionary with score breakdown

        Raises:
            Exception: The error of the last attempt when every attempt failed
        """
        scoring_prompt = self._build_scoring_prompt(joke, word1, word2)
        attempts = max(1, self.max_retries)
        last_error = None

        for attempt in range(1, attempts + 1):
            try:
                return self._request_scores(scoring_prompt, verbose)
            except Exception as exc:
                # Deliberately broad: network, API and response-shape errors
                # are all equally retryable from the scorer's point of view.
                last_error = exc
                if verbose:
                    print(f"⚠️ Attempt {attempt}/{attempts}: Error - {exc}")

        raise last_error

    def score_joke(
        self,
        joke: str,
        word1: str = "",
        word2: str = "",
        verbose: bool = False
    ) -> float:
        """
        Score a single joke using GPT-OSS-120B.

        Args:
            joke: The joke to evaluate
            word1: First word in the pair (optional, for context)
            word2: Second word in the pair (optional, for context)
            verbose: If True, print detailed scoring information

        Returns:
            Overall humor score (1-10 scale). If every attempt fails, the
            neutral score 5.0 is returned and a warning is logged.
        """
        try:
            score_data = self._score_with_retries(joke, word1, word2, verbose)
        except Exception as exc:
            # Best effort by design: callers always get a number back, but
            # the failure is logged so it is not invisible in non-verbose runs.
            self._logger.warning(
                "Failed to score joke, returning default score %.1f: %s",
                self._DEFAULT_SCORE, exc
            )
            if verbose:
                print("❌ Failed to score joke, returning default score of 5.0")
            return self._DEFAULT_SCORE

        if verbose:
            print("🎯 Detailed Scores:")
            print(f"   Creativity: {score_data['creativity']}/10")
            print(f"   Word Integration: {score_data['word_integration']}/10")
            print(f"   Humor Impact: {score_data['humor_impact']}/10")
            print(f"   Structure: {score_data['structure']}/10")
            print(f"   Cleverness: {score_data['cleverness']}/10")
            print(f"   Overall: {score_data['overall_score']}/10")
            print(f"   Justification: {score_data['justification']}\n")

        return float(score_data['overall_score'])

    def _normalize_scores(self, raw: Dict) -> Optional[Dict]:
        """
        Validate a decoded JSON object and clamp its scores to 1-10.

        Args:
            raw: Dictionary decoded from the model response

        Returns:
            A copy of ``raw`` with every score field converted to a clamped
            float and a ``justification`` key guaranteed to exist, or None if
            a score field is missing or not numeric.
        """
        if not all(field in raw for field in self._SCORE_FIELDS):
            return None

        cleaned = dict(raw)
        for field in self._SCORE_FIELDS:
            try:
                value = float(raw[field])
            except (TypeError, ValueError):
                return None
            if value != value:
                # NaN compares unequal to itself; clamping would turn it
                # into a perfect 10, so reject it instead.
                return None
            cleaned[field] = max(self._MIN_SCORE, min(self._MAX_SCORE, value))

        # The model sometimes omits the justification; verbose printing and
        # downstream consumers index it unconditionally.
        cleaned.setdefault('justification', 'No justification provided')
        return cleaned

    def _parse_scoring_response(self, content: str) -> Dict:
        """
        Parse the scoring response to extract scores and justification.

        Every ``{`` in the text is tried as the start of a JSON object, so
        surrounding prose (even prose containing braces) does not prevent the
        real answer from being found. If no usable JSON object exists, numbers
        are extracted from the plain text instead.

        Args:
            content: Raw response content from the model (non-string input is
                treated as an empty response)

        Returns:
            Dictionary with score breakdown
        """
        text = content if isinstance(content, str) else ""
        decoder = json.JSONDecoder()

        start = text.find('{')
        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(text[start:])
            except json.JSONDecodeError:
                candidate = None

            if isinstance(candidate, dict):
                normalized = self._normalize_scores(candidate)
                if normalized is not None:
                    return normalized

            start = text.find('{', start + 1)

        # Fallback: extract numbers from text
        return self._fallback_score_extraction(text)

    def _fallback_score_extraction(self, content: str) -> Dict:
        """
        Fallback method to extract scores if JSON parsing fails.

        The first six standalone numbers within 1-10 are assigned, in order,
        to the score fields; missing ones are padded with 5.0. Decimals are
        preserved and the denominator of "x/10" is ignored.

        Args:
            content: Raw response content

        Returns:
            Dictionary with estimated scores
        """
        candidates = (float(token) for token in self._NUMBER_RE.findall(content))
        scores = [
            value for value in candidates
            if self._MIN_SCORE <= value <= self._MAX_SCORE
        ][:len(self._SCORE_FIELDS)]

        if scores:
            # Pad if needed
            scores += [self._DEFAULT_SCORE] * (len(self._SCORE_FIELDS) - len(scores))
            result = dict(zip(self._SCORE_FIELDS, scores))
            result['justification'] = 'Fallback extraction used'
            return result

        # Complete fallback - return neutral scores
        result = {field: self._DEFAULT_SCORE for field in self._SCORE_FIELDS}
        result['justification'] = 'Unable to parse response, using default scores'
        return result

    def score_jokes(
        self,
        jokes: List[str],
        word1: str = "",
        word2: str = "",
        verbose: bool = False
    ) -> List[float]:
        """
        Score multiple jokes.

        Args:
            jokes: List of jokes to evaluate
            word1: First word in the pair (optional)
            word2: Second word in the pair (optional)
            verbose: If True, print detailed scoring for each joke

        Returns:
            List of overall scores (1-10 scale), one per joke, in input order
        """
        scores = []

        for i, joke in enumerate(jokes, 1):
            if verbose:
                print(f"\n{'='*80}")
                print(f"Scoring Joke {i}/{len(jokes)}")
                print(f"{'='*80}")
                print(f"Joke: {joke}")

            score = self.score_joke(joke, word1, word2, verbose=verbose)
            scores.append(score)

            if verbose:
                print(f"✅ Final Score: {score}/10")

        return scores

    def score_with_details(
        self,
        joke: str,
        word1: str = "",
        word2: str = ""
    ) -> Dict:
        """
        Score a joke and return detailed breakdown.

        Uses the same retry policy as ``score_joke``.

        Args:
            joke: The joke to evaluate
            word1: First word in the pair
            word2: Second word in the pair

        Returns:
            Dictionary with keys ``joke``, ``word_pair``, ``scores`` and
            ``overall_score``. When scoring failed, all scores are 5.0 and an
            additional ``error`` key holds the error message.
        """
        try:
            score_data = self._score_with_retries(joke, word1, word2)
        except Exception as e:
            neutral = {field: self._DEFAULT_SCORE for field in self._SCORE_FIELDS}
            neutral["justification"] = f"Error during scoring: {str(e)}"
            return {
                "joke": joke,
                "word_pair": [word1, word2],
                "scores": neutral,
                "overall_score": self._DEFAULT_SCORE,
                "error": str(e)
            }

        return {
            "joke": joke,
            "word_pair": [word1, word2],
            "scores": score_data,
            "overall_score": score_data['overall_score']
        }

In [3]:
# Demo: score three sample jokes for one word pair and print a summary.
print("=" * 80)
print("GPT-OSS HUMOR SCORER (NVIDIA)")
print("=" * 80)

# SECURITY: credentials must never be hard-coded in source or notebooks.
# The key is read from the NVIDIA_API_KEY environment variable (the scorer
# does this itself when no api_key is passed). Any key that was previously
# committed here should be considered leaked and revoked.
if not os.getenv("NVIDIA_API_KEY"):
    raise RuntimeError(
        "Set the NVIDIA_API_KEY environment variable before running this demo."
    )

# Initialize scorer
scorer = LLMHumorScorer(
    temperature=0.3,
    max_retries=2
)

# Test jokes
test_jokes = [
    "Why did the telescope break up with the sandwich? It needed more space, and she was too wrapped up in herself!",
    "A telescope and a sandwich walk into a bar. The bartender says, 'One of you has a stellar view, the other has layers of complexity.'",
    "I tried to use my telescope to find the perfect sandwich, but all I saw was a sub-par universe."
]

word1, word2 = "telescope", "sandwich"

print(f"\n🎯 Evaluating jokes for word pair: '{word1}' + '{word2}'")
print("=" * 80)

# Score all jokes with verbose output
scores = scorer.score_jokes(test_jokes, word1, word2, verbose=True)

print(f"\n{'='*80}")
print("FINAL RESULTS")
print(f"{'='*80}")
for i, (joke, score) in enumerate(zip(test_jokes, scores), 1):
    print(f"\nJoke {i}: {score}/10")
    print(f"Text: {joke}")

# Guard the summary so an empty joke list cannot divide by zero.
if scores:
    print(f"\n📊 Average Score: {sum(scores)/len(scores):.2f}/10")
    print(f"🏆 Best Score: {max(scores)}/10")
    print(f"📉 Lowest Score: {min(scores)}/10")

GPT-OSS HUMOR SCORER (NVIDIA)

🎯 Evaluating jokes for word pair: 'telescope' + 'sandwich'

Scoring Joke 1/3
Joke: Why did the telescope break up with the sandwich? It needed more space, and she was too wrapped up in herself!

📊 Raw scoring response:
{
  "creativity": 5,
  "word_integration": 5,
  "humor_impact": 4,
  "structure": 6,
  "cleverness": 5,
  "overall_score": 4.9,
  "justification": "The joke is mildly inventive but relies on a forced connection between telescope and sandwich. The wordplay (space, wrapped up) is modest, and the punchline lands with a light chuckle rather than a strong laugh. Structure is clear, yet overall comedic impact remains limited."
}

🎯 Detailed Scores:
   Creativity: 5.0/10
   Word Integration: 5.0/10
   Humor Impact: 4.0/10
   Structure: 6.0/10
   Cleverness: 5.0/10
   Overall: 4.9/10
   Justification: The joke is mildly inventive but relies on a forced connection between telescope and sandwich. The wordplay (space, wrapped up) is modest, a