In [1]:
import math
import os

from dotenv import load_dotenv
from openai import OpenAI


# Load the API credentials from the env file into the process environment.
load_dotenv("../saved_keys.env")

# Use .get() with a default so that a missing key trips the assertion (and shows
# its explanatory message) instead of raising a bare KeyError first.
assert os.environ.get("OPENAI_API_KEY", "").startswith("sk"),\
       "Please sign up for access to the OpenAI API and provide access token in saved_keys.env file"

In [2]:
# Build the shared OpenAI client, authenticating with the key read from the environment
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [13]:
def analyze_logprobs(message):
    """Print a token-by-token breakdown of the log probabilities in a response.

    For every generated token this prints the token, its log probability and
    the corresponding probability as a percentage. When ``top_logprobs`` is
    populated for a token, the alternatives are listed as well (the chosen
    token itself is skipped).

    Args:
        message: A chat-completion response object. Only
            ``message.choices[0].logprobs`` is read; each entry of its
            ``content`` must expose ``token``, ``logprob`` and ``top_logprobs``.

    Returns:
        None. Everything is written to standard output.
    """
    logprobs = message.choices[0].logprobs

    # Treat a missing, None or empty ``content`` the same as "no logprobs":
    # iterating over None would otherwise raise TypeError.
    token_infos = getattr(logprobs, "content", None) if logprobs else None
    if not token_infos:
        print("No logprobs available in the response")
        return

    # Print each token and its probability
    print("\nToken-by-token analysis:")
    print("-" * 50)
    for token_info in token_infos:
        token = token_info.token
        logprob = token_info.logprob
        # exp(logprob) converts the natural-log probability back to a probability
        probability = round(100 * math.exp(logprob), 2)

        print(f"Token: {token!r}")
        print(f"Log Probability: {logprob:.4f}")
        print(f"Probability: {probability}%")

        # If top logprobs are available, show alternatives
        if token_info.top_logprobs:
            print("Top alternatives:")
            for alt_token in token_info.top_logprobs:
                # The chosen token is normally part of top_logprobs; don't repeat it
                if alt_token.token != token:
                    alt_probability = round(100 * math.exp(alt_token.logprob), 2)
                    print(f"  {alt_token.token!r}: {alt_probability}%")
        print("-" * 50)


def analyze_token_confidence(logprobs, low_confidence_ratio=0.5):
    """Analyze the model's confidence in its predictions.

    Converts each token's log probability to a probability, prints the
    average / minimum / maximum, and lists tokens whose probability falls
    below ``low_confidence_ratio`` times the average.

    Args:
        logprobs: Object whose ``content`` is a sequence of entries exposing
            ``token`` and ``logprob``.
        low_confidence_ratio: Fraction of the average confidence below which a
            token is reported as unusually uncertain (default 0.5, i.e. 50%
            of the average).

    Returns:
        dict | None: ``None`` when there is nothing to analyze (missing, None
        or empty ``content``) or when an entry lacks a required attribute.
        Otherwise a dict with the keys ``"average_confidence"``,
        ``"min_confidence"``, ``"max_confidence"`` (floats) and
        ``"low_confidence_tokens"`` (list of ``(token, confidence)`` tuples).
    """
    # Also covers None/empty content, which would otherwise end in a
    # TypeError or a division by zero below.
    token_infos = getattr(logprobs, "content", None) if logprobs else None
    if not token_infos:
        print("Debug: logprobs structure:", logprobs)  # Debug print
        return None

    try:
        # Compute each probability once and reuse it for stats and filtering
        confidences = [math.exp(lp.logprob) for lp in token_infos]
        avg_confidence = sum(confidences) / len(confidences)
        min_confidence = min(confidences)
        max_confidence = max(confidences)

        print("\nConfidence Analysis:")
        print(f"Average confidence: {avg_confidence:.2%}")
        print(f"Minimum confidence: {min_confidence:.2%}")
        print(f"Maximum confidence: {max_confidence:.2%}")

        # Find tokens with unusually low confidence
        threshold = avg_confidence * low_confidence_ratio
        low_confidence_tokens = [
            (lp.token, conf)
            for lp, conf in zip(token_infos, confidences)
            if conf < threshold
        ]

        if low_confidence_tokens:
            print("\nTokens with unusually low confidence:")
            for token, conf in low_confidence_tokens:
                print(f"Token: {token!r}, Confidence: {conf:.2%}")
    except AttributeError as e:
        print(f"Debug: Error processing logprobs: {e}")
        print(f"Debug: logprobs type: {type(logprobs)}")
        print(f"Debug: logprobs content: {logprobs}")
        return None

    # Hand the numbers back so callers can store them instead of only seeing prints
    return {
        "average_confidence": avg_confidence,
        "min_confidence": min_confidence,
        "max_confidence": max_confidence,
        "low_confidence_tokens": low_confidence_tokens,
    }



def calculate_response_confidence(logprobs):
    """Calculate an overall confidence score for the response.

    The score is a weighted blend of the average per-token probability (70%)
    and the minimum per-token probability (30%), rounded to four decimals.

    Args:
        logprobs: Object whose ``content`` is a sequence of entries exposing a
            ``logprob`` attribute (natural-log probability of the token).

    Returns:
        float | None: A score between 0 and 1, where 1 indicates very high
        confidence and 0 very low confidence. ``None`` when ``logprobs`` is
        falsy or its ``content`` is missing, None or empty.
    """
    # Guard against missing/None/empty content so we never divide by zero
    # or iterate over None.
    token_infos = getattr(logprobs, "content", None) if logprobs else None
    if not token_infos:
        return None

    # Convert logprobs to probabilities
    confidences = [math.exp(lp.logprob) for lp in token_infos]

    # Calculate metrics
    avg_confidence = sum(confidences) / len(confidences)
    min_confidence = min(confidences)

    # Weight both average and minimum confidence in the final score
    # This helps catch both overall low confidence and individual uncertain tokens
    confidence_score = (0.7 * avg_confidence) + (0.3 * min_confidence)

    return round(confidence_score, 4)

In [20]:
def get_response_with_confidence(question: str, model: str = "gpt-3.5-turbo", show_logprobs: bool = False) -> dict:
    """Ask the chat model a question and bundle its answer with confidence data.

    The request goes through the module-level ``client`` with a fixed system
    prompt, ``temperature=0``, ``max_tokens=512`` and log probabilities enabled
    (``top_logprobs=5``). The confidence summary produced by
    ``analyze_token_confidence`` is always printed; the token-by-token listing
    from ``analyze_logprobs`` is printed only when ``show_logprobs`` is true.

    Args:
        question: Text sent as the user message.
        model: Name of the chat model to query.
        show_logprobs: Whether to print the per-token breakdown first.

    Returns:
        dict with the keys ``"response"`` (text of the first choice),
        ``"confidence_score"`` (result of ``calculate_response_confidence``)
        and ``"detailed_analysis"`` (a dict whose ``"confidence_analysis"``
        entry holds whatever ``analyze_token_confidence`` returned).
    """
    conversation = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": question},
    ]

    # Single API round-trip; logprobs are requested so confidence can be derived
    message = client.chat.completions.create(
        model=model,
        max_tokens=512,
        temperature=0,
        messages=conversation,
        logprobs=True,
        top_logprobs=5,
    )

    first_choice = message.choices[0]
    token_logprobs = first_choice.logprobs
    overall_score = calculate_response_confidence(token_logprobs)

    # Optional verbose listing comes before the summary printed below
    if show_logprobs:
        analyze_logprobs(message)

    return {
        "response": first_choice.message.content,
        "confidence_score": overall_score,
        "detailed_analysis": {
            "confidence_analysis": analyze_token_confidence(token_logprobs),
        },
    }

# Try the function with a simple question
question = "Who was the president of the United States in 2010?"
result = get_response_with_confidence(question, show_logprobs=True)

print(f"\nQuestion: {question}")
print(f"Response: {result['response']}")
# The score is None when no logprobs came back; formatting None with ".2%"
# would raise TypeError, so fall back to a placeholder.
confidence_score = result["confidence_score"]
print("Confidence Score: " + (f"{confidence_score:.2%}" if confidence_score is not None else "n/a"))


Token-by-token analysis:
--------------------------------------------------
Token: 'The'
Log Probability: -0.9156
Probability: 40.03%
Top alternatives:
  'Bar': 35.67%
  'In': 24.28%
  'During': 0.02%
  'As': 0.0%
--------------------------------------------------
Token: ' president'
Log Probability: -0.3546
Probability: 70.15%
Top alternatives:
  ' President': 29.85%
  ' ': 0.01%
  ' US': 0.0%
  ' United': 0.0%
--------------------------------------------------
Token: ' of'
Log Probability: -0.0000
Probability: 100.0%
Top alternatives:
  ' in': 0.0%
  ' the': 0.0%
  ' ': 0.0%
  ' was': 0.0%
--------------------------------------------------
Token: ' the'
Log Probability: -0.0000
Probability: 100.0%
Top alternatives:
  ' United': 0.0%
  ' The': 0.0%
  ' ': 0.0%
  ' of': 0.0%
--------------------------------------------------
Token: ' United'
Log Probability: -0.0000
Probability: 100.0%
Top alternatives:
  ' united': 0.0%
  ' Un': 0.0%
  ' U': 0.0%
  ' US': 0.0%
-----------------------

In [21]:
# Try the function with a question asking for details that are not knowable,
# intended to tempt the model into hallucinating
question = "What was the exact time and temperature when Marie Curie made her first radium discovery? Include the barometric pressure in the lab."
result = get_response_with_confidence(question, show_logprobs=False)

print(f"\nQuestion: {question}")
print(f"Response: {result['response']}")
# The score is None when no logprobs came back; formatting None with ".2%"
# would raise TypeError, so fall back to a placeholder.
confidence_score = result["confidence_score"]
print("Confidence Score: " + (f"{confidence_score:.2%}" if confidence_score is not None else "n/a"))


Confidence Analysis:
Average confidence: 84.54%
Minimum confidence: 36.26%
Maximum confidence: 100.00%

Tokens with unusually low confidence:
Token: ' don', Confidence: 37.59%
Token: ' in', Confidence: 36.26%
Token: ' Let', Confidence: 41.53%

Question: What was the exact time and temperature when Marie Curie made her first radium discovery? Include the barometric pressure in the lab.
Response: I'm sorry, but I don't have access to real-time data or historical records of specific events like Marie Curie's first radium discovery. However, I can provide you with general information about her discovery and the conditions in which she conducted her experiments. Let me know if you would like to know more about that.
Confidence Score: 70.06%


In [22]:
# Try the function with a question built on a false premise,
# intended to tempt the model into hallucinating
question = "Describe the collaboration between Nikola Tesla and Thomas Edison on their joint patent for wireless energy transmission in 1891. What were the specific technical details?"
result = get_response_with_confidence(question, show_logprobs=False)

print(f"\nQuestion: {question}")
print(f"Response: {result['response']}")
# The score is None when no logprobs came back; formatting None with ".2%"
# would raise TypeError, so fall back to a placeholder.
confidence_score = result["confidence_score"]
print("Confidence Score: " + (f"{confidence_score:.2%}" if confidence_score is not None else "n/a"))


Confidence Analysis:
Average confidence: 87.40%
Minimum confidence: 32.15%
Maximum confidence: 100.00%

Tokens with unusually low confidence:
Token: ',', Confidence: 34.12%
Token: ' their', Confidence: 35.47%
Token: 'Tesla', Confidence: 33.25%
Token: ' developed', Confidence: 34.73%
Token: ' this', Confidence: 32.15%

Question: Describe the collaboration between Nikola Tesla and Thomas Edison on their joint patent for wireless energy transmission in 1891. What were the specific technical details?
Response: I'm sorry, but there seems to be a misunderstanding. Nikola Tesla and Thomas Edison were actually rivals in the field of electrical engineering, and they did not collaborate on a joint patent for wireless energy transmission in 1891 or at any other time. In fact, their approaches to electrical technology were quite different, with Tesla focusing on alternating current (AC) systems and wireless transmission, while Edison was known for his work on direct current (DC) systems and the i