In [None]:
# Import necessary libraries
# transformers: Provides pre-trained models and tokenizers like GPT-2.
#   GPT2LMHeadModel is the GPT-2 model with a language modeling head.
#   GPT2Tokenizer is used to convert text into tokens that the model can understand.
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# torch: PyTorch library, used for tensor computations and neural network modeling.
import torch
# math: Standard Python library for mathematical functions, used here for math.exp().
import math

# Choose the GPT-2 checkpoint to use; other sizes such as "gpt2-medium" can be
# selected by changing this name.
model_name = "gpt2"
# The tokenizer that belongs to the checkpoint turns text into the token IDs
# the model consumes.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# GPT2LMHeadModel is GPT-2 with the language-modeling head needed for scoring.
# eval() returns the module itself, so it is chained onto the load. It puts the
# model in inference mode: training-only behavior such as dropout is switched
# off, which keeps the computed scores deterministic.
model = GPT2LMHeadModel.from_pretrained(model_name).eval()

# Define a function to compute the log probability of a sentence
# This version is updated to prepend BOS token for more accurate probability calculation.
def compute_log_probability(sentence_text: str, model, tokenizer) -> float:
    """Return the total natural-log probability the model assigns to a sentence.

    The sentence is scored as ``log P(t_1, ..., t_N | BOS)``. GPT-2 was not
    trained with a dedicated begin-of-sentence token, so the tokenizer's
    end-of-sequence token is prepended to give the first real token a context.

    Args:
        sentence_text: Text to score.
        model: Causal language model that accepts ``labels`` and returns an
            output whose ``loss`` is the mean negative log-likelihood per
            predicted token (as ``GPT2LMHeadModel`` does).
        tokenizer: Tokenizer matching ``model``; must expose ``eos_token_id``
            and return ``input_ids`` when called on a string.

    Returns:
        The sum of the per-token log probabilities. Text that tokenizes to
        nothing yields ``0.0``, the log probability of the empty sequence.
    """
    bos_id = tokenizer.eos_token_id

    # Calling the tokenizer on a single string without `return_tensors`
    # already gives a flat list of token IDs; no tensor round trip is needed.
    token_ids = list(tokenizer(sentence_text)["input_ids"])
    if not token_ids:
        # With only BOS in the input there is nothing to predict, and the
        # model's mean loss over zero targets would be NaN.
        return 0.0

    # Build the input on the model's device so a model moved to a GPU works
    # too. Objects without a `device` attribute fall back to the default.
    input_ids = torch.tensor(
        [[bos_id] + token_ids],
        device=getattr(model, "device", None),
    )

    with torch.no_grad():
        # `labels` is the same tensor as the input: the model shifts it
        # internally, so the loss covers predicting t_1 from BOS, t_2 from
        # [BOS, t_1], and so on -- exactly len(token_ids) predictions.
        outputs = model(input_ids, labels=input_ids)

    # The loss is an average over the predicted tokens; multiplying by their
    # count and negating recovers the total log-likelihood.
    return -outputs.loss.item() * len(token_ids)

# Define a function to compute conditional log probabilities of each token in a sentence
def compute_conditional_log_probabilities(sentence_text: str, model, tokenizer) -> list:
    """Return each token's log probability given the tokens that precede it.

    The tokenizer's end-of-sequence token is prepended as a stand-in
    begin-of-sentence token, so the first real token is scored given that
    token alone and every later token given BOS plus all earlier tokens.

    Args:
        sentence_text: Text to score.
        model: Causal language model whose output exposes ``logits`` indexed
            as ``[batch, position, vocabulary]``.
        tokenizer: Tokenizer matching ``model``; must expose ``eos_token_id``
            and ``decode`` and return ``input_ids`` when called on a string.

    Returns:
        A list of ``(token_string, log_probability)`` tuples, one per token of
        the sentence in order. Text that tokenizes to nothing yields ``[]``.
    """
    bos_id = tokenizer.eos_token_id

    # Calling the tokenizer on a single string without `return_tensors`
    # already gives a flat list of token IDs.
    token_ids = list(tokenizer(sentence_text)["input_ids"])
    if not token_ids:
        return []

    # Build the input on the model's device so a model moved to a GPU works
    # too. Objects without a `device` attribute fall back to the default.
    input_ids = torch.tensor(
        [[bos_id] + token_ids],
        device=getattr(model, "device", None),
    )

    with torch.no_grad():
        logits = model(input_ids).logits

    # The logits at position j score the token at position j + 1, so the last
    # row (which would predict a token after the sentence) is dropped and the
    # targets are the input without its leading BOS.
    log_probs = torch.log_softmax(logits[0, :-1, :], dim=-1)
    targets = input_ids[0, 1:].unsqueeze(-1)
    # Pick, for every position, the log probability of the token that
    # actually occurred there.
    token_log_probs = log_probs.gather(-1, targets).squeeze(-1).tolist()

    # Decode one ID at a time so each entry is labeled with its own token.
    token_strings = [tokenizer.decode(token_id) for token_id in token_ids]
    return list(zip(token_strings, token_log_probs))

# Example sentences, ordered from most to least natural, to show how the
# language model ranks them:
#   1. grammatical and semantically plausible
#   2. grammatical but semantically unlikely
#   3. ungrammatical word salad
# The more natural a sentence is, the higher the probability we expect.
sentences = [
    "the mouse ate the cheese",
    "the cheese ate the mouse",
    "mouse the the cheese ate",
]

# Report the sentence-level score and the per-token breakdown for each example.
for sentence in sentences:
    # Total log probability of the sentence, and the probability it implies
    # (P = exp(log P)).
    logp = compute_log_probability(sentence, model, tokenizer)
    prob = math.exp(logp)
    print(
        f"Sentence: '{sentence}'",
        f"Log Probability: {logp:.4f}",
        f"Probability: {prob:.2e}",
        sep="\n",
    )

    # Per-token conditional log probabilities. Their running total is printed
    # as a cross-check against the sentence-level value above.
    conditional_probs = compute_conditional_log_probabilities(sentence, model, tokenizer)
    print(f"Conditional Log Probabilities for '{sentence}':")
    total_cond_log_prob = 0
    for token_str, log_p_token in conditional_probs:
        total_cond_log_prob += log_p_token
        print(f"  Log P('{token_str}' | ...): {log_p_token:.4f}")
    print(f"  Sum of conditional log_probs: {total_cond_log_prob:.4f}")
    print()
