In [None]:
# Import necessary libraries
# transformers: Provides pre-trained models and tokenizers like GPT-2.
#   GPT2LMHeadModel is the GPT-2 model with a language modeling head.
#   GPT2Tokenizer is used to convert text into tokens that the model can understand.
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# torch: PyTorch library, used for tensor computations and neural network modeling.
import torch
# math: Standard Python library for mathematical functions, used here for math.exp().
import math

# Checkpoint selection: any GPT-2 variant name accepted by from_pretrained
# (e.g. "gpt2", "gpt2-medium") can be used here.
model_name = "gpt2"

# Tokenizer that belongs to the chosen checkpoint; it maps raw text to the
# token IDs the model consumes.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# GPT-2 with its language-modeling head, put into evaluation mode right away.
# Evaluation mode turns dropout off, so repeated forward passes over the same
# input produce the same scores. eval() returns the module itself, which is
# why it can be chained onto the load call.
model = GPT2LMHeadModel.from_pretrained(model_name).eval()

# Define a function to compute the log probability of a sentence
def compute_log_probability(sentence):
    """Return the total log-likelihood the language model assigns to ``sentence``.

    The sentence is tokenized and passed through the model with the token IDs
    also supplied as ``labels``. For a causal LM the labels are shifted inside
    the model, so position ``i`` is scored against token ``i + 1``. A sequence
    of ``N`` tokens therefore yields only ``N - 1`` predictions, and
    ``outputs.loss`` is the mean negative log-likelihood over those ``N - 1``
    predictions (not over ``N``).

    The value returned is consequently

        sum_{i=2..N} log P(token_i | token_1, ..., token_{i-1})

    i.e. the log-probability of the continuation given the first token. The
    first token itself is not scored because nothing precedes it in the input
    (no beginning-of-text token is added here).

    Args:
        sentence: Text to score.

    Returns:
        The summed log-probability as a Python float (natural log). When the
        sentence tokenizes to fewer than two tokens there is nothing to
        predict, and the empty sum ``0.0`` is returned without calling the
        model.
    """
    # return_tensors="pt" yields PyTorch tensors with a leading batch axis.
    inputs = tokenizer(sentence, return_tensors="pt")
    input_ids = inputs["input_ids"]

    # Number of tokens that actually receive a prediction after the model's
    # internal label shift: every token except the first.
    num_predicted = input_ids.size(1) - 1
    if num_predicted < 1:
        # With no predicted token, the mean loss would be undefined (NaN), so
        # short-circuit with the log-probability of an empty continuation.
        return 0.0

    # Inference only: skip gradient bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs, labels=input_ids)

    # Undo the averaging: mean NLL * number of predicted tokens = total NLL,
    # and negating turns the total NLL into the total log-likelihood.
    return -outputs.loss.item() * num_predicted

# Example inputs: the same five words in three different orders, so that the
# scores can be compared directly.
#   1. "the mouse ate the cheese" - grammatical and plausible.
#   2. "the cheese ate the mouse" - grammatical but semantically odd.
#   3. "mouse the the cheese ate" - neither grammatical nor meaningful.
# The expectation is that the more natural a sentence is, the higher the
# probability the language model gives it.
sentences = [
    "the mouse ate the cheese",
    "the cheese ate the mouse",
    "mouse the the cheese ate",
]

# Score each sentence and report both the log-probability and the
# probability itself, with a blank line separating the reports.
for sentence in sentences:
    logp = compute_log_probability(sentence)
    # P = exp(log P): map the log-probability back to a plain probability.
    prob = math.exp(logp)
    # The trailing empty string produces the blank separator line.
    print(
        f"Sentence: '{sentence}'",
        f"Log Probability: {logp:.4f}",
        f"Probability: {prob:.2e}",
        "",
        sep="\n",
    )
