In [18]:
from transformers import AutoModelForSequenceClassification
import numpy as np

# Test data: each entry is a (source, response) tuple, i.e. List[Tuple[str, str]].
pairs = [
    ("The capital of France is Berlin.", "The capital of France is Paris."), # factual but hallucinated
    ('I am in California', 'I am in United States.'), # Consistent
    ('I am in United States', 'I am in California.'), # Hallucinated
    ("A person on a horse jumps over a broken down airplane.", "A person is outdoors, on a horse."),
    ("A boy is jumping on skateboard in the middle of a red bridge.", "The boy skates down the sidewalk on a red bridge"),
    ("A man with blond-hair, and a brown shirt drinking out of a public water fountain.", "A blond man wearing a brown shirt is reading a book."),
    ("Mark Wahlberg was a fan of Manny.", "Manny was a fan of Mark Wahlberg.")
]

# Step 1: Load the model.
# NOTE(review): trust_remote_code=True executes code shipped with the model
# repository; consider pinning a specific revision before sharing this notebook.
model = AutoModelForSequenceClassification.from_pretrained(
    'vectara/hallucination_evaluation_model', trust_remote_code=True)

# Step 2: Score every pair. Use predict(); do not call model(pairs) directly.
predictions = model.predict(pairs)
# tensor([0.0111, 0.6474, 0.1290, 0.8969, 0.1846, 0.0050, 0.0543])

# Walk pairs and scores together and print one record per pair.
for (source, response), score in zip(pairs, predictions):
    print(f"Source: {source}")
    print(f"Response: {response}")
    print(f"SCORE: {np.round(score.item(), 3)}\n")  # .item() converts the tensor element to a Python float


You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.


Source: The capital of France is Berlin.
Response: The capital of France is Paris.
SCORE: 0.011

Source: I am in California
Response: I am in United States.
SCORE: 0.647

Source: I am in United States
Response: I am in California.
SCORE: 0.129

Source: A person on a horse jumps over a broken down airplane.
Response: A person is outdoors, on a horse.
SCORE: 0.897

Source: A boy is jumping on skateboard in the middle of a red bridge.
Response: The boy skates down the sidewalk on a red bridge
SCORE: 0.185

Source: A man with blond-hair, and a brown shirt drinking out of a public water fountain.
Response: A blond man wearing a brown shirt is reading a book.
SCORE: 0.005

Source: Mark Wahlberg was a fan of Manny.
Response: Manny was a fan of Mark Wahlberg.
SCORE: 0.054



In [19]:
# Second experiment: a mix of hallucinated and non-hallucinated responses,
# scored with the model loaded in the previous cell.
pairs = [
    # --- Hallucination: simple fact-checking ---
    ("Universal Studios is in Orlando, Florida.", "Universal Studios is in Los Angeles."),  # Incorrect
    ("Employees get free park tickets every quarter.", "Employees get unlimited free tickets."),  # Incorrect

    # --- Hallucination: longer factual consistency checks ---
    (
        "At Universal Studios, employees have access to a portal where they can view their perks. "
        "Perks include discounts on food, merchandise, and tickets for friends and family. "
        "The information is updated quarterly, and employees can check the latest benefits online.",
        "Universal Studios employees get discounts, but these are only for food. "
        "Friends and family cannot get any benefits, and there is no online portal."  # Incorrect
    ),
    (
        "To report a technical issue in the park, employees should use the internal support system. "
        "They need to log in and select the department responsible. If urgent, they can also call a support number.",
        "Employees should just call the front desk when something breaks."  # Incorrect
    ),

    # --- Hallucination: contradictions and misleading statements ---
    ("Universal Studios has three major theme parks.", "Universal Studios has five theme parks."),  # Incorrect
    ("The Wizarding World of Harry Potter is in Universal Studios Florida.", "Harry Potter Land is in Disneyland."),  # Incorrect

    # --- Non-hallucination: correct paraphrases or factual matches ---
    ("Universal Studios is in Orlando, Florida.", "Universal Studios Florida is located in Orlando."),  # Correct
    ("Employees receive quarterly free park tickets.", "Employees get free tickets every three months."),  # Correct
    (
        "The employee portal provides access to perks like discounts on food, merchandise, and park tickets.",
        "Employees can check their perks, such as food discounts and merchandise offers, on the portal."  # Correct
    ),
    (
        "To report technical issues, employees must log in to the internal support system and choose a department.",
        "Employees should use the internal system to report technical issues by selecting the relevant department."  # Correct
    ),
]

# Score every pair. Use predict(); do not call model(pairs) directly.
predictions = model.predict(pairs)

# Walk pairs and scores together and print one record per pair.
for (source, response), score in zip(pairs, predictions):
    print(f"Source: {source}")
    print(f"Response: {response}")
    print(f"SCORE: {np.round(score.item(), 3)}\n")  # .item() converts the tensor element to a Python float

Source: Universal Studios is in Orlando, Florida.
Response: Universal Studios is in Los Angeles.
SCORE: 0.012

Source: Employees get free park tickets every quarter.
Response: Employees get unlimited free tickets.
SCORE: 0.018

Source: At Universal Studios, employees have access to a portal where they can view their perks. Perks include discounts on food, merchandise, and tickets for friends and family. The information is updated quarterly, and employees can check the latest benefits online.
Response: Universal Studios employees get discounts, but these are only for food. Friends and family cannot get any benefits, and there is no online portal.
SCORE: 0.034

Source: To report a technical issue in the park, employees should use the internal support system. They need to log in and select the department responsible. If urgent, they can also call a support number.
Response: Employees should just call the front desk when something breaks.
SCORE: 0.007

Source: Universal Studios has three m

In [50]:
import pandas as pd
import os
import time

def load_data(csv_directory, dataset):
    """Build (context, response) pairs and binary labels from hallucination CSVs.

    Every file in ``csv_directory`` whose name ends with ``dataset`` is read
    with pandas. Files that lack any of the required columns
    ("Source/Context", "Grounded Response", "Ungrounded Response") are skipped.
    Each remaining row yields two samples, in this order:

    * ``(context, grounded response)`` with label ``1``
    * ``(context, ungrounded response)`` with label ``0``

    Args:
        csv_directory: Directory that is scanned (non-recursively) for CSV files.
        dataset: Filename suffix selecting which files to load,
            e.g. ``"GPS-hallucination-dataset.csv"``.

    Returns:
        A ``(pairs, labels)`` tuple of two equally long lists. Both are empty
        when no file matches or no file has the required columns.
    """
    required_columns = ["Source/Context", "Grounded Response", "Ungrounded Response"]
    pairs = []
    labels = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any debug output) reproducible across runs/machines.
    for filename in sorted(os.listdir(csv_directory)):
        if not filename.endswith(dataset):
            continue

        df = pd.read_csv(os.path.join(csv_directory, filename))

        # Skip files that do not follow the expected schema.
        if not set(required_columns).issubset(df.columns):
            continue

        # Empty cells are read as NaN (a float), which is not a usable text
        # sample, so rows missing any of the three fields are dropped.
        complete_rows = df.dropna(subset=required_columns)

        for context, grounded_sentence, ungrounded_sentence in zip(
            complete_rows["Source/Context"],
            complete_rows["Grounded Response"],    # response supported by the context
            complete_rows["Ungrounded Response"],  # response NOT supported by the context
        ):
            pairs.append((context, grounded_sentence))
            labels.append(1)
            pairs.append((context, ungrounded_sentence))
            labels.append(0)

    return pairs, labels

def evaluate(model, csv_directory, dataset, debug=False, threshold=0.5):
    """Score one dataset with the model and print precision, recall and timing.

    Pairs and labels come from ``load_data(csv_directory, dataset)``. A pair is
    predicted as label ``1`` (the label ``load_data`` gives to grounded
    responses) when its score is strictly greater than ``threshold``; precision
    and recall are computed for that positive class.

    Args:
        model: Object exposing ``predict(pairs)`` that returns one score per
            pair (a torch tensor in this notebook).
        csv_directory: Directory containing the dataset CSV files.
        dataset: Filename suffix of the dataset to evaluate.
        debug: When true, also print every source/response pair with its score.
        threshold: Decision threshold applied to the scores. Defaults to 0.5,
            the value that was previously hard-coded.

    Returns:
        None. Results are reported via ``print``.
    """
    pairs, labels = load_data(csv_directory, dataset)

    # Without samples there is nothing to score, and the per-sample timing
    # below would divide by zero; report the situation and stop.
    if not pairs:
        print(f"Evaluating for {dataset}...")
        print("Number of samples: 0 (no matching data found, skipping)")
        print(f"--------------------------------------")
        return

    # perf_counter() is a monotonic clock intended for measuring intervals.
    start_time = time.perf_counter()
    # Note the predict() method. Do not do model(pairs).
    predictions = model.predict(pairs)
    # tensor([0.0111, 0.6474, 0.1290, 0.8969, 0.1846, 0.0050, 0.0543])
    end_time = time.perf_counter()

    labels = np.array(labels)
    predicted_labels = (predictions > threshold).int().numpy()

    if debug:
        # Print one record per pair together with its score.
        for (source, response), score in zip(pairs, predictions):
            print(f"Source: {source}")
            print(f"Response: {response}")
            print(f"SCORE: {np.round(score.item(), 3)}\n")  # Convert tensor to Python float

    # Confusion-matrix counts for the positive (label 1) class.
    true_positives = np.sum((predicted_labels == 1) & (labels == 1))
    false_positives = np.sum((predicted_labels == 1) & (labels == 0))
    false_negatives = np.sum((predicted_labels == 0) & (labels == 1))

    # Fall back to 0 when a denominator is zero (no positive predictions / labels).
    predicted_positive_count = true_positives + false_positives
    actual_positive_count = true_positives + false_negatives
    precision = true_positives / predicted_positive_count if predicted_positive_count > 0 else 0
    recall = true_positives / actual_positive_count if actual_positive_count > 0 else 0

    # Wall-clock time of the predict() call, in milliseconds.
    execution_time_ms = (end_time - start_time) * 1000

    print(f"Evaluating for {dataset}...")
    print(f"Number of samples: {len(pairs)}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"Execution Time per sample: {execution_time_ms/len(pairs):.2f} ms")
    print(f"--------------------------------------")

    return

# Step 1: Load the model
model = AutoModelForSequenceClassification.from_pretrained(
    'vectara/hallucination_evaluation_model', trust_remote_code=True)

# NOTE(review): absolute path tied to one machine; adjust before running elsewhere.
csv_directory = "/home/ilkin/Documents/UC/LM-Hallucinations-main/data"

# Step 2: Evaluate each dataset in turn.
for dataset_name in (
    "GPS-hallucination-dataset.csv",
    "OHS-hallucination-dataset.csv",
    "SubsystemController-hallucination-dataset.csv",
):
    evaluate(model, csv_directory, dataset_name)


You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.


Evaluating for GPS-hallucination-dataset.csv...
Number of samples: 26
Precision: 0.917
Recall: 0.846
Execution Time per sample: 57.36 ms
--------------------------------------
Evaluating for OHS-hallucination-dataset.csv...
Number of samples: 36
Precision: 1.000
Recall: 0.944
Execution Time per sample: 50.88 ms
--------------------------------------
Evaluating for SubsystemController-hallucination-dataset.csv...
Number of samples: 40
Precision: 1.000
Recall: 0.900
Execution Time per sample: 35.96 ms
--------------------------------------


In [52]:
# Re-run the GPS dataset with per-pair output to inspect individual scores.
evaluate(model, csv_directory, "GPS-hallucination-dataset.csv", debug=True)


Source: General Performance Specification (GPS) v23.0 – Section 1.1.4 Operational Life and Availability: "Equipment shall be designed to meet the required Availability during scheduled Operating Hours throughout the Operational Life when operated and maintained per the vendor-provided Operation and Maintenance Manuals. Operating hours are defined as 16 hours per day, which is equivalent to 5840 hours per year."
Response: The required availability for system-level ride and show equipment is 99.5%, and the operational life is 20 years.
SCORE: 0.01

Source: General Performance Specification (GPS) v23.0 – Section 1.1.4 Operational Life and Availability: "Equipment shall be designed to meet the required Availability during scheduled Operating Hours throughout the Operational Life when operated and maintained per the vendor-provided Operation and Maintenance Manuals. Operating hours are defined as 16 hours per day, which is equivalent to 5840 hours per year."
Response: The required availabil