# Benchmarking Using HellaSwag Dataset

In [1]:
# Fields of each record in the HellaSwag dataset file (one JSON object per line):
# ind: Question index
# id: Question id
# activity_label: A short phrase describing the events in the question
# ctx: The full context for the question
# ctx_a: The first sentence of the context
# ctx_b: The second sentence of the context
# dataset: Domain of the question, e.g. activitynet / wikihow
# ending_options: A list of four ending choices

# This repo contains a sample of the HellaSwag dataset,
# located under prompttools/data/benchmarking/hellaswag.
# To download the full dataset, please visit:
# https://leaderboard.allenai.org/hellaswag/submissions/get-started
# The download link is about a quarter of the way down that page.

In [2]:
from prompttools.benchmarks import Benchmark
from prompttools.experiment import (
    LlamaCppExperiment,
    OpenAIChatExperiment,
    HuggingFaceHubExperiment,
)
from prompttools.utils import semantic_similarity

import pandas as pd
import datetime
import json

## Set Up HellaSwag Dataset for Benchmark

In [3]:
# Number of questions, taken from the top of the dataset, to benchmark.
sample_size = 3

dataset_path = "prompttools/data/benchmarking/hellaswag/hellaswag_dataset.jsonl"
labels_path = "prompttools/data/benchmarking/hellaswag/hellaswag_labels.lst"

# Labels are attached to questions purely by position, so both files are read
# up front and parsed pairwise. Blank lines are not records and are ignored.
# The explicit UTF-8 encoding keeps the result independent of the platform's
# default locale.
with open(dataset_path, "r", encoding="utf-8") as file:
    dataset_lines = [line for line in file if line.strip()]
with open(labels_path, "r", encoding="utf-8") as file:
    label_lines = [line for line in file if line.strip()]

# Fail loudly rather than pairing questions with the wrong answers.
if len(dataset_lines) != len(label_lines):
    raise ValueError(
        f"HellaSwag dataset has {len(dataset_lines)} records but the labels file has "
        f"{len(label_lines)}; the two files must line up one-to-one."
    )

data = []  # one [ctx, ending_options] row per successfully parsed question
labels = []  # parsed label for the row at the same position in `data`
for dataset_line, label_line in zip(dataset_lines, label_lines):
    # A record is only kept when BOTH its question and its label parse. Dropping
    # just one side would shift every later label onto the wrong question.
    try:
        json_obj = json.loads(dataset_line)
    except json.JSONDecodeError:
        print(f"Skipped invalid JSON: {dataset_line}")
        continue
    try:
        label = json.loads(label_line)
    except json.JSONDecodeError:
        print(f"Skipped invalid JSON: {label_line}")
        continue
    data.append([json_obj["ctx"], json_obj["ending_options"]])
    labels.append(label)

hella_swag = pd.DataFrame(data, columns=["ctx", "ending_options"])
hella_swag["labels"] = labels
hella_swag = hella_swag.head(sample_size)  # sample

# Plain arrays consumed by the experiment / benchmark cells further down.
sample_ctxs = hella_swag["ctx"].values
sample_ending_options = hella_swag["ending_options"].values
sample_labels = hella_swag["labels"].values

## Model Params

In [4]:
# Temperature values handed to every experiment configured below
# (as their `temperature` argument).
temperatures = [0.5]

## Set Up Experiments to Benchmark

In [5]:
def _hellaswag_benchmark(experiment):
    """Wrap ``experiment`` in a ``Benchmark`` over the sampled HellaSwag questions.

    Every model is benchmarked with the same evaluation method, prompts,
    candidate endings, and correct-answer indices; only the experiment differs.
    """
    return Benchmark(
        experiment=experiment,
        eval_method=semantic_similarity,
        prompts=sample_ctxs,
        response_options=sample_ending_options,
        correct_response_indices=sample_labels,
    )


# Google Flan T5 XXL
# NOTE(review): the recorded run warns that this task differs from the one
# specified in the model repository -- confirm "text-generation" is intended.
models = ["google/flan-t5-xxl"]
prompts = sample_ctxs
task = ["text-generation"]
google_flan_t5_xxl_experiment = HuggingFaceHubExperiment(models, prompts, task, temperature=temperatures)
benchmarking_google_flan_t5_xxl = _hellaswag_benchmark(google_flan_t5_xxl_experiment)

# Vicuna 7b (local llama.cpp weights)
vicuna7b_model_paths = ["../llama/llama.cpp/models/7b/ggml-vicuna-7b-1.1-q4_0.bin"]
vicuna7b_experiment = LlamaCppExperiment(
    vicuna7b_model_paths,
    sample_ctxs,
    call_params={"temperature": temperatures},
)
benchmarking_vicuna7b = _hellaswag_benchmark(vicuna7b_experiment)

# Vicuna 13b (local llama.cpp weights)
vicuna13b_model_paths = ["../llama/llama.cpp/models/13b/ggml-vic13b-uncensored-q4_0.bin"]
vicuna13b_experiment = LlamaCppExperiment(
    vicuna13b_model_paths,
    sample_ctxs,
    call_params={"temperature": temperatures},
)
benchmarking_vicuna13b = _hellaswag_benchmark(vicuna13b_experiment)

# OpenAI Chat: each context is sent as a one-message conversation with the "system" role.
chat_messages = [[{"role": "system", "content": c}] for c in sample_ctxs]
openai_chat_experiment = OpenAIChatExperiment(["gpt-3.5-turbo"], chat_messages, temperature=temperatures)
benchmarking_openai_chat = _hellaswag_benchmark(openai_chat_experiment)

## Run Benchmarks

In [6]:
# Note the wall-clock start so the total benchmarking time can be reported at the end.
start = datetime.datetime.now()
print(f"Start time: {start}")

# Run the multiple-choice benchmark for each model, one after another.
google_flan_t5_xxl_results = benchmarking_google_flan_t5_xxl.multiple_choice_benchmark()
vicuna7b_results = benchmarking_vicuna7b.multiple_choice_benchmark()
vicuna13b_results = benchmarking_vicuna13b.multiple_choice_benchmark()
openai_chat_results = benchmarking_openai_chat.multiple_choice_benchmark()

# Elapsed time covers all four benchmark runs above.
elapsed = datetime.datetime.now() - start
print("Time taken: ", elapsed)

Start time: 2023-08-14 22:13:49.096696


You're using a different task than the one specified in the repository. Be sure to know what you're doing :)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benchmark_df["response_options"] = self.response_options
llama.cpp: loading model from ../llama/llama.cpp/models/13b/ggml-vic13b-uncensored-q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 5120
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 40
llama_model_load_internal: n_layer    = 40
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 13824
llama_mod

Time taken:  0:01:51.087746


In [7]:
# Collect the four benchmark results into a one-row table, one column per model.
summary_columns = ["google_flan_t5_xxl", "vicuna7b", "vicuna13b", "openai_chat"]
summary_row = [google_flan_t5_xxl_results, vicuna7b_results, vicuna13b_results, openai_chat_results]
results_summary = pd.DataFrame(data=[summary_row], columns=summary_columns)
print(results_summary)

   google_flan_t5_xxl  vicuna7b  vicuna13b  openai_chat
0  0.333333            0.333333  0.333333   0.666667   
