# Benchmarking Using the HellaSwag Dataset

In [1]:
# Fields of each HellaSwag question record
# (this notebook reads only `ctx` and `ending_options`):
# ind: Question index
# id: Question id
# activity_label: A short phrase describing the events in the question
# ctx: The full context for the question
# ctx_a: The first sentence of the context
# ctx_b: The second sentence of the context
# dataset: Domain of the question -- e.g. activitynet / wikihow
# ending_options: A list of four ending choices

In [2]:
from prompttools.benchmarks import Benchmark
from prompttools.experiment import (
    LlamaCppExperiment,
    OpenAIChatExperiment,
    HuggingFaceHubExperiment,
)
from prompttools.utils import semantic_similarity

import pandas as pd
import datetime
import json

# Directory holding the HellaSwag training split and its labels file.
HELLASWAG_DIR = 'prompttools/data/benchmarking/hellaswag'
# Number of leading questions kept for the benchmark runs.
N_SAMPLES = 5


def _iter_json_lines(path):
    """Yield the decoded JSON value of every line in ``path``, in file order.

    Blank lines are ignored. A line that is not valid JSON is reported on
    stdout and skipped instead of aborting the load.
    """
    # Decode as UTF-8 explicitly; without `encoding`, open() falls back to the
    # platform's locale encoding, which is not UTF-8 everywhere.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                print(f"Skipped invalid JSON: {line}")
                continue
            yield parsed


# One [context, candidate endings] pair per question record.
data = [
    [json_obj['ctx'], json_obj['ending_options']]
    for json_obj in _iter_json_lines(f'{HELLASWAG_DIR}/train.jsonl')
]
# One decoded label per line of the labels file.
labels = list(_iter_json_lines(f'{HELLASWAG_DIR}/train-labels.lst'))

# Labels are attached to questions by position, so a skipped line on either
# side would leave the two lists out of step. Fail with a clear message.
if len(data) != len(labels):
    raise ValueError(
        f"Loaded {len(data)} questions but {len(labels)} labels; "
        "the two files must pair up line by line."
    )

hella_swag_full = pd.DataFrame(data, columns=['ctx', 'ending_options'])
hella_swag_full["labels"] = labels
# Keep only a small sample so the benchmark runs stay short.
hella_swag = hella_swag_full.head(N_SAMPLES)

sample_ctxs = hella_swag['ctx'].values
sample_ending_options = hella_swag['ending_options'].values
sample_labels = hella_swag['labels'].values

# Wall-clock start, used to report the total duration of all four runs.
start = datetime.datetime.now()
print(f"Start time: {start}")

temperatures = [0.5]


def _build_benchmark(experiment):
    """Return a Benchmark for ``experiment`` using the shared evaluation setup.

    Every model is benchmarked with the same evaluation method, prompts,
    candidate endings and correct-answer indices; only the experiment differs.
    """
    return Benchmark(
        experiment=experiment,
        eval_method=semantic_similarity,
        prompts=sample_ctxs,
        response_options=sample_ending_options,
        correct_response_indices=sample_labels,
    )


def _build_llama_experiment(model_path):
    """Return a LlamaCppExperiment for one local model file on the sampled contexts."""
    return LlamaCppExperiment(
        [model_path],
        sample_ctxs,
        call_params=dict(temperature=temperatures),
    )


# --- google/flan-t5-xxl via the Hugging Face Hub ---
# NOTE(review): the recorded output of this cell contains "You're using a
# different task than the one specified in the repository", apparently from
# this run -- confirm that "text-generation" is the intended task here.
models = ["google/flan-t5-xxl"]
prompts = sample_ctxs
task = ["text-generation"]
google_flan_t5_xxl_experiment = HuggingFaceHubExperiment(
    models, prompts, task, temperature=temperatures
)
benchmarking_google_flan_t5_xxl = _build_benchmark(google_flan_t5_xxl_experiment)
google_flan_t5_xxl_results = benchmarking_google_flan_t5_xxl.multiple_choice_benchmark()

# --- Vicuna 7B, local llama.cpp weights ---
vicuna7b_experiment = _build_llama_experiment(
    "../llama/llama.cpp/models/7b/ggml-vicuna-7b-1.1-q4_0.bin"
)
benchmarking_vicuna7b = _build_benchmark(vicuna7b_experiment)
vicuna7b_results = benchmarking_vicuna7b.multiple_choice_benchmark()

# --- Vicuna 13B, local llama.cpp weights ---
vicuna13b_experiment = _build_llama_experiment(
    "../llama/llama.cpp/models/13b/ggml-vic13b-uncensored-q4_0.bin"
)
benchmarking_vicuna13b = _build_benchmark(vicuna13b_experiment)
vicuna13b_results = benchmarking_vicuna13b.multiple_choice_benchmark()

# --- gpt-3.5-turbo via the OpenAI chat API ---
# Each context is sent as a single-message conversation.
# NOTE(review): the context is passed with role "system" and no user turn --
# confirm this is the intended prompt format.
openai_messages = [
    [{"role": "system", "content": c}]
    for c in sample_ctxs
]
openai_chat_experiment = OpenAIChatExperiment(
    ["gpt-3.5-turbo"],
    openai_messages,
    temperature=temperatures,
)
benchmarking_openai_chat = _build_benchmark(openai_chat_experiment)
openai_chat_results = benchmarking_openai_chat.multiple_choice_benchmark()

print("Time taken: ", datetime.datetime.now() - start)


Start time: 2023-08-14 18:17:54.370561


You're using a different task than the one specified in the repository. Be sure to know what you're doing :)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.benchmark_df["response_options"] = self.response_options
llama.cpp: loading model from ../llama/llama.cpp/models/7b/ggml-vicuna-7b-1.1-q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal:

Time taken:  0:12:26.063200


In [3]:
# One-row table with a column per model, holding the value each
# multiple_choice_benchmark() call returned.
results_by_model = {
    "google_flan_t5_xxl": [google_flan_t5_xxl_results],
    "vicuna7b": [vicuna7b_results],
    "vicuna13b": [vicuna13b_results],
    "openai_chat": [openai_chat_results],
}
summary = pd.DataFrame(results_by_model)
print(summary)

   google_flan_t5_xxl  vicuna7b  vicuna13b  openai_chat
0  0.2                 0.0       0.2        0.4        
