In [1]:
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Creating new datasets

The goal of this notebook is to make two unified datasets out of the following larger datasets: MMLU, TruthfulQA, HellaSwag. 

The first dataset should have the following columns: Type, Question, Options, Answer. Question is the actual prompt. Type is the category of the question (e.g. math, history, law, fact, hellaswag). Options holds the multiple-choice options. Answer is the correct answer, intended to be in 1/2/3/4 format (note: the labels loaded below are zero-based, e.g. 0 for the first option, and this notebook does not convert them). This dataset will be evaluated by probability-scoring each option, and is written out as JSON Lines (`data_mc.json`).

The second dataset will focus on generation quality. It will also be built from TruthfulQA, HellaSwag, etc., but will use an LLM-as-a-judge to estimate generation quality.

In [2]:
# Start from an empty frame; each dataset section below appends its rows to it.
df_final = pd.DataFrame(data={})

## 1. MMLU

In [3]:
# Load the test split of three MMLU subjects from the "cais/mmlu" repository,
# each converted to a pandas DataFrame. The unpacking order matches the order
# of the subject names in the tuple.
data_clinical, data_law, data_cs = (
    load_dataset("cais/mmlu", subject)["test"].to_pandas()
    for subject in (
        "clinical_knowledge",
        "international_law",
        "college_computer_science",
    )
)

In [4]:
# Preview the first three clinical-knowledge rows to check the raw MMLU columns.
data_clinical.head(n=3)

Unnamed: 0,question,subject,choices,answer
0,What size of cannula would you use in a patien...,clinical_knowledge,"[18 gauge., 20 gauge., 22 gauge., 24 gauge.]",0
1,The key attribute in successful marathon runni...,clinical_knowledge,"[strength., power., stride length., stamina.]",3
2,Which of the following is the commonest cause ...,clinical_knowledge,"[Alzheimer's disease., Cerebrovascular (stroke...",0


In [5]:
# Stack the three MMLU subject frames into a single table.
# The frame is built directly from its sources instead of appending onto the
# existing `df_final`, so re-running this cell does not duplicate the MMLU rows
# and no placeholder empty frame takes part in the concat.
# `ignore_index=True` gives the combined rows one fresh 0..n-1 index instead of
# three overlapping ones.
mmlu_frames = [data_clinical, data_law, data_cs]
df_final = pd.concat(mmlu_frames, ignore_index=True)

# Map the MMLU column names onto the unified schema:
# question -> Question, subject -> Type, choices -> Options, answer -> Answer.
df_final = df_final.rename(
    columns={'question': 'Question', 'subject': 'Type', 'choices': 'Options', 'answer': 'Answer'}
)


## 2. TruthfulQA

In [6]:
# Fetch the multiple-choice TruthfulQA dataset and keep its validation split
# as a pandas DataFrame.
truthful_splits = load_dataset("EleutherAI/truthful_qa_mc")
data_truth = truthful_splits["validation"].to_pandas()

In [7]:
# Bring TruthfulQA onto the shared column names and tag every row with its
# source so it can be told apart after merging.
truthful_columns = {'question': 'Question', 'choices': 'Options', 'label': 'Answer'}
data_truth = data_truth.rename(columns=truthful_columns).assign(Type='TruthfulQA')

# Append the TruthfulQA rows below whatever has been collected so far.
df_final = pd.concat([df_final, data_truth])

## 3. HellaSwag

In [8]:
# Load the HellaSwag training split and keep only its first 400 rows.
hellaswag_train = load_dataset("Rowan/hellaswag", split="train").to_pandas()
data_hs = hellaswag_train.head(400)

In [13]:
# Map HellaSwag onto the unified schema: the context becomes the question and
# the candidate endings become the options.
data_hs = data_hs.rename(columns={'ctx': 'Question', 'endings': 'Options', 'label': 'Answer'})

# HellaSwag publishes `label` as a string feature (e.g. "3"), while the MMLU
# labels loaded earlier are integers. Cast to int so the Answer column does not
# mix strings and numbers in the exported file. The cast assumes every label is
# a digit string, as expected for the labelled train split.
data_hs['Answer'] = data_hs['Answer'].astype(int)

# Tag the source and drop every column that is not part of the unified schema.
data_hs['Type'] = 'HellaSwag'
data_hs = data_hs[['Question', 'Options', 'Answer', 'Type']]

# Append the HellaSwag rows below whatever has been collected so far.
df_final = pd.concat([df_final, data_hs])

In [10]:
# Write the combined multiple-choice dataset as JSON Lines: one record (row)
# per line.
df_final.to_json(
    path_or_buf="data_mc.json",
    orient="records",
    lines=True,
)