In [2]:
import os
import dotenv
from datasets import load_dataset, DatasetDict
from huggingface_hub import login, HfApi
# Load variables from a local .env file into the environment; HF_TOKEN is read
# from the environment further down.
dotenv.load_dotenv()

# Destination repository on the Hugging Face Hub and the splits to carry over.
NEW_DATASET_NAME = "RAGEVALUATION-HJKMY/ragbench_emanual_400row"
SPLITS_TO_SAMPLE = ['train']

# Source data: the "emanual" configuration of RAGBench.
original_dataset = load_dataset("rungalileo/ragbench", "emanual")

# For every requested split that exists, keep only the rows whose
# adherence_score is True and take at most the first 400 of them.
new_dataset_dict = {}
for split_name in SPLITS_TO_SAMPLE:
    if split_name not in original_dataset:
        continue  # requested split is absent from the loaded dataset; skip it
    adherent_rows = original_dataset[split_name].filter(
        lambda row: row['adherence_score'] == True
    )
    # Cap at 400, but never ask for more rows than survived the filter.
    keep_count = min(400, len(adherent_rows))
    new_dataset_dict[split_name] = adherent_rows.select(range(keep_count))

new_dataset = DatasetDict(new_dataset_dict)

# Authenticate with the token taken from the environment, then publish.
login(token=os.getenv('HF_TOKEN'))
new_dataset.push_to_hub(NEW_DATASET_NAME)

print(f"Dataset uploaded successfully to {NEW_DATASET_NAME}")

Downloading data: 100%|██████████| 1.70M/1.70M [00:00<00:00, 3.09MB/s]
Downloading data: 100%|██████████| 288k/288k [00:00<00:00, 906kB/s]
Downloading data: 100%|██████████| 305k/305k [00:00<00:00, 1.19MB/s]
Generating train split: 100%|██████████| 1054/1054 [00:00<00:00, 30800.72 examples/s]
Generating validation split: 100%|██████████| 132/132 [00:00<00:00, 17557.18 examples/s]
Generating test split: 100%|██████████| 132/132 [00:00<00:00, 19518.02 examples/s]
Filter: 100%|██████████| 1054/1054 [00:00<00:00, 5471.09 examples/s]
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 14.20ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.15s/it]


Dataset uploaded successfully to RAGEVALUATION-HJKMY/ragbench_emanual_400row


In [None]:
import pandas as pd
from datasets import Dataset
# Build a DelucionQA evaluation set (at most 400 rows) by joining the locally
# annotated train.csv with the RAGBench "delucionqa" split on the question
# text, then publish the result to the Hugging Face Hub.
NEW_DATASET_NAME = "RAGEVALUATION-HJKMY/ragbench_delucionqa_400row"
SPLITS_TO_SAMPLE = 'train'  # a single split name here (a string, not a list)

# Location of the local train.csv. Its header row is:
# sample_id,Retreival Setting,Question,Context,Answer,Answer_sent_tokenized,Sentence_labels,Label,Answerable,Does_not_answer
# Set the DELUCIONQA_TRAIN_CSV environment variable to point elsewhere on
# another machine; the default is the original developer path.
LOCAL_TRAIN_CSV = os.getenv(
    "DELUCIONQA_TRAIN_CSV",
    "/Users/likai/IdeaProjects/RAG-LLM-Metric/personal/kai/train.csv",
)
local_data = pd.read_csv(LOCAL_TRAIN_CSV)

# Keep rows with Label == "Not Hallucinated" and Answerable == True, then
# prefix every local column with "delu_" so it cannot clash with a RAGBench
# column of the same name after the join.
is_usable = (local_data['Label'] == 'Not Hallucinated') & (local_data['Answerable'] == True)
local_filtered = local_data[is_usable].add_prefix("delu_")

# Remote data from RAGBench: keep only rows whose adherence_score is True.
ragbench_dataset = load_dataset("rungalileo/ragbench", "delucionqa")
filtered_rag = ragbench_dataset[SPLITS_TO_SAMPLE].filter(lambda x: x['adherence_score'] == True)

# One RAGBench row per distinct question (the first occurrence is kept, all
# columns retained) so the join below cannot fan out on the RAGBench side.
rag_df = filtered_rag.to_pandas().drop_duplicates(['question'])

# Inner join (pandas' default, spelled out here): only local rows whose
# question also appears in the filtered RAGBench data survive.
merged_data = local_filtered.merge(
    rag_df, how="inner", left_on="delu_Question", right_on="question"
)

# Keep at most the first 400 joined rows.
final_dataset = Dataset.from_pandas(merged_data.iloc[:400])
print(final_dataset)

# Start from a fresh mapping rather than mutating a dict left over from an
# earlier cell: this cell then runs on its own and cannot push stale splits
# from another dataset into this repository.
new_dataset_dict = {SPLITS_TO_SAMPLE: final_dataset}
new_dataset = DatasetDict(new_dataset_dict)

# Authenticate with the token taken from the environment, then publish.
login(token=os.getenv('HF_TOKEN'))
new_dataset.push_to_hub(NEW_DATASET_NAME)

print(f"Dataset uploaded successfully to {NEW_DATASET_NAME}")

Dataset({
    features: ['delu_sample_id', 'delu_Retreival Setting', 'delu_Question', 'delu_Context', 'delu_Answer', 'delu_Answer_sent_tokenized', 'delu_Sentence_labels', 'delu_Label', 'delu_Answerable', 'delu_Does_not_answer', 'id', 'question', 'documents', 'response', 'generation_model_name', 'annotating_model_name', 'dataset_name', 'documents_sentences', 'response_sentences', 'sentence_support_information', 'unsupported_response_sentence_keys', 'adherence_score', 'overall_supported_explanation', 'relevance_explanation', 'all_relevant_sentence_keys', 'all_utilized_sentence_keys', 'trulens_groundedness', 'trulens_context_relevance', 'ragas_faithfulness', 'ragas_context_relevance', 'gpt3_adherence', 'gpt3_context_relevance', 'gpt35_utilization', 'relevance_score', 'utilization_score', 'completeness_score'],
    num_rows: 400
})
