In [1]:
from pprint import pprint
import os
from huggingface_hub import login
from datasets import load_dataset

In [2]:
# Read the access token from the environment so it never appears in the notebook.
# If HF_TOKEN is unset or blank, pass None instead of an empty string so that
# `login` falls back to its interactive prompt rather than receiving a blank token.
login(token=os.environ.get("HF_TOKEN") or None, add_to_git_credential=True)

Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (osxkeychain).
Your token has been saved to /Users/hermeschen/.cache/huggingface/token
Login successful


# Load Dataset

In [3]:
# Identifier of the dataset that is passed to `load_dataset` below.
dataset_name: str = "benjaminbeilharz/better_daily_dialog"

In [4]:
# Load every split of the dataset, using 8 worker processes.
# NOTE(review): trust_remote_code=True presumably lets code shipped with the
# dataset repository run locally — confirm the repository is trusted.
dataset = load_dataset(dataset_name, num_proc=8, trust_remote_code=True)

# Convert Emotion ID to Text Label

In [5]:
# Move the emotion ids to "emotion_id" so the name "emotion" is free for the text label.
# NOTE(review): this reassigns `dataset`; re-running the cell without reloading the
# data will likely fail because the source column has already been renamed.
dataset = dataset.rename_column("emotion", "emotion_id")

In [6]:
# Text labels, ordered so that a label's position in the list is its emotion id.
emotion_labels: list = [
	"neutral",
	"anger",
	"disgust",
	"fear",
	"happiness",
	"sadness",
	"surprise",
]
# Lookup table from emotion id (list position) to text label.
emotion_id2labels: dict = dict(enumerate(emotion_labels))
pprint(emotion_id2labels)

{0: 'neutral',
 1: 'anger',
 2: 'disgust',
 3: 'fear',
 4: 'happiness',
 5: 'sadness',
 6: 'surprise'}


In [7]:
def _label_emotions(batch):
	"""Return an "emotion" column holding the text label for each id in the batch.

	`batch["emotion_id"]` is iterated as a sequence of ids; each id is looked up
	in `emotion_id2labels`, so an id missing from that table raises KeyError.
	"""
	labels = []
	for emotion_id in batch["emotion_id"]:
		labels.append(emotion_id2labels[emotion_id])
	return {"emotion": labels}


# batched=True: the helper receives a batch of rows at once and returns a whole column.
dataset = dataset.map(_label_emotions, batched=True)

In [8]:
# Preview the first training example to check the newly added "emotion" column.
dataset["train"][0]

{'dialog_id': 0,
 'utterance': 'Say , Jim , how about going for a few beers after dinner ? ',
 'turn_type': 3,
 'emotion_id': 0,
 'emotion': 'neutral'}

# Edit Features

In [9]:
# Drop the turn type and the numeric emotion id; the text label in "emotion" is kept.
dataset = dataset.remove_columns(["turn_type", "emotion_id"])

In [10]:
# Expose the dialog identifier under the shorter column name "id".
# NOTE(review): judging by the original name, several utterances of one dialog may
# share the same value — confirm consumers do not expect "id" to be unique per row.
dataset = dataset.rename_column("dialog_id", "id")

In [11]:
# Preview the first training example again to confirm the final set of columns.
dataset["train"][0]

{'id': 0,
 'utterance': 'Say , Jim , how about going for a few beers after dinner ? ',
 'emotion': 'neutral'}

# Save Dataset to Local Disk

In [12]:
# Create the output directory up front so the export no longer depends on a
# manual "create the data folder first" step; exist_ok keeps re-runs harmless.
os.makedirs("./data", exist_ok=True)

# Write each split to its own file, using 8 worker processes per export.
# NOTE(review): to_json presumably writes JSON Lines (one record per line) by
# default — confirm downstream readers expect that despite the .json extension.
for split in ["train", "validation", "test"]:
	dataset[split].to_json(f"./data/daily_dialog_{split}.json", num_proc=8)

Creating json from Arrow format:   0%|          | 0/88 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]