In [8]:
import os
from pathlib import Path
from dotenv import load_dotenv
from datasets import Dataset
from huggingface_hub import login

load_dotenv()


HF_TOKEN = os.getenv("HF_TOKEN")

login(token=HF_TOKEN, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
root_dir = Path(".").parent.resolve()
data_dir = root_dir / "storage"
print(data_dir)

/home/octoopt/workspace/projects/learn-from-basics/the-notes/dataset/storage


In [3]:
excel_files = list(data_dir.glob("**/*.xlsx"))
excel_files

[PosixPath('/home/octoopt/workspace/projects/learn-from-basics/the-notes/dataset/storage/train_nor_811.xlsx'),
 PosixPath('/home/octoopt/workspace/projects/learn-from-basics/the-notes/dataset/storage/test_nor_811.xlsx'),
 PosixPath('/home/octoopt/workspace/projects/learn-from-basics/the-notes/dataset/storage/valid_nor_811.xlsx')]

In [None]:
import polars as pl


list_of_dfs = [pl.read_excel(f).columns for f in excel_files]

In [6]:
list_of_dfs

[shape: (5_548, 3)
 ┌──────────────┬───────────┬─────────────────────────────────┐
 │ __UNNAMED__0 ┆ Emotion   ┆ Sentence                        │
 │ ---          ┆ ---       ┆ ---                             │
 │ i64          ┆ str       ┆ str                             │
 ╞══════════════╪═══════════╪═════════════════════════════════╡
 │ 188          ┆ Other     ┆ cho mình xin bài nhạc tên là g… │
 │ 166          ┆ Disgust   ┆ cho đáng đời con quỷ . về nhà … │
 │ 1345         ┆ Disgust   ┆ lo học đi . yêu đương lol gì h… │
 │ 316          ┆ Enjoyment ┆ uớc gì sau này về già vẫn có t… │
 │ 1225         ┆ Enjoyment ┆ mỗi lần có video của con là cứ… │
 │ …            ┆ …         ┆ …                               │
 │ 1332         ┆ Disgust   ┆ đường của nhà cụ hay sao mà cụ… │
 │ 825          ┆ Other     ┆ nhìn mặt héo queo luôn          │
 │ 165          ┆ Other     ┆ tao đi xe máy mỗi lần muốn để … │
 │ 363          ┆ Enjoyment ┆ thích thân hình boss rồi nhan … │
 │ 1242         ┆ Sad

In [10]:
# Concatenate all DataFrames into one
combined_df = pl.concat(list_of_dfs)
print("Successfully combined all Excel files.")
print("Combined DataFrame schema:", combined_df.schema)
print("Combined DataFrame shape:", combined_df.shape)

if "__UNNAMED__0" in combined_df.columns:
    combined_df = combined_df.drop("__UNNAMED__0")
    print("Successfully dropped '__UNNAMED__0' column.")

# Convert the Polars DataFrame to a Hugging Face Dataset
hf_dataset = Dataset.from_polars(combined_df)
print("Converted DataFrame to Hugging Face Dataset.")
print(hf_dataset)

Successfully combined all Excel files.
Combined DataFrame schema: Schema([('__UNNAMED__0', Int64), ('Emotion', String), ('Sentence', String)])
Combined DataFrame shape: (6927, 3)
Successfully dropped '__UNNAMED__0' column.
Converted DataFrame to Hugging Face Dataset.
Dataset({
    features: ['Emotion', 'Sentence'],
    num_rows: 6927
})


In [None]:
combined_df.get_column("Emotion").unique()

Emotion
str
"""Fear"""
"""Sadness"""
"""Anger"""
"""Surprise"""
"""Enjoyment"""
"""Other"""
"""Disgust"""


In [20]:
# Frequency table of each distinct label in the column
label_counts = combined_df["Emotion"].value_counts()  # returns a DataFrame
print(label_counts)

shape: (7, 2)
┌───────────┬───────┐
│ Emotion   ┆ count │
│ ---       ┆ ---   │
│ str       ┆ u32   │
╞═══════════╪═══════╡
│ Surprise  ┆ 309   │
│ Sadness   ┆ 1149  │
│ Other     ┆ 1291  │
│ Enjoyment ┆ 1965  │
│ Fear      ┆ 395   │
│ Disgust   ┆ 1338  │
│ Anger     ┆ 480   │
└───────────┴───────┘


In [23]:
combined_df[:3]

Emotion,Sentence
str,str
"""Other""","""cho mình xin bài nhạc tên là g…"
"""Disgust""","""cho đáng đời con quỷ . về nhà …"
"""Disgust""","""lo học đi . yêu đương lol gì h…"


In [11]:
# Push the dataset to the Hub
repo_id = "minhleduc/sentiment-classification-vietnamese-v1"
print(f"Pushing dataset to '{repo_id}' on the Hugging Face Hub...")
# To make the dataset private, add `private=True`
hf_dataset.push_to_hub(repo_id)

print("\nPush complete!")
print(f"You can view your dataset at: https://huggingface.co/datasets/{repo_id}")

Pushing dataset to 'minhleduc/sentiment-classification-vietnamese-v1' on the Hugging Face Hub...


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]


Push complete!
You can view your dataset at: https://huggingface.co/datasets/minhleduc/sentiment-classification-vietnamese-v1


In [47]:
str(datasets[1])

'/home/octoopt/workspace/projects/learn-from-basics/the-notes/dataset/storage/vlmu_dialog_v1/vi_dialogue_question_only.json'

In [43]:
from datasets import load_dataset

dataset = load_dataset(
    "json", data_files=str(datasets[2]), split="train"
).remove_columns("__count__")

dataset

Generating train split: 3310 examples [00:00, 32542.34 examples/s]


Dataset({
    features: ['data'],
    num_rows: 3310
})

In [44]:
ds = dataset.train_test_split(test_size=0.2)
ds

DatasetDict({
    train: Dataset({
        features: ['data'],
        num_rows: 2648
    })
    test: Dataset({
        features: ['data'],
        num_rows: 662
    })
})

In [45]:
ds.push_to_hub(
    "StoicCodingLab/vi_squad_benchmark_question_only",
    # private=True
)

Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 45.05ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.21s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 38.70ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.26s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/StoicCodingLab/vi_squad_benchmark_question_only/commit/15db0cb39f6e6fc8a575940b857ae8d94723a179', commit_message='Upload dataset', commit_description='', oid='15db0cb39f6e6fc8a575940b857ae8d94723a179', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/StoicCodingLab/vi_squad_benchmark_question_only', endpoint='https://huggingface.co', repo_type='dataset', repo_id='StoicCodingLab/vi_squad_benchmark_question_only'), pr_revision=None, pr_num=None)

In [None]:
dataset = load_dataset("organization/dataset_name", token=True)