In [1]:
from datasets import load_dataset, Dataset
from collections import Counter
import pandas as pd

# 1. Wczytaj dane
dataset = load_dataset("go_emotions")
single_label_dataset = dataset['train'].filter(lambda x: len(x['labels']) == 1)
single_label_dataset = single_label_dataset.map(lambda x: {"label": x['labels'][0]})

# 2. Zlicz klasy
label_counts = Counter(single_label_dataset['label'])

# 3. Zostaw tylko klasy, które mają co najmniej 100 przykładów
valid_labels = [label for label, count in label_counts.items() if count >= 100]
filtered_dataset = single_label_dataset.filter(lambda x: x['label'] in valid_labels)

# 4. Przekształć do DataFrame
df = pd.DataFrame(filtered_dataset)

# 5. Wybierz dokładnie po 80 przykładów z każdej klasy
dfs = []
for label in valid_labels:
    class_df = df[df['label'] == label].sample(n=80, random_state=42)
    dfs.append(class_df)

balanced_df = pd.concat(dfs).sample(frac=1, random_state=42).reset_index(drop=True)

# 6. (Opcjonalnie) Konwersja do Hugging Face Dataset
final_dataset = Dataset.from_pandas(balanced_df)

# ✅ Wynik
print(f"Liczba klas: {len(valid_labels)}")
print(f"Łącznie przykładów: {len(balanced_df)}")
print(balanced_df['label'].value_counts())
print(balanced_df.head())


  from .autonotebook import tqdm as notebook_tqdm


Liczba klas: 24
Łącznie przykładów: 1920
label
10    80
7     80
14    80
11    80
3     80
24    80
27    80
13    80
22    80
17    80
12    80
25    80
2     80
8     80
4     80
18    80
9     80
1     80
26    80
20    80
15    80
0     80
6     80
5     80
Name: count, dtype: int64
                                                text labels       id  label
0  Ok.... he can *call* it the State of the Union...   [10]  eet0t1s     10
1                              Wheres the telescope?    [7]  eevpaam      7
2  > Why create more jobs when no one is availabl...    [6]  efgb4m4      6
3     [NAME] deserves to play for a proper club ffs.    [0]  ee96vdc      0
4  I'm sorry, do people NOT listen to [NAME] 9th ...   [15]  ed3ovpu     15


In [2]:
train_subset = balanced_df[['text', 'label']]

In [3]:
train_subset

Unnamed: 0,text,label
0,Ok.... he can *call* it the State of the Union...,10
1,Wheres the telescope?,7
2,> Why create more jobs when no one is availabl...,6
3,[NAME] deserves to play for a proper club ffs.,0
4,"I'm sorry, do people NOT listen to [NAME] 9th ...",15
...,...,...
1915,"Yeah, i dont know bout you guys, but my parent...",4
1916,This is hilarious! Sounds similar to my buddy'...,1
1917,But that jumper looks hella cosy,17
1918,Sorry. This thing knows whats up!,24


In [4]:
label_names = dataset['train'].features['labels'].feature.names
train_subset['label'] = train_subset['label'].apply(lambda x: label_names[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_subset['label'] = train_subset['label'].apply(lambda x: label_names[x])


In [5]:
train_subset

Unnamed: 0,text,label
0,Ok.... he can *call* it the State of the Union...,disapproval
1,Wheres the telescope?,curiosity
2,> Why create more jobs when no one is availabl...,confusion
3,[NAME] deserves to play for a proper club ffs.,admiration
4,"I'm sorry, do people NOT listen to [NAME] 9th ...",gratitude
...,...,...
1915,"Yeah, i dont know bout you guys, but my parent...",approval
1916,This is hilarious! Sounds similar to my buddy'...,amusement
1917,But that jumper looks hella cosy,joy
1918,Sorry. This thing knows whats up!,remorse


In [8]:
train_subset['label'].value_counts()

label
disapproval       80
curiosity         80
fear              80
disgust           80
annoyance         80
remorse           80
neutral           80
excitement        80
realization       80
joy               80
embarrassment     80
sadness           80
anger             80
desire            80
approval          80
love              80
disappointment    80
amusement         80
surprise          80
optimism          80
gratitude         80
admiration        80
confusion         80
caring            80
Name: count, dtype: int64

In [7]:
# train_subset.to_csv("train_subset.csv", index=False)