In [6]:
import pandas as pd

# Load the AG News train and test splits as pandas DataFrames.
# Both reads go through hf:// URLs, so this cell presumably needs network
# access and Hugging Face filesystem support installed -- TODO confirm.

# Relative parquet path of each split inside the dataset repository.
splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}

# Get training dataset
df_train = pd.read_parquet("hf://datasets/fancyzhx/ag_news/" + splits["train"])

# Get testing dataset
df_test = pd.read_parquet("hf://datasets/fancyzhx/ag_news/" + splits["test"])

# A notebook only renders the last expression of a cell, so a bare
# `df_train.head()` in the middle of the cell shows nothing. Display the
# training preview explicitly and keep the test preview as the cell result.
display(df_train.head())
df_test.head()

Unnamed: 0,text,label
0,Fears for T N pension after talks Unions repre...,2
1,The Race is On: Second Private Team Sets Launc...,3
2,Ky. Company Wins Grant to Study Peptides (AP) ...,3
3,Prediction Unit Helps Forecast Wildfires (AP) ...,3
4,Calif. Aims to Limit Farm-Related Smog (AP) AP...,3


In [None]:
from dotenv import load_dotenv
import os
import requests
import time
import json


# Load variables from a .env file (if any) into the process environment,
# then read the API key from it; API_KEY is None when the variable is unset.
load_dotenv()
API_KEY = os.environ.get("API_KEY")

# Sample out a balanced training set: the same number of rows from every class.
n_rows_per_class = 1000

# One seeded sample per label, collected in ascending label order.
balanced_dfs = [
    df_train[df_train["label"] == label].sample(n_rows_per_class, random_state=42)
    for label in sorted(df_train["label"].unique())
]

# pd.concat alone leaves the rows grouped by label and keeps the original
# df_train index. Shuffle and reset the index so this set is prepared the
# same way as the imbalanced training set and the testing set in this notebook.
balanced_data = (
    pd.concat(balanced_dfs)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Sample out an imbalanced training set: the majority class keeps n_majority
# rows and every other class keeps n_minority rows.
majority_label = 0  # label treated as the majority class
n_majority = 1000
n_minority = 300

imbalanced_dfs = []
label_0_class = df_train[df_train["label"] == majority_label]
imbalanced_dfs.append(label_0_class.sample(n_majority, random_state=42))

# Pick the minority classes by value instead of slicing off the first sorted
# label, so the selection stays correct even if the majority label is not the
# smallest one. The ascending label order is kept.
for label in sorted(df_train["label"].unique()):
    if label == majority_label:
        continue
    class_samples = df_train[df_train["label"] == label]
    imbalanced_dfs.append(class_samples.sample(n_minority, random_state=42))

imbalanced_data = pd.concat(imbalanced_dfs)
# Shuffle the imbalanced dataset to mix the classes
imbalanced_data = imbalanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Report how many rows each label has in both training sets, ordered by label.
for heading, frame in (
    ("\nBalanced Training Set Label Distribution:", balanced_data),
    ("\nImbalanced Training Set Label Distribution:", imbalanced_data),
):
    print(heading)
    print(frame["label"].value_counts().sort_index())




Balanced Training Set Label Distribution:
label
0    1000
1    1000
2    1000
3    1000
Name: count, dtype: int64

Imbalanced Training Set Label Distribution:
label
0    1000
1     300
2     300
3     300
Name: count, dtype: int64


In [12]:
# Build a small, class-balanced testing set from df_test.
n_rows_per_class = 1000

# Draw the same number of rows from each label, in ascending label order,
# using a fresh seed of 42 for every class.
test_labels = sorted(df_test["label"].unique())
test_balanced_dfs = [
    df_test.loc[df_test["label"] == test_label].sample(n_rows_per_class, random_state=42)
    for test_label in test_labels
]

# Stack the per-class samples, shuffle them together and renumber the rows.
testing_set = (
    pd.concat(test_balanced_dfs)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)



In [14]:
# Save datasets to parquet for later use.
# to_parquet does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs('Data', exist_ok=True)

balanced_data.to_parquet('Data/ag_news_train_balanced.parquet')
imbalanced_data.to_parquet('Data/ag_news_train_imbalanced.parquet')
testing_set.to_parquet('Data/ag_news_test_small.parquet')