# Setup

In [145]:
import os
import sys
from pathlib import Path
import arff
import pandas as pd
import numpy as np

In [146]:
sys.path.append(os.path.abspath("../../../.."))

In [147]:
from src.experiment.helpers.variables import dataset_root_dir
from src.experiment.helpers.utils import get_sorted_class_percentages, get_cardinality

In [148]:
NAME = "stackexcooking"

In [149]:
DATASET_DIR = dataset_root_dir / 'multilabel' / NAME
print(DATASET_DIR)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\stackexcooking


In [150]:
trimmed_name = NAME + "_trimmed"
DATASET_DIR_TRIMMED = dataset_root_dir / 'multilabel' / trimmed_name
print(DATASET_DIR_TRIMMED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\stackexcooking_trimmed


# Dataset loading

In [177]:
X = pd.read_csv(DATASET_DIR / 'X.csv')
y = pd.read_csv(DATASET_DIR / 'y.csv')

In [178]:
X

Unnamed: 0,achiev,acid,add,addit,adjust,advanc,advic,affect,ago,air,...,word,work,worri,wouldnt,wrap,wrong,year,yeast,yesterday,yolk
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [179]:
y

Unnamed: 0,tag_baking,tag_food-safety,tag_substitutions,tag_equipment,tag_bread,tag_storage-method,tag_chicken,tag_eggs,tag_sauce,tag_meat,...,tag_fondant,tag_fudge,tag_pot-roast,tag_polenta,tag_food-transport,tag_skin,tag_calories,tag_broccoli,tag_breadcrumbs,tag_bell-peppers
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10486,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10487,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10488,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10489,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Dataset info

In [154]:
# number of samples
len(X)

10491

In [155]:
# number of classes
len(y.columns)

400

In [156]:
class_percentages_sorted = get_sorted_class_percentages(y)
print(class_percentages_sorted)

tag_baking           9.150701
tag_food-safety      6.843961
tag_substitutions    6.252979
tag_equipment        5.890764
tag_bread            4.356115
                       ...   
tag_skin             0.104852
tag_calories         0.104852
tag_broccoli         0.104852
tag_breadcrumbs      0.104852
tag_bell-peppers     0.104852
Length: 400, dtype: float64


In [157]:
cardinality = get_cardinality(y)
print("Cardinality:", cardinality)

Cardinality: 2.2247640835001428


# Functions

In [158]:
def get_least_represented_class_names(class_percentages_sorted, percent):
    num_classes = len(class_percentages_sorted)
    bottom_percent_count = int(np.ceil(num_classes * percent))
    least_represented_classes = class_percentages_sorted.tail(bottom_percent_count).index.tolist()
    return least_represented_classes

In [159]:
# removes SAMPLES
def remove_few_samples_of_class(X, y, class_name, step, protected_classes):
    y_numeric = y.apply(pd.to_numeric)

    available_indices = y_numeric.index[
        (y_numeric[class_name] == 1) &
        (y_numeric[protected_classes].sum(axis=1) == 0)
    ].tolist()

    np.random.seed(47)
    remove_indices = np.random.choice(available_indices , step, replace=False)

    X_trimmed = X.drop(remove_indices).reset_index(drop=True)
    y_trimmed = y.drop(remove_indices).reset_index(drop=True)
    
    return X_trimmed, y_trimmed

In [None]:
# removes a CLASS
# if samples that belong only to that class exist, they will be deleted
# samples that belong to multiple classes will be kept
def remove_class(X, y, least_represented=True):
    class_counts = y.sum()
    target_class = class_counts.idxmin() if least_represented else class_counts.idxmax()
    # Find samples where only this class is present
    only_target = (y.sum(axis=1) == 1) & (y[target_class] == 1)
    # Remove those samples
    X_new = X.loc[~only_target].reset_index(drop=True)
    y_new = y.loc[~only_target].drop(columns=[target_class]).reset_index(drop=True)
    return X_new, y_new

In [161]:
# removes CLASSES
# if samples that belong only to those classes exist, they will be deleted
# samples that still have other classes will be kept
def remove_bottom_percent_classes(X, y, percent):
    class_percentages_sorted = get_sorted_class_percentages(y)
    bottom_classes = get_least_represented_class_names(class_percentages_sorted, percent)

    # Find samples that belong only to the bottom classes
    only_bottom = (y[bottom_classes].sum(axis=1) == 1) & (y.drop(columns=bottom_classes).sum(axis=1) == 0)
    indices_to_remove = y.index[only_bottom]

    X_new = X.drop(indices_to_remove).reset_index(drop=True)
    y_new = y.drop(indices_to_remove).reset_index(drop=True).drop(columns=bottom_classes)

    return X_new, y_new

In [162]:
def show_summary(X, y):
    print("Number of samples:", len(X))
    print("Number of classes:", y.shape[1])
    
    cardinality = get_cardinality(y)
    print("Cardinality:", round(cardinality, 3))

    class_percentages_sorted = get_sorted_class_percentages(y)
    print(class_percentages_sorted)
    return class_percentages_sorted

In [163]:
# removes SAMPLES
def remove_samples_with_one_class(X, y, step, protected_classes, random_state=42):
    y_numeric = y.apply(pd.to_numeric)

    # Find indices where none of the protected classes are present
    available_indices = y_numeric.index[
        (y_numeric[protected_classes].sum(axis=1) == 0)
    ]

    # From those, select indices where only one class is present
    single_class_indices = y_numeric.loc[available_indices].index[
        y_numeric.loc[available_indices].sum(axis=1) == 1
    ].tolist()
    
    if step > len(single_class_indices):
        raise ValueError(f"Requested to remove {step} samples, but only {len(single_class_indices)} single-class samples from non-protected classes are available.")
    
    np.random.seed(random_state)
    remove_indices = np.random.choice(single_class_indices, step, replace=False)
    X_new = X.drop(remove_indices).reset_index(drop=True)
    y_new = y.drop(remove_indices).reset_index(drop=True)
    return X_new, y_new

# Dataset trimming

In [180]:
X, y  = remove_bottom_percent_classes(X, y, percent=0.5)
class_percentages_sorted = show_summary(X, y)

Number of samples: 10160
Number of classes: 200
Cardinality: 1.949
tag_baking            9.448819
tag_food-safety       7.066929
tag_substitutions     6.456693
tag_equipment         6.082677
tag_bread             4.498031
                        ...   
tag_gas               0.285433
tag_pumpkin           0.285433
tag_seasoning-pans    0.275591
tag_color             0.275591
tag_bananas           0.275591
Length: 200, dtype: float64


In [327]:
X, y  = remove_all_class_samples(X, y, least_represented=True)
class_percentages_sorted = show_summary(X, y)

Number of samples: 739
Number of classes: 6
Cardinality: 0.969
tag_baking           21.786198
tag_bread            20.027064
tag_equipment        17.456022
tag_food-safety      15.020298
tag_coffee           12.178620
tag_substitutions    10.419486
dtype: float64


In [120]:
X, y  = remove_all_class_samples(X, y, least_represented=False)
class_percentages_sorted = show_summary(X, y)

Number of samples: 237
Number of classes: 4
Cardinality: 0.46
tag_cheese            16.033755
tag_cake              11.814346
tag_cookies            9.282700
tag_storage-method     8.860759
dtype: float64


In [None]:
class_percentages_sorted = get_sorted_class_percentages(y)

least_represented_classes = get_least_represented_class_names(class_percentages_sorted, 0.3)
most_represented_class = class_percentages_sorted.index[0]
print("Most populated class:", most_represented_class)

X, y = trim_class_samples(X, y, most_represented_class, 3, least_represented_classes)
class_percentages_sorted = show_summary(X, y)

Most populated class: street_food
Number of samples: 213
Number of classes: 5
Cardinality: 1.61
street_food        68.075117
sweets_desserts    28.638498
gourmet            24.882629
snacks             23.474178
brazilian_food     15.962441
dtype: float64


In [107]:
least_represented_classes = get_least_represented_class_names(class_percentages_sorted, 0.5)

X, y = remove_single_class_samples(X, y, step=50, protected_classes=least_represented_classes)
class_percentages_sorted = show_summary(X, y)

Number of samples: 10341
Number of classes: 400
Cardinality: 2.243
tag_baking            9.196403
tag_food-safety       6.827193
tag_substitutions     6.314670
tag_equipment         5.918190
tag_bread             4.390291
                        ...   
tag_food-transport    0.106373
tag_calories          0.106373
tag_broccoli          0.106373
tag_breadcrumbs       0.106373
tag_bell-peppers      0.106373
Length: 400, dtype: float64


# Trimmed dataset - info

In [27]:
class_percentages_sorted = show_summary(X, y)

Number of samples: 10491
Number of classes: 400
Cardinality: 2.225
tag_baking           9.150701
tag_food-safety      6.843961
tag_substitutions    6.252979
tag_equipment        5.890764
tag_bread            4.356115
                       ...   
tag_skin             0.104852
tag_calories         0.104852
tag_broccoli         0.104852
tag_breadcrumbs      0.104852
tag_bell-peppers     0.104852
Length: 400, dtype: float64


# Save

In [2170]:
X.to_csv(DATASET_DIR_TRIMMED / "X.csv", index=False)
y.to_csv(DATASET_DIR_TRIMMED / "y.csv", index=False)

In [2171]:
class_percentages_sorted.to_csv(DATASET_DIR_TRIMMED / "class_percentages.csv", header=False)