# Setup

In [530]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np

In [531]:
sys.path.append(os.path.abspath("../../../.."))

In [532]:
from src.experiment.helpers.variables import dataset_root_dir
from src.experiment.helpers.utils import get_sorted_class_percentages, get_cardinality

In [533]:
NAME = "yelp"

In [534]:
DATASET_DIR = dataset_root_dir / 'multilabel' / NAME
print(DATASET_DIR)

D:\m\datasets\multilabel\yelp


In [535]:
balanced_name = NAME + "_balanced"
DATASET_DIR_BALANCED = dataset_root_dir / 'multilabel' / balanced_name
print(DATASET_DIR_BALANCED)

D:\m\datasets\multilabel\yelp_balanced


In [536]:
imbalanced_name = NAME + "_imbalanced"
DATASET_DIR_IMBALANCED = dataset_root_dir / 'multilabel' / imbalanced_name
print(DATASET_DIR_IMBALANCED)

D:\m\datasets\multilabel\yelp_imbalanced


# Dataset loading

In [537]:
X = pd.read_csv(DATASET_DIR / 'X.csv')
y = pd.read_csv(DATASET_DIR / 'y.csv')

In [538]:
X

Unnamed: 0,back_try,not_good,i_have_to,about_place,portions,come_back,the_food,that_it_was,really_like,food_just,...,the_bar_area,about_this_place,chinese_food,the_food_was,and_they_were,to_choose_from,will_definitely,IsRatingBad,IsRatingModerate,IsRatingGood
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10801,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
10802,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
10803,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
10804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [539]:
y

Unnamed: 0,IsFoodGood,IsAmbianceGood,IsServiceGood,IsPriceGood,IsDealsGood
0,1,0,0,0,1
1,1,1,1,0,1
2,1,1,0,0,1
3,1,0,1,1,1
4,1,1,1,0,1
...,...,...,...,...,...
10801,0,0,0,0,0
10802,0,0,0,0,0
10803,0,0,0,0,0
10804,0,0,0,0,0


# Cut classes to match balanced dataset

In [540]:
y_balanced = pd.read_csv(DATASET_DIR_BALANCED / 'y.csv')
y_balanced

Unnamed: 0,IsFoodGood,IsAmbianceGood,IsServiceGood,IsPriceGood
0,1,1,1,0
1,1,0,1,1
2,1,1,1,0
3,1,0,0,0
4,1,1,0,1
...,...,...,...,...
4713,1,0,0,0
4714,1,0,0,0
4715,1,0,0,0
4716,1,0,0,0


In [541]:
y = y[y_balanced.columns]

# Dataset info

In [542]:
# number of samples
len(X)

10806

In [543]:
# number of classes
len(y.columns)

4

In [544]:
class_percentages_sorted = get_sorted_class_percentages(y)
print(class_percentages_sorted)

IsFoodGood        58.754396
IsAmbianceGood    38.080696
IsServiceGood     37.460670
IsPriceGood       21.765686
dtype: float64


In [545]:
cardinality = get_cardinality(y)
print("Cardinality:", cardinality)

Cardinality: 1.560614473440681


# Functions

In [309]:
def get_least_represented_class_names(class_percentages_sorted, percent):
    num_classes = len(class_percentages_sorted)
    bottom_percent_count = int(np.ceil(num_classes * percent))
    least_represented_classes = class_percentages_sorted.tail(bottom_percent_count).index.tolist()
    return least_represented_classes

In [310]:
def get_most_represented_class_name(class_percentages_sorted, percent):
    num_classes = len(class_percentages_sorted)
    top_percent_count = int(np.ceil(num_classes * percent))
    most_represented_classes = class_percentages_sorted.head(top_percent_count).index.tolist()
    return most_represented_classes

In [311]:
# removes SAMPLES
def remove_few_samples_of_class(X, y, class_name, step, protected_classes=[]):
    y_numeric = y.apply(pd.to_numeric)

    available_indices = y_numeric.index[
        (y_numeric[class_name] == 1) &
        (y_numeric[protected_classes].sum(axis=1) == 0)
    ].tolist()

    np.random.seed(47)
    remove_indices = np.random.choice(available_indices , step, replace=False)

    X_trimmed = X.drop(remove_indices).reset_index(drop=True)
    y_trimmed = y.drop(remove_indices).reset_index(drop=True)
    
    return X_trimmed, y_trimmed

In [312]:
# removes a CLASS
# if samples that belong only to that class exist, they will be deleted
# samples that belong to multiple classes will be kept
def remove_class(X, y, least_represented=True):
    class_counts = y.sum()
    target_class = class_counts.idxmin() if least_represented else class_counts.idxmax()
    # Find samples where only this class is present
    only_target = (y.sum(axis=1) == 1) & (y[target_class] == 1)
    # Remove those samples
    X_new = X.loc[~only_target].reset_index(drop=True)
    y_new = y.loc[~only_target].drop(columns=[target_class]).reset_index(drop=True)
    return X_new, y_new

In [313]:
# removes CLASSES
# if samples that belong only to those classes exist, they will be deleted
# samples that still have other classes will be kept
def remove_bottom_percent_classes(X, y, percent):
    class_percentages_sorted = get_sorted_class_percentages(y)
    bottom_classes = get_least_represented_class_names(class_percentages_sorted, percent)

    # Remove the bottom classes from y
    y_new = y.drop(columns=bottom_classes)

    # Find samples that would have zero classes after dropping bottom classes
    zero_class_samples = y_new.sum(axis=1) == 0

    # Remove those samples from both X and y_new
    X_new = X.loc[~zero_class_samples].reset_index(drop=True)
    y_new = y_new.loc[~zero_class_samples].reset_index(drop=True)

    return X_new, y_new

In [314]:
def show_summary(X, y):
    print("Number of samples:", len(X))
    print("Number of classes:", y.shape[1])
    
    cardinality = get_cardinality(y)
    print("Cardinality:", round(cardinality, 3))
    
    num_samples_no_labels = (y.sum(axis=1) == 0).sum()
    print("Samples with no labels:", num_samples_no_labels)

    class_percentages_sorted = get_sorted_class_percentages(y)
    print(class_percentages_sorted)
    return class_percentages_sorted

In [315]:
# removes SAMPLES
def remove_samples_with_one_class(X, y, step, protected_classes=[], random_state=42):
    y_numeric = y.apply(pd.to_numeric)

    # Find indices where none of the protected classes are present
    available_indices = y_numeric.index[
        (y_numeric[protected_classes].sum(axis=1) == 0)
    ]

    # From those, select indices where only one class is present
    single_class_indices = y_numeric.loc[available_indices].index[
        y_numeric.loc[available_indices].sum(axis=1) == 1
    ].tolist()
    
    if step > len(single_class_indices):
        raise ValueError(f"Requested to remove {step} samples, but only {len(single_class_indices)} single-class samples from non-protected classes are available.")
    
    np.random.seed(random_state)
    remove_indices = np.random.choice(single_class_indices, step, replace=False)
    X_new = X.drop(remove_indices).reset_index(drop=True)
    y_new = y.drop(remove_indices).reset_index(drop=True)
    return X_new, y_new

In [316]:
def remove_samples_with_zero_classes(X, y, step, random_state=42):
    y_numeric = y.apply(pd.to_numeric)
    zero_class_indices = y_numeric.index[y_numeric.sum(axis=1) == 0].tolist()
    
    if step > len(zero_class_indices):
        raise ValueError(f"Requested to remove {step} samples, but only {len(zero_class_indices)} samples with zero classes are available.")
    
    np.random.seed(random_state)
    remove_indices = np.random.choice(zero_class_indices, step, replace=False)
    X_new = X.drop(remove_indices).reset_index(drop=True)
    y_new = y.drop(remove_indices).reset_index(drop=True)
    return X_new, y_new

# Dataset trimming

In [2413]:
X, y  = remove_bottom_percent_classes(X, y, percent=0.5)
class_percentages_sorted = show_summary(X, y)

Number of samples: 203690
Number of classes: 40
Cardinality: 2.298
Samples with no labels: 0
sky           36.422996
clouds        26.553586
person        25.321322
water         17.312583
animal        16.636556
grass         11.076145
buildings      8.755953
window         7.389170
plants         7.042565
lake           6.574697
ocean          5.551083
road           4.675733
flowers        4.224557
sunset         4.132751
reflection     3.866169
rocks          3.106191
vehicle        2.994256
snow           2.653051
tree           2.627522
beach          2.572046
mountain       2.503314
boats          1.982424
nighttime      1.926948
house          1.922529
valley         1.886691
birds          1.855761
sun            1.789484
military       1.639747
garden         1.437969
toy            1.359910
food           1.318180
tower          1.317198
plane          1.307379
street         1.254848
dog            1.229319
cat            1.166478
town           1.150768
bridge         1.14

In [2425]:
# remove top class
X, y  = remove_class(X, y, least_represented=False)
class_percentages_sorted = show_summary(X, y)

Number of samples: 85542
Number of classes: 28
Cardinality: 1.346
Samples with no labels: 0
flowers       10.059386
sunset         9.840780
reflection     9.206004
rocks          7.396367
vehicle        7.129831
snow           6.317365
tree           6.256576
beach          6.124477
mountain       5.960815
boats          4.720488
nighttime      4.588389
house          4.577868
valley         4.492530
birds          4.418882
sun            4.261065
military       3.904515
garden         3.424049
toy            3.238175
food           3.138809
tower          3.136471
plane          3.113091
street         2.988006
dog            2.927217
cat            2.777583
town           2.740174
bridge         2.731991
cityscape      2.613921
sand           2.471301
dtype: float64


In [2191]:
# remove bottom class
X, y  = remove_class(X, y, least_represented=True)
class_percentages_sorted = show_summary(X, y)

Number of samples: 209291
Number of classes: 80
Cardinality: 2.407
Samples with no labels: 0
sky           35.448251
clouds        25.842965
person        24.643678
water         16.849267
animal        16.191332
                ...    
book           0.161498
zebra          0.147641
surf           0.092216
soccer         0.072626
earthquake     0.030102
Length: 80, dtype: float64


In [814]:
# trim top class of samples
class_percentages_sorted = get_sorted_class_percentages(y)

chosen_class = class_percentages_sorted.index[0]
print("Most populated class:", chosen_class)

X, y = remove_few_samples_of_class(X, y, chosen_class, 30, protected_classes=get_least_represented_class_names(class_percentages_sorted, 0.7))
class_percentages_sorted = show_summary(X, y)

Most populated class: IsFoodGood
Number of samples: 4718
Number of classes: 4
Cardinality: 2.208
Samples with no labels: 0
IsFoodGood        99.851632
IsServiceGood     55.998304
IsAmbianceGood    37.621874
IsPriceGood       27.342094
dtype: float64


In [693]:
# trim bottom class of samples
class_percentages_sorted = get_sorted_class_percentages(y)

chosen_class = class_percentages_sorted.index[-2]
print("Least populated class:", chosen_class)

X, y = remove_few_samples_of_class(X, y, chosen_class, 10, protected_classes=get_most_represented_class_name(class_percentages_sorted, 0.1))
class_percentages_sorted = show_summary(X, y)

Least populated class: IsAmbianceGood


ValueError: Cannot take a larger sample than population when 'replace=False'

In [732]:
# remove samples of top classes with only 1 class
X, y = remove_samples_with_one_class(X, y, step=10, protected_classes=get_least_represented_class_names(class_percentages_sorted, 0.1))
class_percentages_sorted = show_summary(X, y)

Number of samples: 5978
Number of classes: 4
Cardinality: 2.007
Samples with no labels: 0
IsFoodGood        99.882904
IsServiceGood     46.738039
IsAmbianceGood    31.431917
IsPriceGood       22.666444
dtype: float64


In [756]:
# remove samples of bottom classes with only 1 class
X, y = remove_samples_with_one_class(X, y, step=10, protected_classes=get_most_represented_class_name(class_percentages_sorted, 0.0))
class_percentages_sorted = show_summary(X, y)

Number of samples: 5758
Number of classes: 4
Cardinality: 2.046
Samples with no labels: 0
IsFoodGood        99.878430
IsServiceGood     48.523793
IsAmbianceGood    32.632859
IsPriceGood       23.532477
dtype: float64


In [547]:
X, y = remove_samples_with_zero_classes(X, y, step=1018)
class_percentages_sorted = show_summary(X, y)

Number of samples: 9788
Number of classes: 4
Cardinality: 1.723
Samples with no labels: 0
IsFoodGood        64.865141
IsAmbianceGood    42.041275
IsServiceGood     41.356763
IsPriceGood       24.029424
dtype: float64


# Trimmed dataset - info

In [694]:
class_percentages_sorted = show_summary(X, y)

Number of samples: 6358
Number of classes: 4
Cardinality: 1.947
Samples with no labels: 0
IsFoodGood        99.858446
IsServiceGood     43.944637
IsAmbianceGood    29.584775
IsPriceGood       21.311733
dtype: float64


# Save

### Balanced

In [None]:
# DATASET_DIR_BALANCED.mkdir(parents=True, exist_ok=True)
# X.to_csv(DATASET_DIR_BALANCED / "X.csv", index=False)
# y.to_csv(DATASET_DIR_BALANCED / "y.csv", index=False)

In [None]:
# class_percentages_sorted.to_csv(DATASET_DIR_BALANCED / "class_percentages.csv", header=False)

### Imbalanced

In [815]:
DATASET_DIR_IMBALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_IMBALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_IMBALANCED / "y.csv", index=False)

In [816]:
class_percentages_sorted.to_csv(DATASET_DIR_IMBALANCED / "class_percentages.csv", header=False)