# Setup

In [7170]:
import os
import sys
from pathlib import Path
import arff
import pandas as pd
import numpy as np

In [7171]:
# Put the directory four levels above the kernel's current working directory on
# sys.path (presumably the repository root, so the `src.*` imports resolve — TODO confirm).
# NOTE(review): the path is relative to the working directory, not to this notebook file.
sys.path.append(os.path.abspath("../../../.."))

In [7172]:
from src.experiment.helpers.variables import dataset_root_dir
from src.experiment.helpers.utils import get_sorted_class_percentages, get_cardinality

In [7173]:
# Dataset identifier; the source, "_balanced" and "_unbalanced" directory names are built from it.
NAME = "yelp"

In [7174]:
# Directory of the source dataset: <dataset_root_dir>/multilabel/<NAME>.
DATASET_DIR = dataset_root_dir / 'multilabel' / NAME
print(DATASET_DIR)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\yelp


In [7175]:
# Directory of the balanced variant: <dataset_root_dir>/multilabel/<NAME>_balanced.
# It is read in the "Cut classes" section and written in the "Save / Balanced" section.
balanced_name = NAME + "_balanced"
DATASET_DIR_BALANCED = dataset_root_dir / 'multilabel' / balanced_name
print(DATASET_DIR_BALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\yelp_balanced


In [7176]:
# Directory of the unbalanced variant: <dataset_root_dir>/multilabel/<NAME>_unbalanced.
# It is written in the "Save / Unbalanced" section.
unbalanced_name = NAME + "_unbalanced"
DATASET_DIR_UNBALANCED = dataset_root_dir / 'multilabel' / unbalanced_name
print(DATASET_DIR_UNBALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\yelp_unbalanced


# Dataset loading

In [7177]:
# Load the feature table (X) and the label table (y) of the source dataset.
# The trimming functions below drop rows from X using index labels taken from y,
# so both tables are expected to share the same row index.
X = pd.read_csv(DATASET_DIR / 'X.csv')
y = pd.read_csv(DATASET_DIR / 'y.csv')

In [7178]:
# Preview the feature table.
X

Unnamed: 0,back_try,not_good,i_have_to,about_place,portions,come_back,the_food,that_it_was,really_like,food_just,...,the_bar_area,about_this_place,chinese_food,the_food_was,and_they_were,to_choose_from,will_definitely,IsRatingBad,IsRatingModerate,IsRatingGood
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10801,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
10802,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
10803,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
10804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7179]:
# Preview the label table.
y

Unnamed: 0,IsFoodGood,IsAmbianceGood,IsServiceGood,IsPriceGood,IsDealsGood
0,1,0,0,0,1
1,1,1,1,0,1
2,1,1,0,0,1
3,1,0,1,1,1
4,1,1,1,0,1
...,...,...,...,...,...
10801,0,0,0,0,0
10802,0,0,0,0,0
10803,0,0,0,0,0
10804,0,0,0,0,0


# Cut classes to match balanced dataset

In [390]:
# Load the label table previously saved for the balanced variant; its column
# names are used in the next cell to subset y.
# NOTE(review): requires that the balanced variant has already been saved to disk.
y_balanced = pd.read_csv(DATASET_DIR_BALANCED / 'y.csv')
y_balanced

Unnamed: 0,TAG_2006,TAG_analysis,TAG_antibody,TAG_apob,TAG_bettasplendens,TAG_bibteximport,TAG_design,TAG_dynamics,TAG_electrochemistry,TAG_evolution,...,TAG_social,TAG_software,TAG_systems,TAG_tagging,TAG_theory,TAG_topic10,TAG_topic11,TAG_topic3,TAG_toread,TAG_web
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2882,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2883,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2884,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2885,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [391]:
# Keep only the label columns present in the balanced label table, in that table's
# column order. pandas raises KeyError if y_balanced has a column that y lacks.
# NOTE(review): X is not touched here, and samples left without labels are not removed.
y = y[y_balanced.columns]

# Dataset info

In [7012]:
# Number of samples (rows of the feature table X).
len(X)

10806

In [7013]:
# Number of classes (columns of the label table y).
len(y.columns)

5

In [7014]:
# Per-class percentages computed by the project helper (the printed output suggests
# they are sorted from most to least represented — TODO confirm in the helper).
# Later cells reuse this variable to pick the top / bottom classes.
class_percentages_sorted = get_sorted_class_percentages(y)
print(class_percentages_sorted)

IsFoodGood        58.754396
IsAmbianceGood    38.080696
IsServiceGood     37.460670
IsPriceGood       21.765686
IsDealsGood        7.764205
dtype: float64


In [7015]:
# Label cardinality as computed by the project helper
# (presumably the mean number of labels per sample — TODO confirm in the helper).
cardinality = get_cardinality(y)
print("Cardinality:", cardinality)

Cardinality: 1.6382565241532483


# Functions

In [7016]:
def get_least_represented_class_names(class_percentages_sorted, percent):
    """Return the class names found at the tail of a percentage series.

    Args:
        class_percentages_sorted: pandas Series indexed by class name. The
            function does not sort it; the caller is expected to pass it
            ordered from most to least represented.
        percent: fraction of the classes to return (e.g. 0.3 for 30%). The
            resulting number of classes is rounded up.

    Returns:
        list: index labels of the last ``ceil(len(series) * percent)`` entries,
        in their original order.
    """
    how_many = int(np.ceil(len(class_percentages_sorted) * percent))
    tail_slice = class_percentages_sorted.tail(how_many)
    return tail_slice.index.tolist()

In [7017]:
def get_most_represented_class_name(class_percentages_sorted, percent):
    """Return the class names found at the head of a percentage series.

    Args:
        class_percentages_sorted: pandas Series indexed by class name. The
            function does not sort it; the caller is expected to pass it
            ordered from most to least represented.
        percent: fraction of the classes to return (e.g. 0.3 for 30%). The
            resulting number of classes is rounded up.

    Returns:
        list: index labels of the first ``ceil(len(series) * percent)`` entries,
        in their original order.
    """
    how_many = int(np.ceil(len(class_percentages_sorted) * percent))
    head_slice = class_percentages_sorted.head(how_many)
    return head_slice.index.tolist()

In [7018]:
# removes SAMPLES
def remove_few_samples_of_class(X, y, class_name, step, protected_classes=None, random_state=47):
    """Randomly drop ``step`` samples that carry ``class_name``.

    Only samples that have ``class_name`` set to 1 and none of the
    ``protected_classes`` set are eligible for removal.

    Args:
        X: feature DataFrame; must share its row index with ``y``.
        y: label DataFrame with one column per class (values convertible by
            ``pd.to_numeric``).
        class_name: column of ``y`` whose samples are trimmed.
        step: number of samples to remove.
        protected_classes: list of columns of ``y``; samples carrying any of
            them are never removed. ``None`` (default) protects nothing.
        random_state: seed passed to ``np.random.seed`` before sampling.
            Defaults to 47, the value that used to be hard-coded.

    Returns:
        tuple: ``(X_trimmed, y_trimmed)`` with the chosen rows dropped and the
        index reset to 0..n-1.

    Raises:
        ValueError: if ``step`` exceeds the number of eligible samples.
    """
    # Avoid a shared mutable default; None means "no protected classes".
    protected = [] if protected_classes is None else protected_classes

    y_numeric = y.apply(pd.to_numeric)

    has_target = y_numeric[class_name] == 1
    is_unprotected = y_numeric[protected].sum(axis=1) == 0
    available_indices = y_numeric.index[has_target & is_unprotected].tolist()

    # Fail with a descriptive message instead of NumPy's generic sampling error,
    # matching the other sample-removal helpers in this notebook.
    if step > len(available_indices):
        raise ValueError(
            f"Requested to remove {step} samples, but only {len(available_indices)} "
            f"unprotected samples of class '{class_name}' are available."
        )

    # Seeds the global NumPy RNG (as before) so the selection is reproducible.
    np.random.seed(random_state)
    remove_indices = np.random.choice(available_indices, step, replace=False)

    X_trimmed = X.drop(remove_indices).reset_index(drop=True)
    y_trimmed = y.drop(remove_indices).reset_index(drop=True)

    return X_trimmed, y_trimmed

In [7019]:
# removes a CLASS
def remove_class(X, y, least_represented=True):
    """Drop the least (or most) frequent class column from ``y``.

    Samples whose only label is the dropped class are removed from both
    ``X`` and ``y``; samples that also carry other labels are kept, and so
    are samples that had no labels to begin with.

    Args:
        X: feature DataFrame, row-aligned with ``y``.
        y: label DataFrame with one indicator column per class.
        least_represented: if True drop the class with the smallest column
            sum, otherwise the one with the largest.

    Returns:
        tuple: ``(X_new, y_new)`` with the index reset to 0..n-1.
    """
    per_class_totals = y.sum()
    if least_represented:
        dropped = per_class_totals.idxmin()
    else:
        dropped = per_class_totals.idxmax()

    # Rows to keep: everything except samples labelled with the dropped class alone.
    labels_per_row = y.sum(axis=1)
    keep_rows = ~((labels_per_row == 1) & (y[dropped] == 1))

    X_new = X.loc[keep_rows].reset_index(drop=True)
    y_new = y.loc[keep_rows].drop(columns=[dropped]).reset_index(drop=True)
    return X_new, y_new

In [7020]:
# removes CLASSES
def remove_bottom_percent_classes(X, y, percent):
    """Drop the least-represented share of classes and the samples left unlabeled.

    The classes to drop are the last ``ceil(n_classes * percent)`` entries of
    ``get_sorted_class_percentages(y)``. After dropping them, every sample
    whose remaining labels sum to zero is removed from both ``X`` and ``y``;
    this includes samples that had no labels before the call.

    Args:
        X: feature DataFrame, row-aligned with ``y``.
        y: label DataFrame with one indicator column per class.
        percent: fraction of classes to drop (e.g. 0.5 for the bottom half).

    Returns:
        tuple: ``(X_new, y_new)`` with the index reset to 0..n-1.
    """
    ranked = get_sorted_class_percentages(y)
    classes_to_drop = get_least_represented_class_names(ranked, percent)

    y_remaining = y.drop(columns=classes_to_drop)

    # Keep only samples that still carry at least one label.
    still_labeled = y_remaining.sum(axis=1) != 0

    X_new = X.loc[still_labeled].reset_index(drop=True)
    y_new = y_remaining.loc[still_labeled].reset_index(drop=True)
    return X_new, y_new

In [7021]:
def show_summary(X, y):
    """Print basic statistics of the dataset and return the class percentages.

    Prints, in order: number of samples, number of classes, the cardinality
    from ``get_cardinality`` rounded to 3 decimals, the number of samples
    whose labels sum to zero, and the series from
    ``get_sorted_class_percentages``.

    Args:
        X: feature table; only its length is used.
        y: label DataFrame with one column per class.

    Returns:
        The value returned by ``get_sorted_class_percentages(y)``.
    """
    print("Number of samples:", len(X))
    print("Number of classes:", len(y.columns))

    print("Cardinality:", round(get_cardinality(y), 3))

    labels_per_sample = y.sum(axis=1)
    print("Samples with no labels:", (labels_per_sample == 0).sum())

    percentages = get_sorted_class_percentages(y)
    print(percentages)
    return percentages

In [7022]:
# removes SAMPLES
def remove_samples_with_one_class(X, y, step, protected_classes=[], random_state=42):
    """Randomly drop ``step`` samples that carry exactly one label.

    A sample is eligible when its labels sum to 1 and none of the
    ``protected_classes`` is set on it.

    Args:
        X: feature DataFrame; must share its row index with ``y``.
        y: label DataFrame with one column per class (values convertible by
            ``pd.to_numeric``).
        step: number of samples to remove.
        protected_classes: list of columns of ``y``; samples carrying any of
            them are never removed.
        random_state: seed passed to ``np.random.seed`` before sampling.

    Returns:
        tuple: ``(X_new, y_new)`` with the chosen rows dropped and the index
        reset to 0..n-1.

    Raises:
        ValueError: if ``step`` exceeds the number of eligible samples.
    """
    labels = y.apply(pd.to_numeric)

    # Eligible = no protected label set AND exactly one label overall.
    no_protected_label = labels[protected_classes].sum(axis=1) == 0
    exactly_one_label = labels.sum(axis=1) == 1
    single_class_indices = labels.index[no_protected_label & exactly_one_label].tolist()

    if len(single_class_indices) < step:
        raise ValueError(f"Requested to remove {step} samples, but only {len(single_class_indices)} single-class samples from non-protected classes are available.")

    # Seeds the global NumPy RNG so the selection is reproducible.
    np.random.seed(random_state)
    remove_indices = np.random.choice(single_class_indices, step, replace=False)

    X_new = X.drop(index=remove_indices).reset_index(drop=True)
    y_new = y.drop(index=remove_indices).reset_index(drop=True)
    return X_new, y_new

In [7023]:
def remove_samples_with_zero_classes(X, y, step, random_state=42):
    """Randomly drop ``step`` samples that carry no label at all.

    Args:
        X: feature DataFrame; must share its row index with ``y``.
        y: label DataFrame with one column per class (values convertible by
            ``pd.to_numeric``).
        step: number of unlabeled samples to remove.
        random_state: seed passed to ``np.random.seed`` before sampling.

    Returns:
        tuple: ``(X_new, y_new)`` with the chosen rows dropped and the index
        reset to 0..n-1.

    Raises:
        ValueError: if ``step`` exceeds the number of unlabeled samples.
    """
    labels = y.apply(pd.to_numeric)
    is_unlabeled = labels.sum(axis=1) == 0
    zero_class_indices = labels.index[is_unlabeled].tolist()

    if len(zero_class_indices) < step:
        raise ValueError(f"Requested to remove {step} samples, but only {len(zero_class_indices)} samples with zero classes are available.")

    # Seeds the global NumPy RNG so the selection is reproducible.
    np.random.seed(random_state)
    remove_indices = np.random.choice(zero_class_indices, step, replace=False)

    X_new = X.drop(index=remove_indices).reset_index(drop=True)
    y_new = y.drop(index=remove_indices).reset_index(drop=True)
    return X_new, y_new

# Dataset trimming

In [6159]:
# Drop the least-represented 50% of the classes (count rounded up) together with
# every sample left without a label, then print the updated summary.
# NOTE: rebinds X and y, so each re-run of this cell trims the data further.
X, y  = remove_bottom_percent_classes(X, y, percent=0.5)
class_percentages_sorted = show_summary(X, y)

Number of samples: 27700
Number of classes: 11
Cardinality: 2.012
Samples with no labels: 0
class02    61.075812
class19    32.570397
class06    30.119134
class12    16.288809
class05    14.202166
class08    10.703971
class13     9.963899
class07     8.498195
class01     6.729242
class14     5.696751
class18     5.368231
dtype: float64


In [7138]:
# Remove the most-represented class column and the samples whose only label it was,
# then print the updated summary.
# NOTE: rebinds X and y, so each re-run of this cell removes the next top class.
X, y  = remove_class(X, y, least_represented=False)
class_percentages_sorted = show_summary(X, y)

Number of samples: 8351
Number of classes: 3
Cardinality: 1.259
Samples with no labels: 801
IsAmbianceGood    49.275536
IsServiceGood     48.473237
IsPriceGood       28.164292
dtype: float64


In [7180]:
# Remove the least-represented class column and the samples whose only label it was,
# then print the updated summary.
# NOTE: rebinds X and y, so each re-run of this cell removes the next bottom class.
X, y  = remove_class(X, y, least_represented=True)
class_percentages_sorted = show_summary(X, y)

Number of samples: 10589
Number of classes: 4
Cardinality: 1.593
Samples with no labels: 801
IsFoodGood        59.958447
IsAmbianceGood    38.861082
IsServiceGood     38.228350
IsPriceGood       22.211729
dtype: float64


In [7303]:
# Trim samples of the top class: randomly remove 20 samples of the first class in
# the percentage ranking, never touching samples that carry one of the bottom 30%
# (count rounded up) classes.
# Assumes the helper returns the ranking in descending order, so index[0] is the
# most populated class — TODO confirm.
class_percentages_sorted = get_sorted_class_percentages(y)

chosen_class = class_percentages_sorted.index[0]
print("Most populated class:", chosen_class)

# NOTE: rebinds X and y; the cell is meant to be re-run until the balance is acceptable.
X, y = remove_few_samples_of_class(X, y, chosen_class, 20, get_least_represented_class_names(class_percentages_sorted, 0.3))
class_percentages_sorted = show_summary(X, y)

Most populated class: IsAmbianceGood
Number of samples: 4638
Number of classes: 4
Cardinality: 2.151
Samples with no labels: 0
IsFoodGood        54.937473
IsServiceGood     54.829668
IsAmbianceGood    54.614058
IsPriceGood       50.711514
dtype: float64


In [None]:
# Trim samples of the bottom class: randomly remove 30 samples of the last class in
# the percentage ranking, never touching samples that carry one of the top 30%
# (count rounded up) classes.
# Assumes the helper returns the ranking in descending order, so index[-1] is the
# least populated class — TODO confirm.
class_percentages_sorted = get_sorted_class_percentages(y)

chosen_class = class_percentages_sorted.index[-1]
print("Least populated class:", chosen_class)

# NOTE: rebinds X and y; each re-run removes another 30 samples.
X, y = remove_few_samples_of_class(X, y, chosen_class, 30, get_most_represented_class_name(class_percentages_sorted, 0.3))
class_percentages_sorted = show_summary(X, y)

Least populated class: Music
Number of samples: 43940
Number of classes: 10
Cardinality: 1.419
Samples with no labels: 0
Crime        18.049613
Family       17.967683
Horror       17.851616
Adventure    17.835685
Animation    16.973145
Sci-Fi       12.489759
Mystery      11.843423
Fantasy      11.745562
Western       9.535731
Music         7.571689
dtype: float64


In [7246]:
# Randomly remove 10 single-label samples, protecting every sample that carries one
# of the bottom 60% (count rounded up) classes, so only top-class samples are removed.
# Uses class_percentages_sorted left by the previously executed cell.
# NOTE: rebinds X and y; each re-run removes another 10 samples.
X, y = remove_samples_with_one_class(X, y, step=10, protected_classes=get_least_represented_class_names(class_percentages_sorted, 0.6))
class_percentages_sorted = show_summary(X, y)

Number of samples: 9138
Number of classes: 4
Cardinality: 1.774
Samples with no labels: 0
IsFoodGood        62.365944
IsAmbianceGood    45.031736
IsServiceGood     44.298534
IsPriceGood       25.738674
dtype: float64


In [451]:
# Randomly remove 1 single-label sample, protecting every sample that carries one
# of the top 10% (count rounded up) classes, so only lower-ranked classes lose samples.
# Uses class_percentages_sorted left by the previously executed cell.
# NOTE: rebinds X and y; each re-run removes one more sample.
X, y = remove_samples_with_one_class(X, y, step=1, protected_classes=get_most_represented_class_name(class_percentages_sorted, 0.1))
class_percentages_sorted = show_summary(X, y)

Number of samples: 4735
Number of classes: 35
Cardinality: 1.517
Samples with no labels: 0
TAG_bibteximport         11.024287
TAG_software              9.355861
TAG_learning              6.483633
TAG_evolution             6.209081
TAG_apob                  6.103485
TAG_immunoassay           5.237592
TAG_design                4.899683
TAG_ontology              4.899683
TAG_semantic              4.899683
TAG_model                 4.899683
TAG_web                   4.815206
TAG_social                4.392819
TAG_theory                4.329461
TAG_analysis              4.308342
TAG_networks              4.287223
TAG_electrochemistry      4.076030
TAG_mathematics           3.991552
TAG_knowledge             3.843717
TAG_requirements          3.801478
TAG_myown                 3.738120
TAG_semanticweb           3.695882
TAG_network               3.674762
TAG_information           3.569166
TAG_dynamics              3.526927
TAG_topic3                3.189018
TAG_bettasplendens        2.998944

In [7189]:
# Randomly remove 801 samples that carry no label at all, then print the summary.
# 801 presumably equals the "Samples with no labels" count reported by show_summary
# for this dataset; the call raises ValueError if fewer unlabeled samples exist.
X, y = remove_samples_with_zero_classes(X, y, step=801)
class_percentages_sorted = show_summary(X, y)

Number of samples: 9708
Number of classes: 4
Cardinality: 1.729
Samples with no labels: 0
IsFoodGood        64.575608
IsAmbianceGood    42.387721
IsServiceGood     41.697569
IsPriceGood       24.227441
dtype: float64


# Trimmed dataset - info

In [7136]:
# Print the summary of the current X / y and refresh class_percentages_sorted,
# which the save cells below write to class_percentages.csv.
class_percentages_sorted = show_summary(X, y)

Number of samples: 10806
Number of classes: 5
Cardinality: 1.638
Samples with no labels: 801
IsFoodGood        58.754396
IsAmbianceGood    38.080696
IsServiceGood     37.460670
IsPriceGood       21.765686
IsDealsGood        7.764205
dtype: float64


# Save

### Balanced

In [7122]:
# Create the balanced output directory if needed and write the current X and y
# as CSV files without the index column (existing files are overwritten).
DATASET_DIR_BALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_BALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_BALANCED / "y.csv", index=False)

In [7123]:
# Save the class percentages next to the balanced data, without a header row.
class_percentages_sorted.to_csv(DATASET_DIR_BALANCED / "class_percentages.csv", header=False)

### Unbalanced

In [195]:
# Create the unbalanced output directory if needed and write the current X and y
# as CSV files without the index column (existing files are overwritten).
# NOTE(review): this writes the same X / y variables as the "Balanced" cell, so only
# the save cell matching the trimming that was actually run should be executed.
DATASET_DIR_UNBALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_UNBALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_UNBALANCED / "y.csv", index=False)

In [196]:
# Save the class percentages next to the unbalanced data, without a header row.
class_percentages_sorted.to_csv(DATASET_DIR_UNBALANCED / "class_percentages.csv", header=False)