# Setup

In [150]:
import os
import sys
from pathlib import Path
import arff
import pandas as pd
import numpy as np

In [151]:
# Add the directory four levels above the working directory to the import path
# so that the `src.*` imports in the next cell can be resolved.
# NOTE(review): this relies on the notebook's current working directory — confirm.
sys.path.append(os.path.abspath("../../../.."))

In [152]:
from src.experiment.helpers.variables import dataset_root_dir
from src.experiment.helpers.utils import get_sorted_class_percentages, get_cardinality

In [153]:
# Dataset name; used below to build the source and output directory paths.
NAME = "bibtex"

In [154]:
# Directory the source dataset (X.csv / y.csv) is read from.
DATASET_DIR = dataset_root_dir / 'multilabel' / NAME
print(DATASET_DIR)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\bibtex


In [None]:
# Output directory for the balanced variant: "<NAME>_balanced".
balanced_name = NAME + "_balanced"
DATASET_DIR_BALANCED = dataset_root_dir / 'multilabel' / balanced_name
print(DATASET_DIR_BALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\bibtex_trimmed


In [None]:
# Output directory for the unbalanced variant: "<NAME>_unbalanced".
unbalanced_name = NAME + "_unbalanced"
DATASET_DIR_UNBALANCED = dataset_root_dir / 'multilabel' / unbalanced_name
print(DATASET_DIR_UNBALANCED)

# Dataset loading

In [824]:
# Load the features (X) and the per-class label columns (y) from the source directory.
X = pd.read_csv(DATASET_DIR / 'X.csv')
y = pd.read_csv(DATASET_DIR / 'y.csv')

In [825]:
# Preview the feature frame (pandas truncates the displayed rows and columns).
X

Unnamed: 0,0,000,02,05,06,1,10,100,11,12,...,years,yet,yield,yields,you,young,z,zero,zu,zur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7390,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7391,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7392,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7393,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [826]:
# Preview the label frame (pandas truncates the displayed rows and columns).
y

Unnamed: 0,TAG_2005,TAG_2006,TAG_2007,TAG_agdetection,TAG_algorithms,TAG_amperometry,TAG_analysis,TAG_and,TAG_annotation,TAG_antibody,...,TAG_topic7,TAG_topic8,TAG_topic9,TAG_toread,TAG_transition,TAG_visual,TAG_visualization,TAG_web,TAG_web20,TAG_wiki
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7390,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7391,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7392,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7393,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


# Dataset info

In [228]:
# Number of samples (rows of X)
len(X)

7395

In [229]:
# Number of classes (label columns of y)
len(y.columns)

159

In [230]:
# Per-class percentages; the printed result is ordered from the most to the
# least represented class.
class_percentages_sorted = get_sorted_class_percentages(y)
print(class_percentages_sorted)

TAG_statphys23      14.090602
TAG_bibteximport     7.058824
TAG_software         5.990534
TAG_learning         4.151454
TAG_evolution        3.975659
                      ...    
TAG_disability       0.703178
TAG_empirical        0.703178
TAG_computing        0.689655
TAG_cortex           0.689655
TAG_survey           0.689655
Length: 159, dtype: float64


In [231]:
# Label cardinality of y as computed by the project helper.
# NOTE(review): presumably the mean number of labels per sample — confirm in get_cardinality.
cardinality = get_cardinality(y)
print("Cardinality:", cardinality)

Cardinality: 2.401893171061528


# Functions

In [232]:
def get_least_represented_class_names(class_percentages_sorted, percent):
    """Return the names of the least represented classes.

    Parameters
    ----------
    class_percentages_sorted : pd.Series
        Per-class values indexed by class name. The entries are taken from the
        tail of the series, so it is expected to be sorted from the most to
        the least represented class.
    percent : float
        Fraction of the classes to return (e.g. ``0.1`` for the bottom 10 %).
        The resulting count is rounded up.

    Returns
    -------
    list
        Index labels of the last ``ceil(len(series) * percent)`` entries, in
        the order in which they appear in the series.

    Raises
    ------
    ValueError
        If ``percent`` is negative (a negative count would make ``tail``
        return almost every class instead of none).
    """
    if percent < 0:
        raise ValueError(f"percent must be non-negative, got {percent}.")

    num_classes = len(class_percentages_sorted)
    # Strip floating-point noise before taking the ceiling: for example
    # 100 * 0.07 evaluates to 7.000000000000001, which would otherwise be
    # rounded up to 8 classes instead of 7.
    bottom_percent_count = int(np.ceil(round(num_classes * percent, 9)))
    return class_percentages_sorted.tail(bottom_percent_count).index.tolist()

In [233]:
# removes SAMPLES
def remove_few_samples_of_class(X, y, class_name, step, protected_classes, random_state=47):
    """Randomly drop ``step`` samples of ``class_name`` that carry no protected class.

    Parameters
    ----------
    X, y : pd.DataFrame
        Feature and label frames sharing the same index.
    class_name : str
        Label column whose samples should be reduced.
    step : int
        Number of samples to remove.
    protected_classes : list
        Label columns that must not lose samples; a sample with any of these
        labels set is never removed.
    random_state : int, default 47
        Seed for the sample selection.

    Returns
    -------
    tuple of pd.DataFrame
        ``(X_trimmed, y_trimmed)`` without the removed rows and with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If fewer than ``step`` removable samples exist.
    """
    y_numeric = y.apply(pd.to_numeric)

    belongs_to_class = y_numeric[class_name] == 1
    has_no_protected_class = y_numeric[protected_classes].sum(axis=1) == 0
    available_indices = y_numeric.index[belongs_to_class & has_no_protected_class].tolist()

    # Fail with a descriptive message, like the other removal helpers,
    # instead of surfacing NumPy's generic "larger sample than population" error.
    if step > len(available_indices):
        raise ValueError(
            f"Requested to remove {step} samples of class '{class_name}', "
            f"but only {len(available_indices)} samples without protected classes are available."
        )

    # A local legacy RandomState draws the same sample as
    # np.random.seed(seed) + np.random.choice, but leaves the global RNG untouched.
    rng = np.random.RandomState(random_state)
    remove_indices = rng.choice(available_indices, step, replace=False)

    X_trimmed = X.drop(remove_indices).reset_index(drop=True)
    y_trimmed = y.drop(remove_indices).reset_index(drop=True)

    return X_trimmed, y_trimmed

In [234]:
# removes a CLASS
# samples labelled with that class alone are deleted together with it
# samples that also carry other classes are kept (minus the removed column)
def remove_class(X, y, least_represented=True):
    """Drop the least (or most) populated class column from ``y``.

    The target is the column of ``y`` with the smallest column sum when
    ``least_represented`` is true, otherwise the one with the largest sum.
    Rows whose only label is the target class are removed from both frames;
    the remaining rows get a fresh 0..n-1 index.

    Returns
    -------
    tuple of pd.DataFrame
        ``(X_new, y_new)`` where ``y_new`` no longer contains the target column.
    """
    per_class_totals = y.sum()
    if least_represented:
        target_class = per_class_totals.idxmin()
    else:
        target_class = per_class_totals.idxmax()

    # Keep every row that is not "exactly one label, and that label is the target".
    labels_per_sample = y.sum(axis=1)
    keep_mask = (labels_per_sample != 1) | (y[target_class] != 1)

    X_new = X.loc[keep_mask].reset_index(drop=True)
    y_kept = y.loc[keep_mask]
    y_new = y_kept.drop(columns=[target_class]).reset_index(drop=True)
    return X_new, y_new

In [235]:
# removes CLASSES
# samples left without any class after the removal are deleted
# samples that still carry at least one other class are kept
def remove_bottom_percent_classes(X, y, percent):
    """Drop the least represented ``percent`` of classes from ``y``.

    The classes to drop are the tail of ``get_sorted_class_percentages(y)`` as
    selected by ``get_least_represented_class_names``. Rows whose labels all
    belonged to dropped classes are removed from both frames, and the
    remaining rows get a fresh 0..n-1 index.

    Returns
    -------
    tuple of pd.DataFrame
        ``(X_new, y_new)`` restricted to the surviving classes and samples.
    """
    sorted_percentages = get_sorted_class_percentages(y)
    classes_to_drop = get_least_represented_class_names(sorted_percentages, percent)

    y_reduced = y.drop(columns=classes_to_drop)

    # A sample survives only if it still has at least one label set.
    still_labelled = y_reduced.sum(axis=1) != 0

    X_new = X.loc[still_labelled].reset_index(drop=True)
    y_new = y_reduced.loc[still_labelled].reset_index(drop=True)
    return X_new, y_new

In [236]:
def show_summary(X, y):
    """Print basic statistics of the dataset and return the class percentages.

    Prints the number of samples, the number of classes, the cardinality
    (rounded to three decimals), the number of samples without any label, and
    the sorted per-class percentages.

    Returns
    -------
    The result of ``get_sorted_class_percentages(y)``.
    """
    print("Number of samples:", len(X))
    print("Number of classes:", y.shape[1])

    print("Cardinality:", round(get_cardinality(y), 3))

    labels_per_sample = y.sum(axis=1)
    unlabelled_count = (labels_per_sample == 0).sum()
    print("Samples with no labels:", unlabelled_count)

    percentages = get_sorted_class_percentages(y)
    print(percentages)
    return percentages

In [237]:
# removes SAMPLES
def remove_samples_with_one_class(X, y, step, protected_classes, random_state=42):
    """Randomly drop ``step`` single-label samples that carry no protected class.

    Parameters
    ----------
    X, y : pd.DataFrame
        Feature and label frames sharing the same index.
    step : int
        Number of samples to remove.
    protected_classes : list
        Label columns that must not lose samples; a sample with any of these
        labels set is never removed.
    random_state : int, default 42
        Seed for the sample selection.

    Returns
    -------
    tuple of pd.DataFrame
        ``(X_new, y_new)`` without the removed rows and with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If fewer than ``step`` removable samples exist.
    """
    y_numeric = y.apply(pd.to_numeric)

    # Candidates: exactly one label set, and none of the protected labels set.
    has_no_protected_class = y_numeric[protected_classes].sum(axis=1) == 0
    has_single_class = y_numeric.sum(axis=1) == 1
    single_class_indices = y_numeric.index[has_no_protected_class & has_single_class].tolist()

    if step > len(single_class_indices):
        raise ValueError(f"Requested to remove {step} samples, but only {len(single_class_indices)} single-class samples from non-protected classes are available.")

    # A local legacy RandomState draws the same sample as
    # np.random.seed(seed) + np.random.choice, but leaves the global RNG untouched.
    rng = np.random.RandomState(random_state)
    remove_indices = rng.choice(single_class_indices, step, replace=False)
    X_new = X.drop(remove_indices).reset_index(drop=True)
    y_new = y.drop(remove_indices).reset_index(drop=True)
    return X_new, y_new

In [238]:
def remove_samples_with_zero_classes(X, y, step, random_state=42):
    """Randomly drop ``step`` samples that have no label set.

    Parameters
    ----------
    X, y : pd.DataFrame
        Feature and label frames sharing the same index.
    step : int
        Number of unlabelled samples to remove.
    random_state : int, default 42
        Seed for the sample selection.

    Returns
    -------
    tuple of pd.DataFrame
        ``(X_new, y_new)`` without the removed rows and with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If fewer than ``step`` unlabelled samples exist.
    """
    y_numeric = y.apply(pd.to_numeric)
    zero_class_indices = y_numeric.index[y_numeric.sum(axis=1) == 0].tolist()

    if step > len(zero_class_indices):
        raise ValueError(f"Requested to remove {step} samples, but only {len(zero_class_indices)} samples with zero classes are available.")

    # A local legacy RandomState draws the same sample as
    # np.random.seed(seed) + np.random.choice, but leaves the global RNG untouched.
    rng = np.random.RandomState(random_state)
    remove_indices = rng.choice(zero_class_indices, step, replace=False)
    X_new = X.drop(remove_indices).reset_index(drop=True)
    y_new = y.drop(remove_indices).reset_index(drop=True)
    return X_new, y_new

# Dataset trimming

In [890]:
# Drop the least represented 10 % of classes (count rounded up).
# NOTE: X and y are overwritten, so every re-run of this cell trims further.
X, y  = remove_bottom_percent_classes(X, y, percent=0.1)
class_percentages_sorted = show_summary(X, y)

Number of samples: 4360
Number of classes: 37
Cardinality: 1.569
Samples with no labels: 0
TAG_learning             6.559633
TAG_software             6.490826
TAG_apob                 6.399083
TAG_bibteximport         6.077982
TAG_immunoassay          5.688073
TAG_semantic             5.321101
TAG_model                5.321101
TAG_web                  5.252294
TAG_ontology             5.229358
TAG_design               5.183486
TAG_evolution            4.908257
TAG_social               4.701835
TAG_analysis             4.564220
TAG_theory               4.541284
TAG_networks             4.495413
TAG_electrochemistry     4.380734
TAG_mathematics          4.197248
TAG_knowledge            4.151376
TAG_myown                4.036697
TAG_semanticweb          4.036697
TAG_information          3.922018
TAG_network              3.899083
TAG_requirements         3.876147
TAG_dynamics             3.784404
TAG_topic3               3.417431
TAG_topic11              3.233945
TAG_systems              

In [892]:
# Drop the single least populated class.
# NOTE: X and y are overwritten, so every re-run of this cell removes another class.
X, y  = remove_class(X, y, least_represented=True)
class_percentages_sorted = show_summary(X, y)

Number of samples: 4275
Number of classes: 35
Cardinality: 1.54
Samples with no labels: 0
TAG_learning             6.690058
TAG_software             6.619883
TAG_apob                 6.526316
TAG_bibteximport         6.198830
TAG_immunoassay          5.801170
TAG_semantic             5.426901
TAG_model                5.426901
TAG_web                  5.356725
TAG_ontology             5.333333
TAG_design               5.286550
TAG_evolution            5.005848
TAG_social               4.795322
TAG_analysis             4.654971
TAG_theory               4.631579
TAG_networks             4.584795
TAG_electrochemistry     4.467836
TAG_mathematics          4.280702
TAG_knowledge            4.233918
TAG_myown                4.116959
TAG_semanticweb          4.116959
TAG_information          4.000000
TAG_network              3.976608
TAG_requirements         3.953216
TAG_dynamics             3.859649
TAG_topic3               3.485380
TAG_topic11              3.298246
TAG_systems              3

In [827]:
# Drop the single most populated class.
# NOTE: X and y are overwritten, so every re-run of this cell removes another class.
X, y  = remove_class(X, y, least_represented=False)
class_percentages_sorted = show_summary(X, y)

Number of samples: 7377
Number of classes: 158
Cardinality: 2.267
Samples with no labels: 0
TAG_bibteximport    7.076047
TAG_software        6.005151
TAG_learning        4.161583
TAG_evolution       3.985360
TAG_apob            3.971804
                      ...   
TAG_disability      0.704894
TAG_empirical       0.704894
TAG_computing       0.691338
TAG_cortex          0.691338
TAG_survey          0.691338
Length: 158, dtype: float64


In [1077]:
# Recompute the class percentages for the current y.
class_percentages_sorted = get_sorted_class_percentages(y)

# Protect the bottom 50 % of classes and take the first entry of the sorted
# percentages as the most populated class.
least_represented_classes = get_least_represented_class_names(class_percentages_sorted, 0.5)
most_represented_class = class_percentages_sorted.index[0]
print("Most populated class:", most_represented_class)

# Remove 2 samples of that class that carry none of the protected classes.
# NOTE: X and y are overwritten, so every re-run of this cell removes 2 more samples.
X, y = remove_few_samples_of_class(X, y, most_represented_class, 2, least_represented_classes)
class_percentages_sorted = show_summary(X, y)

Most populated class: TAG_model
Number of samples: 2887
Number of classes: 35
Cardinality: 1.764
Samples with no labels: 0
TAG_social               5.264981
TAG_immunoassay          5.264981
TAG_apob                 5.264981
TAG_network              5.264981
TAG_semantic             5.230343
TAG_analysis             5.230343
TAG_dynamics             5.230343
TAG_learning             5.230343
TAG_design               5.230343
TAG_electrochemistry     5.230343
TAG_software             5.195705
TAG_theory               5.195705
TAG_information          5.195705
TAG_ontology             5.195705
TAG_requirements         5.195705
TAG_networks             5.195705
TAG_model                5.195705
TAG_evolution            5.195705
TAG_bibteximport         5.195705
TAG_semanticweb          5.195705
TAG_knowledge            5.161067
TAG_mathematics          5.161067
TAG_topic3               5.161067
TAG_web                  5.161067
TAG_myown                5.161067
TAG_topic11              4.

In [1023]:
# Protect the bottom 90 % of classes, then remove 10 single-label samples
# that belong to none of them.
# NOTE(review): the recorded output of this cell is a ValueError (only 1
# removable sample was available) — confirm whether the cell should be kept.
least_represented_classes = get_least_represented_class_names(class_percentages_sorted, 0.9)

X, y = remove_samples_with_one_class(X, y, step=10, protected_classes=least_represented_classes)
class_percentages_sorted = show_summary(X, y)

ValueError: Requested to remove 10 samples, but only 1 single-class samples from non-protected classes are available.

In [376]:
# Remove samples that have no label at all.
# NOTE(review): remove_samples_with_zero_classes raises ValueError when `step`
# exceeds the number of unlabelled samples, so step=100000 fails unless that
# many exist (the recorded output is that error). If the intent is "remove all
# of them", pass the actual count instead — confirm.
X, y = remove_samples_with_zero_classes(X, y, step=100000)
class_percentages_sorted = show_summary(X, y)

ValueError: Requested to remove 100000 samples, but only 0 samples with zero classes are available.

# Trimmed dataset - info

In [1078]:
# Summary of the trimmed dataset; the returned percentages are saved below.
class_percentages_sorted = show_summary(X, y)

Number of samples: 2887
Number of classes: 35
Cardinality: 1.764
Samples with no labels: 0
TAG_social               5.264981
TAG_immunoassay          5.264981
TAG_apob                 5.264981
TAG_network              5.264981
TAG_semantic             5.230343
TAG_analysis             5.230343
TAG_dynamics             5.230343
TAG_learning             5.230343
TAG_design               5.230343
TAG_electrochemistry     5.230343
TAG_software             5.195705
TAG_theory               5.195705
TAG_information          5.195705
TAG_ontology             5.195705
TAG_requirements         5.195705
TAG_networks             5.195705
TAG_model                5.195705
TAG_evolution            5.195705
TAG_bibteximport         5.195705
TAG_semanticweb          5.195705
TAG_knowledge            5.161067
TAG_mathematics          5.161067
TAG_topic3               5.161067
TAG_web                  5.161067
TAG_myown                5.161067
TAG_topic11              4.883963
TAG_systems              

# Save

### Balanced

In [None]:
# Write the current X / y to the balanced output directory (created if missing).
# index=False keeps the row index out of the CSV files.
DATASET_DIR_BALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_BALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_BALANCED / "y.csv", index=False)

In [None]:
# Store the class percentages next to the balanced dataset (class name, value; no header row).
class_percentages_sorted.to_csv(DATASET_DIR_BALANCED / "class_percentages.csv", header=False)

### Unbalanced

In [None]:
# Write the current X / y to the unbalanced output directory (created if missing).
# The files go to DATASET_DIR_UNBALANCED; writing them to DATASET_DIR_BALANCED
# would overwrite the balanced dataset and leave this directory without data.
# index=False keeps the row index out of the CSV files.
DATASET_DIR_UNBALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_UNBALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_UNBALANCED / "y.csv", index=False)

In [None]:
# Store the class percentages next to the unbalanced dataset (class name, value; no header row).
class_percentages_sorted.to_csv(DATASET_DIR_UNBALANCED / "class_percentages.csv", header=False)