# Setup

In [1]:
import os
import sys
from pathlib import Path
import arff
import pandas as pd
import numpy as np

In [2]:
# Make the directory four levels above the working directory importable so that the
# `src` package imported in the next cell can be found (presumably the repository
# root — the relative path is resolved against the kernel's current directory).
# Guarded so that re-running this cell does not append duplicate sys.path entries.
_project_root = os.path.abspath("../../../..")
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [3]:
from src.experiment.helpers.variables import dataset_root_dir
from src.experiment.helpers.utils import get_sorted_class_percentages, get_cardinality

In [4]:
# Dataset name; used below as the sub-directory name under `multilabel`.
NAME = "bookmarks"

In [5]:
# Directory of the original (untrimmed) dataset; X.csv and y.csv are read from here.
DATASET_DIR = dataset_root_dir / 'multilabel' / NAME
print(DATASET_DIR)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\bookmarks


In [6]:
# Output directory for the trimmed dataset: a sibling of DATASET_DIR whose name
# is the dataset name with "_trimmed" appended.
trimmed_name = NAME + "_trimmed"
DATASET_DIR_TRIMMED = dataset_root_dir / 'multilabel' / trimmed_name
print(DATASET_DIR_TRIMMED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\bookmarks_trimmed


# Dataset loading

In [270]:
# Load the feature matrix (X) and the label matrix (y) of the original dataset.
# NOTE(review): the trimming cells further down overwrite X and y in place, so
# re-run this cell to get back to the full dataset before trimming again.
X = pd.read_csv(DATASET_DIR / 'X.csv')
y = pd.read_csv(DATASET_DIR / 'y.csv')

In [271]:
# Preview of the feature matrix (pandas truncates the displayed rows and columns).
X

Unnamed: 0,a,aber,ability,absolutely,abstract,academic,accept,accepted,access,accessible,...,young,your,youtube,zeit,zip,zoom,zum,zur,zwei,zwischen
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87851,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
87852,0,0,0,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
87853,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87854,0,1,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0


In [272]:
# Preview of the label matrix (pandas truncates the displayed rows and columns).
y

Unnamed: 0,TAG_20,TAG_academic,TAG_ajax,TAG_all,TAG_allgemein,TAG_api,TAG_apple,TAG_art,TAG_artery,TAG_article,...,TAG_web20,TAG_webdesign,TAG_webdev,TAG_webservice,TAG_wiki,TAG_wikipedia,TAG_windows,TAG_writing,TAG_xml,TAG_yahoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87851,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87852,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
87853,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87854,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Dataset info

In [8]:
# Number of samples, i.e. rows of the feature matrix.
len(X)

87856

In [9]:
# Number of classes, i.e. columns of the label matrix.
len(y.columns)

208

In [15]:
# Per-class percentages from the project helper; the printed result is ordered
# from the most to the least frequent class.
class_percentages_sorted = get_sorted_class_percentages(y)
print(class_percentages_sorted)

TAG_video                7.708068
TAG_books                7.025132
TAG_software             4.714533
TAG_tools                4.278592
TAG_blog                 3.774358
                           ...   
TAG_marketing            0.346021
TAG_searchengines        0.343744
TAG_clinical             0.343744
TAG_digital              0.342606
TAG_netscapebookmarks    0.341468
Length: 208, dtype: float64


In [13]:
# Label cardinality as computed by the project helper `get_cardinality`
# (presumably the mean number of labels per sample — TODO confirm).
cardinality = get_cardinality(y)
print("Cardinality:", cardinality)

Cardinality: 2.02814833363686


# Functions

In [10]:
def get_least_represented_class_names(class_percentages_sorted, percent):
    """Return the names of the least represented classes.

    Parameters
    ----------
    class_percentages_sorted : pandas.Series
        Per-class values indexed by class name. The *last* entries are taken,
        so the series must already be ordered from most to least represented.
    percent : float
        Fraction of the classes to select (e.g. ``0.05`` for 5 %). The
        resulting count is rounded up to a whole number of classes.

    Returns
    -------
    list
        Index labels of the selected classes, in the order in which they
        appear in the series. Empty when the computed count is zero.
    """
    num_classes = len(class_percentages_sorted)
    # Round before taking the ceiling: a product such as 100 * 0.07 evaluates to
    # 7.000000000000001 in binary floating point, and a bare ceil() would then
    # select 8 classes instead of the intended 7.
    exact_count = np.round(num_classes * percent, 9)
    bottom_percent_count = int(np.ceil(exact_count))
    return class_percentages_sorted.tail(bottom_percent_count).index.tolist()

In [11]:
def trim_class(X, y, class_name, step, protected_classes, random_state=47):
    """Randomly remove ``step`` samples of one class that carry no protected label.

    A sample is a removal candidate when its ``class_name`` label equals 1 and
    the sum of its labels over ``protected_classes`` is 0.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. It must share its index labels with ``y``, because the
        labels selected from ``y`` are dropped from both frames.
    y : pandas.DataFrame
        Label matrix with one column per class.
    class_name : str
        Column of ``y`` whose samples are trimmed.
    step : int
        Number of samples to remove.
    protected_classes : list
        Columns of ``y``; samples labelled with any of them are never removed.
    random_state : int, optional
        Seed of the local random generator (default 47, the seed this function
        has always used).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``X`` and ``y`` without the removed rows, both re-indexed from 0.

    Raises
    ------
    ValueError
        If ``step`` exceeds the number of removable samples.
    """
    # Labels may have been read as strings; compare on numeric values.
    y_numeric = y.apply(pd.to_numeric)

    has_class = y_numeric[class_name] == 1
    has_no_protected = y_numeric[protected_classes].sum(axis=1) == 0
    relaxing_indices = y_numeric.index[has_class & has_no_protected].tolist()

    # Fail with an explicit message (same style as remove_single_class_samples)
    # instead of NumPy's generic sampling error.
    if step > len(relaxing_indices):
        raise ValueError(
            f"Requested to remove {step} samples of class '{class_name}', "
            f"but only {len(relaxing_indices)} samples without protected classes are available."
        )

    # A local RandomState yields the same draw as np.random.seed(random_state)
    # followed by np.random.choice, without resetting the global NumPy RNG.
    rng = np.random.RandomState(random_state)
    remove_indices = rng.choice(relaxing_indices, step, replace=False)

    X_trimmed = X.drop(remove_indices).reset_index(drop=True)
    y_trimmed = y.drop(remove_indices).reset_index(drop=True)

    return X_trimmed, y_trimmed

In [12]:
def remove_all_class_samples(X, y, least_represented=True):
    """Remove one class entirely: its label column and every sample labelled with it.

    The class is the column of ``y`` with the smallest column sum when
    ``least_represented`` is true, otherwise the one with the largest.

    Returns the filtered ``(X, y)`` pair, both re-indexed from 0; the returned
    ``y`` no longer contains the removed class column.
    """
    per_class_totals = y.sum()
    if least_represented:
        target_class = per_class_totals.idxmin()
    else:
        target_class = per_class_totals.idxmax()

    # Index labels of every sample that carries the selected class.
    rows_with_class = y.index[y[target_class] == 1]

    X_kept = X.drop(rows_with_class).reset_index(drop=True)
    y_kept = (
        y.drop(rows_with_class)
        .drop(columns=[target_class])
        .reset_index(drop=True)
    )
    return X_kept, y_kept

In [13]:
def remove_bottom_percent_classes(X, y, percent):
    """Remove the least represented classes and every sample labelled with any of them.

    ``percent`` is the fraction of classes to remove; the classes are chosen by
    ``get_least_represented_class_names`` from the sorted class percentages of
    ``y``. Returns the filtered ``(X, y)`` pair, both re-indexed from 0, with
    the removed class columns dropped from ``y``.
    """
    percentages = get_sorted_class_percentages(y)
    classes_to_drop = get_least_represented_class_names(percentages, percent)

    # A sample is removed when it carries at least one of the dropped classes.
    carries_dropped_class = y[classes_to_drop].sum(axis=1) > 0
    rows_to_drop = y.index[carries_dropped_class]

    X_new = X.drop(rows_to_drop).reset_index(drop=True)
    y_new = (
        y.drop(rows_to_drop)
        .drop(columns=classes_to_drop)
        .reset_index(drop=True)
    )
    return X_new, y_new

In [14]:
def show_summary(X, y):
    """Print the size, cardinality and class percentages of a dataset.

    Prints the number of rows of ``X``, the number of columns of ``y``, the
    cardinality from ``get_cardinality`` rounded to three decimals, and the
    result of ``get_sorted_class_percentages``; that last result is also
    returned so callers can keep working with it.
    """
    sample_count = len(X)
    print("Number of samples:", sample_count)
    class_count = y.shape[1]
    print("Number of classes:", class_count)

    rounded_cardinality = round(get_cardinality(y), 3)
    print("Cardinality:", rounded_cardinality)

    percentages = get_sorted_class_percentages(y)
    print(percentages)
    return percentages

In [16]:
def remove_single_class_samples(X, y, step, random_state=42):
    """Randomly remove ``step`` samples whose labels sum to exactly 1.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. It must share its index labels with ``y``, because the
        labels selected from ``y`` are dropped from both frames.
    y : pandas.DataFrame
        Label matrix; a row is a candidate when its values sum to 1.
    step : int
        Number of samples to remove.
    random_state : int, optional
        Seed of the local random generator (default 42).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``X`` and ``y`` without the removed rows, both re-indexed from 0.

    Raises
    ------
    ValueError
        If ``step`` exceeds the number of single-class samples.
    """
    single_class_indices = y.index[y.sum(axis=1) == 1].tolist()
    if step > len(single_class_indices):
        raise ValueError(f"Requested to remove {step} samples, but only {len(single_class_indices)} single-class samples are available.")

    # A local RandomState yields the same draw as np.random.seed(random_state)
    # followed by np.random.choice, without resetting the global NumPy RNG.
    rng = np.random.RandomState(random_state)
    remove_indices = rng.choice(single_class_indices, step, replace=False)
    X_new = X.drop(remove_indices).reset_index(drop=True)
    y_new = y.drop(remove_indices).reset_index(drop=True)
    return X_new, y_new

# Dataset trimming

In [315]:
# Drop the bottom 5 % of classes (count rounded up) together with every sample
# labelled with any of them, then print the resulting summary.
# NOTE(review): X and y are overwritten, so every execution trims further; the
# recorded output (63 classes) appears to come from repeated runs — confirm how
# many executions are intended.
X, y  = remove_bottom_percent_classes(X, y, percent=0.05)
class_percentages_sorted = show_summary(X, y)

Number of samples: 13503
Number of classes: 63
Cardinality: 2.227
TAG_journal        11.849219
TAG_recept         11.693698
TAG_medical        11.478931
TAG_recipe         11.449308
TAG_computing       9.412723
                     ...    
TAG_freeware        1.421906
TAG_development     1.414500
TAG_all             1.384877
TAG_technology      1.340443
TAG_media           1.296008
Length: 63, dtype: float64


In [540]:
# Remove the least represented class and every sample labelled with it.
# X and y are overwritten, so each execution removes one more class.
X, y  = remove_all_class_samples(X, y, least_represented=True)
class_percentages_sorted = show_summary(X, y)

Number of samples: 804
Number of classes: 46
Cardinality: 1.832
TAG_xml                          4.353234
TAG_tools                        4.353234
TAG_technology                   4.353234
TAG_medical                      4.353234
TAG_library                      4.353234
TAG_journal                      4.353234
TAG_mac                          4.228856
TAG_media                        4.228856
TAG_books                        4.228856
TAG_science                      4.228856
TAG_politics                     4.228856
TAG_security                     4.228856
TAG_blog                         4.104478
TAG_ajax                         4.104478
TAG_watersports                  4.104478
TAG_freeware                     4.104478
TAG_howto                        4.104478
TAG_rss                          4.104478
TAG_resources                    4.104478
TAG_sports                       4.104478
TAG_video                        4.104478
TAG_web20                        4.104478
TAG_article 

In [407]:
# Remove the most represented class and every sample labelled with it.
# X and y are overwritten, so each execution removes one more class.
X, y  = remove_all_class_samples(X, y, least_represented=False)
class_percentages_sorted = show_summary(X, y)

Number of samples: 2369
Number of classes: 56
Cardinality: 1.986
TAG_javascript                   5.192064
TAG_ajax                         5.023217
TAG_history                      4.981005
TAG_blog                         4.812157
TAG_semanticweb                  4.812157
TAG_design                       4.769945
TAG_news                         4.769945
TAG_webdesign                    4.769945
TAG_wiki                         4.727733
TAG_ontology                     4.685521
TAG_web20                        4.643309
TAG_video                        4.601098
TAG_media                        4.474462
TAG_library                      4.474462
TAG_computing                    4.432250
TAG_books                        4.390038
TAG_rdf                          4.221190
TAG_css                          4.136767
TAG_politics                     4.094555
TAG_medical                      4.094555
TAG_opensource                   4.010131
TAG_programming                  4.010131
TAG_xml    

In [545]:
class_percentages_sorted = get_sorted_class_percentages(y)

# Protect the bottom 90 % of classes: samples labelled with any of them are
# never removed by trim_class.
least_represented_classes = get_least_represented_class_names(class_percentages_sorted, 0.9)
# First entry of the sorted percentages, reported below as the most populated class.
class_chosen_for_trimming = class_percentages_sorted.index[0]
print("Most populated class:", class_chosen_for_trimming)

# Remove one randomly chosen, unprotected sample of that class per execution.
# X and y are overwritten, so the cell is meant to be run repeatedly.
X, y = trim_class(X, y, class_chosen_for_trimming, 1, least_represented_classes)
class_percentages_sorted = show_summary(X, y)

Most populated class: TAG_technology
Number of samples: 800
Number of classes: 46
Cardinality: 1.835
TAG_tools                        4.375
TAG_books                        4.250
TAG_library                      4.250
TAG_xml                          4.250
TAG_technology                   4.250
TAG_medical                      4.250
TAG_media                        4.250
TAG_science                      4.250
TAG_politics                     4.250
TAG_security                     4.250
TAG_journal                      4.250
TAG_mac                          4.250
TAG_blog                         4.125
TAG_ajax                         4.125
TAG_watersports                  4.125
TAG_freeware                     4.125
TAG_howto                        4.125
TAG_rss                          4.125
TAG_resources                    4.125
TAG_sports                       4.125
TAG_video                        4.125
TAG_web20                        4.125
TAG_article                      4.000
TA

In [532]:
# Remove randomly chosen samples whose labels sum to exactly 1.
# FIXME(review): the recorded run raised ValueError because only 270 single-class
# samples were available for step=2000. As written this cell stops a
# "Restart & Run All" before the save cells — lower `step` or delete the cell.
X, y = remove_single_class_samples(X, y, step=2000)
class_percentages_sorted = show_summary(X, y)

ValueError: Requested to remove 2000 samples, but only 270 single-class samples are available.

# Trimmed dataset - info

In [546]:
# Summary of the trimmed dataset; the returned class percentages are what the
# last cell of the notebook writes to class_percentages.csv.
class_percentages_sorted = show_summary(X, y)

Number of samples: 800
Number of classes: 46
Cardinality: 1.835
TAG_tools                        4.375
TAG_books                        4.250
TAG_library                      4.250
TAG_xml                          4.250
TAG_technology                   4.250
TAG_medical                      4.250
TAG_media                        4.250
TAG_science                      4.250
TAG_politics                     4.250
TAG_security                     4.250
TAG_journal                      4.250
TAG_mac                          4.250
TAG_blog                         4.125
TAG_ajax                         4.125
TAG_watersports                  4.125
TAG_freeware                     4.125
TAG_howto                        4.125
TAG_rss                          4.125
TAG_resources                    4.125
TAG_sports                       4.125
TAG_video                        4.125
TAG_web20                        4.125
TAG_article                      4.000
TAG_reference                    4.000


# Save

In [547]:
# to_csv does not create missing directories, so make sure the output directory
# exists before writing (a no-op when it is already there).
os.makedirs(DATASET_DIR_TRIMMED, exist_ok=True)

# Write the trimmed feature and label matrices without the pandas index column.
X.to_csv(DATASET_DIR_TRIMMED / "X.csv", index=False)
y.to_csv(DATASET_DIR_TRIMMED / "y.csv", index=False)

In [536]:
# Store the class percentages of the trimmed dataset, written without a header row.
# NOTE(review): to_csv does not create the target directory; it must already exist.
class_percentages_sorted.to_csv(DATASET_DIR_TRIMMED / "class_percentages.csv", header=False)