# Setup

In [1]:
import os
import sys
from pathlib import Path
import arff
import pandas as pd
import numpy as np

In [2]:
# Add the directory four levels above the working directory to the import path
# so the `src.experiment...` imports in the next cell can be resolved.
# NOTE(review): os.path.abspath resolves the relative path against the current
# working directory, so this presumably expects the kernel to be started from
# this notebook's own folder — confirm.
sys.path.append(os.path.abspath("../../../.."))

In [3]:
from src.experiment.helpers.variables import dataset_root_dir
from src.experiment.helpers.utils import get_sorted_class_percentages, get_cardinality

In [4]:
NAME = "slashdot"

In [5]:
# Source folder of the dataset selected by NAME, under the project's dataset root.
DATASET_DIR = dataset_root_dir / 'multilabel' / NAME
# Echo the resolved path so the reader can check which dataset is being used.
print(DATASET_DIR)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\slashdot


In [6]:
# The trimmed copy lives next to the original, in a folder named "<NAME>_trimmed".
trimmed_name = NAME + "_trimmed"
DATASET_DIR_TRIMMED = dataset_root_dir / 'multilabel' / trimmed_name
# Echo the resolved output path; the Save section writes its CSV files here.
print(DATASET_DIR_TRIMMED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\slashdot_trimmed


# Dataset loading

In [343]:
# Load the feature table (X) and the label table (y) of the original dataset.
# NOTE(review): every trimming cell further down rebinds X and y, so this cell
# has to be re-run to get back to the untouched data.
X = pd.read_csv(DATASET_DIR / 'X.csv')
y = pd.read_csv(DATASET_DIR / 'y.csv')

In [344]:
X

Unnamed: 0,0,000,1,10,100,11,12,15,1up,2,...,wrote,xbox,yahoo,year,years,yesterday,york,young,youtube,zdnet
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3777,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3778,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3779,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3780,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [345]:
y

Unnamed: 0,Idle,Science,Games,Technology,YourRightsOnline,AskSlashdot,IT,Mobile,Hardware,Politics,Linux,Developers,News,Apple,Entertainment,BookReviews,Interviews,BSD,Search,Apache
0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3777,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3778,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3779,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3780,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


# Dataset info

In [350]:
# number of samples
len(X)

3782

In [351]:
# number of classes
len(y.columns)

20

In [352]:
# Per-class share of samples for the untrimmed labels.
# NOTE(review): get_sorted_class_percentages is a project helper; judging by the
# recorded output it presumably returns percentages sorted from the most to the
# least frequent class — confirm against its definition.
class_percentages_sorted = get_sorted_class_percentages(y)
print(class_percentages_sorted)

Idle                15.441565
Science             15.203596
Games               13.326282
Technology          13.114754
YourRightsOnline    11.237441
AskSlashdot          6.848228
IT                   5.949233
Mobile               5.869910
Hardware             5.843469
Politics             5.446854
Linux                4.970915
Developers           4.521417
News                 2.987837
Apple                2.723427
Entertainment        1.983078
BookReviews          1.480698
Interviews           0.423057
BSD                  0.396616
Search               0.237969
Apache               0.079323
dtype: float64


In [353]:
# Label cardinality of the untrimmed dataset.
# NOTE(review): get_cardinality is a project helper; presumably it is the average
# number of labels per sample — confirm against its definition.
cardinality = get_cardinality(y)
print("Cardinality:", cardinality)

Cardinality: 1.1808566895822317


# Functions

In [14]:
def get_least_represented_class_names(class_percentages_sorted, percent):
    """Return the class names found in the trailing share of a class ranking.

    Parameters
    ----------
    class_percentages_sorted : pandas.Series
        Per-class values indexed by class name. The entries at the END of the
        Series are the ones returned, so the rarest classes are expected to
        come last.
    percent : float
        Fraction of all classes to return (for example ``0.5`` for half of
        them). The resulting count is rounded up.

    Returns
    -------
    list
        Index labels of the last ``ceil(len(series) * percent)`` entries, in
        the order they appear in the Series.
    """
    # Round up so that any non-zero fraction selects at least one class.
    how_many = int(np.ceil(len(class_percentages_sorted) * percent))
    trailing_entries = class_percentages_sorted.tail(how_many)
    return trailing_entries.index.tolist()

In [15]:
def trim_class(X, y, class_name, step, protected_classes, random_state=47):
    """Randomly drop ``step`` samples of one class that carry no protected label.

    A sample is eligible for removal when its ``class_name`` column equals 1
    and the row-wise sum over the ``protected_classes`` columns is 0.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows. Rows are dropped by the index labels taken from ``y``,
        so ``X`` is assumed to share its index with ``y``.
    y : pandas.DataFrame
        Label indicator table with one column per class.
    class_name : str
        Column of ``y`` whose samples are thinned out.
    step : int
        Number of eligible samples to remove.
    protected_classes : list
        Columns of ``y``; samples carrying any of these labels are never removed.
    random_state : int, default 47
        Seed of the sampler. The default reproduces the selection made by the
        previously hard-coded seed.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_trimmed, y_trimmed)`` with the chosen rows removed and the index
        reset to ``0..n-1``.

    Raises
    ------
    ValueError
        If ``step`` is larger than the number of eligible samples.
    """
    # Coerce every label column to a numeric dtype so the comparisons and the
    # row-wise sum below behave the same even if labels were read as strings.
    y_numeric = y.apply(pd.to_numeric)

    has_target_label = y_numeric[class_name] == 1
    has_no_protected_label = y_numeric[protected_classes].sum(axis=1) == 0
    removable_indices = y_numeric.index[has_target_label & has_no_protected_label].tolist()

    # Fail with an explicit message instead of numpy's generic sampling error.
    if step > len(removable_indices):
        raise ValueError(
            f"Requested to remove {step} samples of class '{class_name}', "
            f"but only {len(removable_indices)} unprotected samples are available."
        )

    # A local RandomState draws exactly what np.random.seed(seed) followed by
    # np.random.choice would draw, without reseeding the global generator.
    rng = np.random.RandomState(random_state)
    remove_indices = rng.choice(removable_indices, step, replace=False)

    X_trimmed = X.drop(remove_indices).reset_index(drop=True)
    y_trimmed = y.drop(remove_indices).reset_index(drop=True)

    return X_trimmed, y_trimmed

In [16]:
def remove_all_class_samples(X, y, least_represented=True):
    """Eliminate one whole class: every sample labelled with it and its column.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; rows are dropped by the index labels taken from ``y``.
    y : pandas.DataFrame
        Label indicator table with one column per class.
    least_represented : bool, default True
        When true the class with the smallest column sum is removed, otherwise
        the class with the largest column sum.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_new, y_new)`` without the samples of the removed class, with the
        index reset; ``y_new`` additionally lacks the removed class column.
    """
    label_totals = y.sum()
    if least_represented:
        victim = label_totals.idxmin()
    else:
        victim = label_totals.idxmax()

    # Every row labelled with the chosen class goes away.
    doomed_rows = y.index[y[victim] == 1]

    X_kept = X.drop(doomed_rows).reset_index(drop=True)
    y_kept = y.drop(doomed_rows).drop(columns=[victim]).reset_index(drop=True)
    return X_kept, y_kept

In [17]:
def remove_bottom_percent_classes(X, y, percent):
    """Drop the trailing share of classes together with all of their samples.

    The class ranking comes from ``get_sorted_class_percentages(y)`` and the
    classes to remove from ``get_least_represented_class_names(ranking, percent)``.
    Any sample carrying at least one of those labels is removed, and the
    corresponding label columns are dropped from ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; rows are dropped by the index labels taken from ``y``.
    y : pandas.DataFrame
        Label indicator table with one column per class.
    percent : float
        Fraction of the classes to remove (count is rounded up by the helper).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_new, y_new)`` with the index reset to ``0..n-1``.
    """
    ranking = get_sorted_class_percentages(y)
    rare_classes = get_least_represented_class_names(ranking, percent)

    # A positive row sum over the rare columns means the sample has a rare label.
    carries_rare_label = y[rare_classes].sum(axis=1) > 0
    rows_to_drop = y.index[carries_rare_label]

    X_new = X.drop(rows_to_drop).reset_index(drop=True)
    y_new = y.drop(rows_to_drop).drop(columns=rare_classes).reset_index(drop=True)
    return X_new, y_new

In [18]:
def show_summary(X, y):
    """Print a short report about the dataset and return its class distribution.

    Printed, in order: the number of rows in ``X``, the number of columns in
    ``y``, the value of ``get_cardinality(y)`` rounded to three decimals, and
    the result of ``get_sorted_class_percentages(y)``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows (only its length is used).
    y : pandas.DataFrame
        Label indicator table with one column per class.

    Returns
    -------
    The object returned by ``get_sorted_class_percentages(y)``, so callers can
    keep using it after the report has been printed.
    """
    sample_count = len(X)
    print("Number of samples:", sample_count)
    class_count = y.shape[1]
    print("Number of classes:", class_count)

    print("Cardinality:", round(get_cardinality(y), 3))

    distribution = get_sorted_class_percentages(y)
    print(distribution)
    return distribution

In [19]:
def remove_single_class_samples(X, y, step, random_state=42):
    """Randomly drop ``step`` samples that carry exactly one label.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; rows are dropped by the index labels taken from ``y``.
    y : pandas.DataFrame
        Label indicator table with one column per class.
    step : int
        Number of single-label samples to remove.
    random_state : int, default 42
        Seed of the sampler.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_new, y_new)`` with the chosen rows removed and the index reset to
        ``0..n-1``.

    Raises
    ------
    ValueError
        If ``step`` is larger than the number of single-label samples.
    """
    # A row sum of exactly 1 means the sample belongs to a single class.
    single_class_indices = y.index[y.sum(axis=1) == 1].tolist()
    if step > len(single_class_indices):
        raise ValueError(f"Requested to remove {step} samples, but only {len(single_class_indices)} single-class samples are available.")

    # A local RandomState draws exactly what np.random.seed(random_state)
    # followed by np.random.choice would draw, but leaves the global numpy
    # generator untouched for the rest of the notebook.
    rng = np.random.RandomState(random_state)
    remove_indices = rng.choice(single_class_indices, step, replace=False)

    X_new = X.drop(remove_indices).reset_index(drop=True)
    y_new = y.drop(remove_indices).reset_index(drop=True)
    return X_new, y_new

# Dataset trimming

In [587]:
# Drop the trailing 60% of the class ranking (count rounded up) and every sample
# that carries one of those labels, then print the resulting summary.
# NOTE(review): X and y are rebound, so re-running this cell trims the already
# trimmed data again; the execution count of this cell is also out of sequence
# with its neighbours, so the recorded output may not match a fresh run.
X, y  = remove_bottom_percent_classes(X, y, percent=0.6)
class_percentages_sorted = show_summary(X, y)

Number of samples: 12928
Number of classes: 6
Cardinality: 2.05
Class68    66.893564
Class34    58.121906
Class67    36.788366
Class32    32.634592
Class66    10.550743
Class52     0.054146
dtype: float64


In [365]:
# Remove the class with the smallest label count: its samples are dropped and
# its column is removed from y. Then print the resulting summary.
# NOTE(review): X and y are rebound, so every re-run removes one more class.
X, y  = remove_all_class_samples(X, y, least_represented=True)
class_percentages_sorted = show_summary(X, y)

Number of samples: 2710
Number of classes: 8
Cardinality: 1.101
Idle                21.143911
Science             20.442804
Games               16.605166
Technology          15.239852
YourRightsOnline    14.575646
AskSlashdot          7.822878
IT                   7.195572
Politics             7.047970
dtype: float64


In [231]:
# Remove the class with the largest label count: its samples are dropped and
# its column is removed from y. Then print the resulting summary.
# NOTE(review): X and y are rebound, so every re-run removes one more class.
X, y  = remove_all_class_samples(X, y, least_represented=False)
class_percentages_sorted = show_summary(X, y)

Number of samples: 3809
Number of classes: 3
Cardinality: 0.59
Class67    48.779207
Class32    10.238908
Class34     0.000000
dtype: float64


In [481]:
# Recompute the class ranking for the current (already trimmed) labels.
class_percentages_sorted = get_sorted_class_percentages(y)

# The trailing half of the ranking is protected: samples carrying any of these
# labels are never picked for removal by trim_class.
least_represented_classes = get_least_represented_class_names(class_percentages_sorted, 0.5)
# The first entry of the ranking is the class to thin out.
# NOTE(review): this is the most populated class only if the helper sorts in
# descending order, as the recorded outputs suggest — confirm.
class_chosen_for_trimming = class_percentages_sorted.index[0]
print("Most populated class:", class_chosen_for_trimming)

# Remove 10 unprotected samples of that class per run. X and y are rebound, so
# the cell is meant to be re-run until the distribution is balanced enough;
# each run changes the data further.
X, y = trim_class(X, y, class_chosen_for_trimming, 10, least_represented_classes)
class_percentages_sorted = show_summary(X, y)

Most populated class: Games
Number of samples: 1575
Number of classes: 8
Cardinality: 1.116
YourRightsOnline    14.920635
Technology          14.857143
Science             14.666667
Idle                14.603175
Games               14.603175
AskSlashdot         13.460317
IT                  12.380952
Politics            12.126984
dtype: float64


In [342]:
# Remove one randomly chosen sample that carries exactly one label, then print
# the resulting summary.
# NOTE(review): X and y are rebound, so every re-run removes one more sample.
X, y = remove_single_class_samples(X, y, step=1)
class_percentages_sorted = show_summary(X, y)

Number of samples: 1680
Number of classes: 8
Cardinality: 1.112
Technology          15.476190
Games               15.476190
YourRightsOnline    15.357143
Science             15.297619
Idle                14.940476
AskSlashdot         12.321429
Politics            11.190476
IT                  11.130952
dtype: float64


# Trimmed dataset - info

In [482]:
# Final report for the trimmed dataset; the returned class distribution is kept
# because the Save section writes it to disk.
class_percentages_sorted = show_summary(X, y)

Number of samples: 1575
Number of classes: 8
Cardinality: 1.116
YourRightsOnline    14.920635
Technology          14.857143
Science             14.666667
Idle                14.603175
Games               14.603175
AskSlashdot         13.460317
IT                  12.380952
Politics            12.126984
dtype: float64


# Save

In [483]:
# Create the target folder first: to_csv does not create missing parent
# directories, and nothing earlier in the notebook creates this one.
Path(DATASET_DIR_TRIMMED).mkdir(parents=True, exist_ok=True)

# index=False keeps the positional row index out of the files, so they can be
# read back with a plain pd.read_csv like the original X.csv / y.csv.
X.to_csv(DATASET_DIR_TRIMMED / "X.csv", index=False)
y.to_csv(DATASET_DIR_TRIMMED / "y.csv", index=False)

In [484]:
# Persist the class distribution of the trimmed dataset next to X.csv / y.csv.
# header=False writes no header row; the index (the class names) is still
# written because to_csv includes it by default.
class_percentages_sorted.to_csv(DATASET_DIR_TRIMMED / "class_percentages.csv", header=False)