# Setup

In [504]:
import os
import sys
from pathlib import Path
import arff
import pandas as pd
import numpy as np

In [505]:
# Make the directory four levels above the current working directory importable,
# so that the `src.experiment...` imports in the next cell can be resolved.
# The path is relative to the working directory the kernel was started in.
sys.path.append(os.path.abspath("../../../.."))

In [506]:
from src.experiment.helpers.variables import dataset_root_dir
from src.experiment.helpers.utils import get_sorted_class_percentages, get_cardinality

In [507]:
# Dataset folder name; the source and the "_trimmed" output paths below are built from it.
NAME = "corel16k009"

In [508]:
# Source dataset folder: <dataset_root_dir>/multilabel/<NAME>.
DATASET_DIR = dataset_root_dir / 'multilabel' / NAME
print(DATASET_DIR)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\corel16k009


In [509]:
# Output folder for the trimmed dataset: same parent as DATASET_DIR, with "_trimmed" appended to NAME.
trimmed_name = NAME + "_trimmed"
DATASET_DIR_TRIMMED = dataset_root_dir / 'multilabel' / trimmed_name
print(DATASET_DIR_TRIMMED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multilabel\corel16k009_trimmed


# Dataset loading

In [926]:
# Load the two CSV files of the source dataset with pandas' default read_csv settings.
# X and y are reassigned by the trimming cells further down, so re-run this cell
# to get back to the untrimmed data.
X = pd.read_csv(DATASET_DIR / 'X.csv')
y = pd.read_csv(DATASET_DIR / 'y.csv')

In [927]:
# Display the frame loaded from X.csv.
X

Unnamed: 0,Cluster1,Cluster2,Cluster3,Cluster4,Cluster5,Cluster6,Cluster7,Cluster8,Cluster9,Cluster10,...,Cluster491,Cluster492,Cluster493,Cluster494,Cluster495,Cluster496,Cluster497,Cluster498,Cluster499,Cluster500
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13879,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13880,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13881,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13882,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [928]:
# Display the frame loaded from y.csv.
y

Unnamed: 0,water,sky,tree,people,grass,building,flowers,mountains,rocks,snow,...,rabbit,man,plain,turn,formula,f-16,kauai,saguaro,bengal,f-18
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13879,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13880,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13881,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13882,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Dataset info

In [750]:
# Number of samples: the row count of X.
len(X)

13884

In [751]:
# Number of classes: the column count of y (every column of y is counted as a class).
len(y.columns)

173

In [752]:
# get_sorted_class_percentages is a project helper; judging by the printed result it
# returns a float Series indexed by class name, largest percentage first.
class_percentages_sorted = get_sorted_class_percentages(y)
print(class_percentages_sorted)

water      22.832037
sky        18.452895
tree       17.991933
people     16.609046
grass       8.801498
             ...    
f-16        0.216076
kauai       0.216076
saguaro     0.208874
bengal      0.201671
f-18        0.194468
Length: 173, dtype: float64


In [753]:
# get_cardinality is a project helper; presumably the average number of labels per
# sample -- confirm in src.experiment.helpers.utils.
cardinality = get_cardinality(y)
print("Cardinality:", cardinality)

Cardinality: 2.9301354076634976


# Functions

In [516]:
def get_least_represented_class_names(class_percentages_sorted, percent):
    """Return the index labels of the trailing `percent` share of a Series.

    The number of labels returned is ``ceil(len(series) * percent)``. The
    function does not sort: the labels are "least represented" only if the
    caller passes a Series ordered from largest to smallest value.

    Parameters
    ----------
    class_percentages_sorted : pandas.Series
        Values indexed by class name.
    percent : float
        Fraction of the entries to take from the end (e.g. 0.3 for 30%).

    Returns
    -------
    list
        Index labels of the selected trailing entries, in their original order.
    """
    how_many = int(np.ceil(len(class_percentages_sorted) * percent))
    trailing_entries = class_percentages_sorted.tail(how_many)
    return trailing_entries.index.tolist()

In [517]:
def trim_class(X, y, class_name, step, protected_classes, random_state=47):
    """Randomly drop `step` samples of `class_name` that carry no protected label.

    A row is eligible for removal when its `class_name` entry equals 1 and its
    row-wise sum over `protected_classes` equals 0.

    Parameters
    ----------
    X, y : pandas.DataFrame
        Rows are dropped from both frames by the same index labels taken from
        `y` (assumes X and y share an index -- TODO confirm against callers).
    class_name : hashable
        Column of `y` whose samples are thinned out.
    step : int
        Number of eligible rows to remove.
    protected_classes : list
        Columns of `y`; rows with any of these labels set are never removed.
    random_state : int, optional
        Seed for the row selection. The default (47) is the seed that used to
        be hard-coded, so existing calls pick exactly the same rows.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_trimmed, y_trimmed)`` with the index reset to 0..n-1.

    Raises
    ------
    ValueError
        If `step` is larger than the number of eligible rows.
    """
    y_numeric = y.apply(pd.to_numeric)

    is_eligible = (
        (y_numeric[class_name] == 1) &
        (y_numeric[protected_classes].sum(axis=1) == 0)
    )
    relaxing_indices = y_numeric.index[is_eligible].tolist()

    # Fail with an explicit message (as remove_single_class_samples does) instead
    # of surfacing NumPy's generic "larger sample than population" error.
    if step > len(relaxing_indices):
        raise ValueError(
            f"Requested to remove {step} samples, but only {len(relaxing_indices)} "
            f"removable samples of class '{class_name}' are available."
        )

    # A private RandomState yields the same draw as np.random.seed(...) followed by
    # np.random.choice(...), but leaves NumPy's global RNG state untouched.
    rng = np.random.RandomState(random_state)
    remove_indices = rng.choice(relaxing_indices, step, replace=False)

    X_trimmed = X.drop(remove_indices).reset_index(drop=True)
    y_trimmed = y.drop(remove_indices).reset_index(drop=True)

    return X_trimmed, y_trimmed

In [518]:
def remove_all_class_samples(X, y, least_represented=True):
    """Drop one class together with every sample labelled with it.

    The class is the column of `y` with the smallest column sum when
    `least_represented` is true, otherwise the one with the largest sum
    (ties resolve to whichever column idxmin/idxmax returns).

    Parameters
    ----------
    X, y : pandas.DataFrame
        Rows are dropped from both frames by the index labels of the matching
        rows of `y`.
    least_represented : bool, optional
        Select the smallest (True, default) or the largest (False) class.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_new, y_new)`` with the index reset; `y_new` no longer contains
        the selected class column.
    """
    totals_per_class = y.sum()
    if least_represented:
        chosen = totals_per_class.idxmin()
    else:
        chosen = totals_per_class.idxmax()

    # Index labels of every row that has the chosen class set to 1.
    doomed_rows = y.index[y[chosen] == 1]

    X_new = X.drop(doomed_rows).reset_index(drop=True)
    y_kept = y.drop(doomed_rows).reset_index(drop=True)
    y_new = y_kept.drop(columns=[chosen])
    return X_new, y_new

In [519]:
def remove_bottom_percent_classes(X, y, percent):
    """Drop the trailing `percent` of classes and every sample labelled with one.

    The classes are chosen by get_least_represented_class_names from the
    result of get_sorted_class_percentages(y) (a project helper; assumed to
    rank classes from most to least frequent -- confirm in its module).

    Parameters
    ----------
    X, y : pandas.DataFrame
        Rows are dropped from both frames by the index labels of the matching
        rows of `y`.
    percent : float
        Fraction of classes to remove (e.g. 0.3 for 30%).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_new, y_new)`` with the index reset; `y_new` no longer contains
        the removed class columns.
    """
    ranked_percentages = get_sorted_class_percentages(y)
    rare_classes = get_least_represented_class_names(ranked_percentages, percent)

    # A row goes away as soon as it carries at least one of the rare labels.
    carries_rare_label = y[rare_classes].sum(axis=1) > 0
    rows_to_drop = y.index[carries_rare_label]

    X_new = X.drop(rows_to_drop).reset_index(drop=True)
    y_kept = y.drop(rows_to_drop).reset_index(drop=True)
    y_new = y_kept.drop(columns=rare_classes)
    return X_new, y_new

In [520]:
def show_summary(X, y):
    """Print a short overview of the dataset and return the class percentages.

    Printed, in order: the row count of `X`, the column count of `y`, the
    value of get_cardinality(y) rounded to three decimals, and the result of
    get_sorted_class_percentages(y). Both helpers come from the project's
    utils module.

    Returns
    -------
    The object returned by get_sorted_class_percentages(y).
    """
    sample_total = len(X)
    print("Number of samples:", sample_total)
    label_total = y.shape[1]
    print("Number of classes:", label_total)

    print("Cardinality:", round(get_cardinality(y), 3))

    ranked_percentages = get_sorted_class_percentages(y)
    print(ranked_percentages)
    return ranked_percentages

In [521]:
def remove_single_class_samples(X, y, step, random_state=42):
    """Randomly drop `step` samples whose label row sums to exactly 1.

    Parameters
    ----------
    X, y : pandas.DataFrame
        Rows are dropped from both frames by the same index labels taken from
        `y` (assumes X and y share an index -- TODO confirm against callers).
    step : int
        Number of single-label rows to remove.
    random_state : int, optional
        Seed for the row selection (default 42).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_new, y_new)`` with the index reset to 0..n-1.

    Raises
    ------
    ValueError
        If `step` is larger than the number of single-label rows.
    """
    single_class_indices = y.index[y.sum(axis=1) == 1].tolist()
    if step > len(single_class_indices):
        raise ValueError(f"Requested to remove {step} samples, but only {len(single_class_indices)} single-class samples are available.")

    # A private RandomState yields the same draw as np.random.seed(...) followed by
    # np.random.choice(...), but leaves NumPy's global RNG state untouched.
    rng = np.random.RandomState(random_state)
    remove_indices = rng.choice(single_class_indices, step, replace=False)
    X_new = X.drop(remove_indices).reset_index(drop=True)
    y_new = y.drop(remove_indices).reset_index(drop=True)
    return X_new, y_new

# Dataset trimming

In [932]:
# Drop the trailing 30% of classes together with every sample labelled with one of them.
# NOTE(review): this cell overwrites X and y, so each re-run trims the already-trimmed
# data again. The recorded output below (14 classes) cannot come from a single pass over
# the 173 loaded classes -- re-check with Restart & Run All.
X, y  = remove_bottom_percent_classes(X, y, percent=0.3)
class_percentages_sorted = show_summary(X, y)

Number of samples: 1932
Number of classes: 14
Cardinality: 2.448
sky          30.072464
tree         29.451346
water        23.964803
mountains    22.981366
people       21.428571
flowers      18.581781
snow         15.734990
leaves       14.751553
plants       13.664596
clouds       13.561077
rocks        12.577640
grass        10.559006
pattern       9.057971
texture       8.385093
dtype: float64


In [937]:
# Drop the class with the smallest column sum and all samples labelled with it.
# NOTE(review): X and y are overwritten, so every re-run removes one more class.
X, y  = remove_all_class_samples(X, y, least_represented=True)
class_percentages_sorted = show_summary(X, y)

Number of samples: 1162
Number of classes: 9
Cardinality: 2.218
people       31.153184
tree         30.464716
flowers      29.001721
sky          23.149742
water        23.063683
leaves       23.063683
plants       21.686747
mountains    20.654045
snow         19.535284
dtype: float64


In [231]:
# Drop the class with the largest column sum and all samples labelled with it.
# NOTE(review): the recorded output below lists Class67/Class32/Class34, which are not
# among the label names shown for this dataset -- it looks like stale output from a
# different run; confirm with Restart & Run All or clear it.
X, y  = remove_all_class_samples(X, y, least_represented=False)
class_percentages_sorted = show_summary(X, y)

Number of samples: 3809
Number of classes: 3
Cardinality: 0.59
Class67    48.779207
Class32    10.238908
Class34     0.000000
dtype: float64


In [961]:
# Recompute the ranking on the current y so the choice below reflects earlier trimming.
class_percentages_sorted = get_sorted_class_percentages(y)

# The trailing 30% of classes are protected: samples carrying any of them are never removed.
least_represented_classes = get_least_represented_class_names(class_percentages_sorted, 0.3)
# First entry of the ranking; treated as the most populated class (relies on the helper
# returning the Series largest-first).
class_chosen_for_trimming = class_percentages_sorted.index[0]
print("Most populated class:", class_chosen_for_trimming)

# Remove 10 unprotected samples of that class per run.
# NOTE(review): X and y are overwritten, so each re-run removes another 10 samples.
X, y = trim_class(X, y, class_chosen_for_trimming, 10, least_represented_classes)
class_percentages_sorted = show_summary(X, y)

Most populated class: people
Number of samples: 912
Number of classes: 9
Cardinality: 2.371
tree         28.070175
flowers      27.850877
people       27.631579
sky          26.535088
mountains    25.877193
plants       25.767544
water        25.548246
snow         24.890351
leaves       24.890351
dtype: float64


In [1014]:
# Remove 2 samples whose label row sums to exactly 1 (default random_state of the function).
# NOTE(review): X and y are overwritten, so each re-run removes another 2 samples.
X, y = remove_single_class_samples(X, y, step=2)
class_percentages_sorted = show_summary(X, y)

Number of samples: 812
Number of classes: 9
Cardinality: 2.539
flowers      31.280788
tree         31.157635
mountains    28.940887
sky          28.571429
plants       28.201970
leaves       27.832512
snow         26.970443
water        25.615764
people       25.369458
dtype: float64


# Trimmed dataset - info

In [972]:
# Summary of the X, y currently in memory; the returned percentages are written to disk in the Save section.
class_percentages_sorted = show_summary(X, y)

Number of samples: 892
Number of classes: 9
Cardinality: 2.401
tree         28.699552
flowers      28.475336
people       27.242152
sky          26.793722
mountains    26.457399
plants       26.233184
water        25.672646
leaves       25.336323
snow         25.224215
dtype: float64


# Save

In [973]:
# to_csv does not create missing parent folders, so make sure the output folder exists
# before writing (no-op when it is already there).
os.makedirs(DATASET_DIR_TRIMMED, exist_ok=True)

# index=False keeps the row index out of the written files.
X.to_csv(DATASET_DIR_TRIMMED / "X.csv", index=False)
y.to_csv(DATASET_DIR_TRIMMED / "y.csv", index=False)

In [974]:
# Write the class percentages from the last show_summary call next to the trimmed data;
# header=False omits the header row, the index (class names) is still written.
class_percentages_sorted.to_csv(DATASET_DIR_TRIMMED / "class_percentages.csv", header=False)