# Setup

In [47]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np

In [48]:
# Put the directory three levels above the current working directory on the
# import path; the `src.*` imports in the next cell presumably rely on it.
sys.path.append(os.path.abspath("../../.."))

In [49]:
from src.experiment.helpers.variables import dataset_root_dir
from src.experiment.helpers.utils import get_sorted_class_percentages_label_encoded

In [50]:
# Dataset identifier and task type; together they select the dataset folder
# below `dataset_root_dir`.
# NOTE(review): NAME already ends in "_imbalanced", so the "_balanced" and
# "_imbalanced" suffixes appended in the following cells yield
# "..._imbalanced_balanced" and "..._imbalanced_imbalanced" — confirm intended.
NAME = "gtsrb-huelist_imbalanced"
CLASSIFICATION_TYPE = "multiclass"

In [51]:
# Folder of the dataset being edited: <root> / <classification type> / <name>
DATASET_DIR = dataset_root_dir / CLASSIFICATION_TYPE / NAME
print(DATASET_DIR)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\gtsrb-huelist_imbalanced


In [52]:
# Folder of the balanced variant: same root and type, name suffixed "_balanced"
balanced_name = NAME + "_balanced"
DATASET_DIR_BALANCED = dataset_root_dir / CLASSIFICATION_TYPE / balanced_name
print(DATASET_DIR_BALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\gtsrb-huelist_imbalanced_balanced


In [53]:
# Folder of the imbalanced variant: same root and type, name suffixed "_imbalanced"
imbalanced_name = NAME + "_imbalanced"
DATASET_DIR_IMBALANCED = dataset_root_dir / CLASSIFICATION_TYPE / imbalanced_name
print(DATASET_DIR_IMBALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\gtsrb-huelist_imbalanced_imbalanced


# Dataset loading

In [54]:
# Load features (X) and labels (y). The two files are assumed to be
# row-aligned: later cells apply a mask computed on y to both frames.
X = pd.read_csv(DATASET_DIR / 'X.csv')
y = pd.read_csv(DATASET_DIR / 'y.csv')

In [55]:
# Display the feature matrix
X

Unnamed: 0,pixel-00000,pixel-00001,pixel-00002,pixel-00003,pixel-00004,pixel-00005,pixel-00006,pixel-00007,pixel-00008,pixel-00009,...,pixel-00246,pixel-00247,pixel-00248,pixel-00249,pixel-00250,pixel-00251,pixel-00252,pixel-00253,pixel-00254,pixel-00255
0,0.0,0.019426,0.026364,0.050879,0.077706,0.066605,0.065680,0.063830,0.044866,0.060592,...,0.004163,0.010176,0.006013,0.006013,0.007401,0.012488,0.003238,0.004163,0.001388,0.104995
1,0.0,0.022695,0.024586,0.054846,0.055319,0.048700,0.044917,0.037352,0.030260,0.046336,...,0.001418,0.008511,0.006619,0.002837,0.005201,0.005201,0.005201,0.006619,0.002364,0.123877
2,0.0,0.017486,0.023630,0.047259,0.059546,0.050567,0.047259,0.039697,0.034972,0.052457,...,0.000945,0.008507,0.007089,0.005198,0.008034,0.005671,0.005671,0.006144,0.001418,0.118620
3,0.0,0.024740,0.029948,0.053385,0.043837,0.034722,0.026476,0.032986,0.015625,0.030382,...,0.004774,0.011285,0.009115,0.007812,0.008681,0.012153,0.008247,0.013889,0.004774,0.118924
4,0.0,0.019992,0.032487,0.039150,0.041233,0.039150,0.023324,0.020825,0.011245,0.032903,...,0.004165,0.009579,0.012911,0.010829,0.012911,0.012495,0.009579,0.019992,0.004581,0.129113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28445,0.0,0.000000,0.000000,0.000000,0.000000,0.001111,0.002222,0.001111,0.000000,0.003333,...,0.000000,0.001111,0.001111,0.001111,0.000000,0.001111,0.000000,0.000000,0.000000,0.015556
28446,0.0,0.053333,0.022222,0.011111,0.005556,0.007778,0.016667,0.007778,0.007778,0.010000,...,0.000000,0.003333,0.005556,0.006667,0.011111,0.005556,0.005556,0.008889,0.028889,0.046667
28447,0.0,0.001111,0.001111,0.001111,0.004444,0.002222,0.003333,0.003333,0.003333,0.004444,...,0.001111,0.003333,0.004444,0.002222,0.004444,0.001111,0.000000,0.003333,0.001111,0.127778
28448,0.0,0.000000,0.000000,0.000000,0.000000,0.002222,0.002222,0.002222,0.000000,0.002222,...,0.020000,0.020000,0.012222,0.008889,0.020000,0.003333,0.001111,0.000000,0.000000,0.033333


In [56]:
# Display the label frame
y

Unnamed: 0,class
0,1
1,1
2,1
3,1
4,1
...,...
28445,35
28446,25
28447,12
28448,7


# Cut classes to match balanced dataset

In [344]:
# Labels of the balanced variant; the next cell uses them to decide which
# classes to keep in the current dataset.
y_balanced = pd.read_csv(DATASET_DIR_BALANCED / 'y.csv')
y_balanced

Unnamed: 0,class
0,1
1,1
2,1
3,1
4,1
...,...
30265,35
30266,25
30267,12
30268,7


In [345]:
# Labels that occur in the balanced variant of the dataset
classes_in_balanced = set(y_balanced['class'].unique())
print(f"Classes in balanced dataset: {classes_in_balanced}")

# Row-wise flag: True where the sample's label also occurs in y_balanced
mask = y['class'].isin(classes_in_balanced)

# Keep only the flagged rows in both frames and renumber them from 0
X, y = (frame.loc[mask].reset_index(drop=True) for frame in (X, y))

Classes in balanced dataset: {np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(35), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(38), np.int64(18), np.int64(25)}


# Reduce class indexes by 1

In [46]:
# Report whether the label 0 occurs anywhere in y
print("Label '0' is present in y" if (y == 0).any().any() else "Label '0' is not present in y")

Label '0' is not present in y


# Remove gaps in labels

In [57]:
# Remove gaps in labels by re-encoding them as 0..n_classes-1, e.g. the labels
# {0, 2} become {0, 1}. `sort=True` makes the new codes follow the sorted order
# of the original labels; without it `pd.factorize` numbers labels by order of
# first appearance, so the mapping would depend on the row order of y.
# NOTE(review): datasets encoded earlier without `sort=True` may use a
# different label numbering — re-encode them before comparing labels.
y = y.apply(lambda x: pd.factorize(x, sort=True)[0])

# Dataset info

In [58]:
# Sample count = number of rows in the feature matrix
X.shape[0]

28450

In [59]:
# Class count = number of distinct values found in y
np.unique(y.values).size

16

In [60]:
# Per-class share of the samples, computed by the project helper (values look
# like percentages ordered from largest to smallest, judging by the output).
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
print(class_percentages_sorted)

1     10.544815
0     10.333919
10    10.123023
9      9.806678
12     9.701230
8      9.384886
3      8.224956
4      8.224956
11     6.959578
7      6.854130
5      4.463972
2      1.933216
6      1.898067
13     0.878735
14     0.597540
15     0.070299
Name: proportion, dtype: float64


# Functions

In [62]:
def get_least_represented_class_names(class_percentages_sorted, percent):
    """Return the labels of the least represented classes.

    Parameters
    ----------
    class_percentages_sorted : pandas.Series
        Class shares indexed by class label. The function takes the *last*
        entries, so the series is expected to be ordered from the most to the
        least represented class.
    percent : float
        Fraction of the classes to return (e.g. ``0.2`` for the bottom 20%).
        The resulting count is rounded up and clamped to
        ``[0, number of classes]``.

    Returns
    -------
    list
        Index labels of the selected classes, in the order of the input.
    """
    num_classes = len(class_percentages_sorted)
    # Round before taking the ceiling: products such as 100 * 0.07 evaluate to
    # 7.000000000000001 in floating point and would otherwise select 8 classes.
    bottom_percent_count = int(np.ceil(round(num_classes * percent, 9)))
    # Clamp the count: `tail` with a negative argument would return almost
    # every class instead of none.
    bottom_percent_count = min(max(bottom_percent_count, 0), num_classes)
    return class_percentages_sorted.tail(bottom_percent_count).index.tolist()

In [63]:
def get_most_represented_class_names(class_percentages_sorted, percent):
    """Return the labels of the most represented classes.

    Parameters
    ----------
    class_percentages_sorted : pandas.Series
        Class shares indexed by class label. The function takes the *first*
        entries, so the series is expected to be ordered from the most to the
        least represented class.
    percent : float
        Fraction of the classes to return (e.g. ``0.2`` for the top 20%).
        The resulting count is rounded up and clamped to
        ``[0, number of classes]``.

    Returns
    -------
    list
        Index labels of the selected classes, in the order of the input.
    """
    num_classes = len(class_percentages_sorted)
    # Round before taking the ceiling: products such as 100 * 0.07 evaluate to
    # 7.000000000000001 in floating point and would otherwise select 8 classes.
    top_percent_count = int(np.ceil(round(num_classes * percent, 9)))
    # Clamp the count: `head` with a negative argument would return almost
    # every class instead of none.
    top_percent_count = min(max(top_percent_count, 0), num_classes)
    return class_percentages_sorted.head(top_percent_count).index.tolist()

In [64]:
def show_summary(X, y):
    """Print size and class-balance statistics of a dataset.

    Prints the number of samples, the number of distinct labels, the ratio
    between the first and last class share ("Times higher"), the coefficient
    of variation of the class shares, and the shares themselves.

    Parameters
    ----------
    X : sized collection
        Feature matrix; only its length is used.
    y : pandas object exposing ``.values``
        Labels, passed on to ``get_sorted_class_percentages_label_encoded``.

    Returns
    -------
    The object returned by ``get_sorted_class_percentages_label_encoded(y)``
    (presumably a Series sorted from largest to smallest share, since its
    first and last entries are treated as highest and lowest).
    """
    print("Number of samples:", len(X))
    print("Number of classes:", len(np.unique(y.values)))

    shares = get_sorted_class_percentages_label_encoded(y)
    top_share, bottom_share = shares.iloc[0], shares.iloc[-1]

    # The imbalance ratio is undefined when the smallest share is zero.
    if bottom_share == 0:
        print("The lowest class percentage is 0, cannot calculate ratio.")
    else:
        print(f"Times higher: {round(top_share / bottom_share, 2)}")

    # Coefficient of variation: spread of the shares relative to their mean.
    cov = shares.std() / shares.mean()
    print(f"CoV: {round(cov, 3)}")

    print(shares)

    return shares

In [65]:
def remove_class(X, y, chosen_class_index):
    """Drop every sample whose label equals ``chosen_class_index``.

    Parameters
    ----------
    X : pandas.DataFrame
        Features, indexed like ``y``.
    y : pandas.DataFrame or pandas.Series
        Labels. For a DataFrame the first column is used as the label column.
    chosen_class_index
        Label value of the class to remove.

    Returns
    -------
    tuple
        ``(X, y)`` restricted to the remaining samples; the original index
        labels are kept (no re-indexing).
    """
    # Reduce y to a 1-D label series so the comparison yields a 1-D mask.
    labels = y[y.columns[0]] if isinstance(y, pd.DataFrame) else y

    print(f"Removing class: {chosen_class_index}")

    # True for every sample that does not belong to the removed class.
    keep = labels != chosen_class_index
    return X.loc[keep], y.loc[keep]


In [66]:
# Removes SAMPLES of a class (not the whole class)
def remove_few_samples_of_class(X, y, class_index, step):
    """Drop the first ``step`` samples labelled ``class_index``.

    Parameters
    ----------
    X : pandas.DataFrame
        Features, indexed like ``y``.
    y : pandas.DataFrame or pandas.Series
        Labels; its values are flattened before being compared.
    class_index
        Label value of the class to thin out.
    step : int
        Upper bound on the number of samples removed (taken in index order).

    Returns
    -------
    tuple
        ``(X, y)`` without the removed rows; the original index labels are kept.
    """
    is_target_class = y.values.flatten() == class_index
    # Index labels of the first `step` matching rows.
    doomed = y[is_target_class].index[:step]
    return X.drop(doomed), y.drop(doomed)

# Dataset trimming

In [23]:
# Labels of the top 80% of the classes in the ranking (count rounded up)
some_classes = get_most_represented_class_names(class_percentages_sorted, 0.8)
some_classes

[1, 0]

In [None]:
# Remove the top class: recompute the class ranking, take its first entry
# (the largest class, given the descending order seen in the outputs), drop
# all of its samples from X and y, then print the updated summary.
# Every run of this cell removes one more class.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[0]

X,y = remove_class(X, y, chosen_class)
class_percentages_sorted = show_summary(X, y)

Removing class: 1
Number of samples: 600
Number of classes: 1
0    100.0
Name: proportion, dtype: float64


In [739]:
# Remove the bottom class: recompute the class ranking, take its last entry
# (the smallest class, given the descending order seen in the outputs), drop
# all of its samples from X and y, then print the updated summary.
# Every run of this cell removes one more class.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[-1]

X,y = remove_class(X, y, chosen_class)
class_percentages_sorted = show_summary(X, y)

Removing class: 14
Number of samples: 31010
Number of classes: 17
Times higher: 1.31
CoV: 0.076
25    6.223799
10    6.191551
38    6.159303
9     6.127056
2     6.127056
13    6.127056
5     6.094808
7     6.094808
4     6.094808
12    6.094808
1     6.062560
8     5.998065
3     5.998065
11    5.611093
18    5.127378
35    5.127378
17    4.740406
Name: proportion, dtype: float64


In [286]:
# Remove samples from the top class: recompute the class ranking, take its
# first entry and drop up to 30 of that class's samples, then print the
# updated summary. Presumably re-run by hand until the balance looks right.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[0]

X,y = remove_few_samples_of_class(X, y, chosen_class, step=30)
class_percentages_sorted = show_summary(X, y)

Number of samples: 9880
Number of classes: 10
Times higher: 1.09
CoV: 0.023
1    10.273279
5    10.263158
8    10.030364
6    10.020243
4    10.010121
2    10.010121
0    10.000000
9    10.000000
7     9.979757
3     9.412955
Name: proportion, dtype: float64


In [720]:
# Remove samples from a low-ranked class: recompute the class ranking, pick
# one entry and drop up to 10 of that class's samples, then print the summary.
# NOTE(review): `index[-8]` selects the 8th entry counted from the end of the
# ranking, not the bottom class (`index[-1]`) — confirm which one is intended.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[-8]

X,y = remove_few_samples_of_class(X, y, chosen_class, step=10)
class_percentages_sorted = show_summary(X, y)

Number of samples: 9980
Number of classes: 10
Times higher: 65.61
CoV: 0.747
5    20.380762
3    19.338677
4    14.418838
7    14.388778
2    14.318637
9     6.392786
6     6.312625
1     3.356713
0     0.781563
8     0.310621
Name: proportion, dtype: float64


# Trimmed dataset - info

In [67]:
# Print the summary of the current X / y and keep the returned class shares;
# the save cells below write this object to class_percentages.csv.
class_percentages_sorted = show_summary(X, y)

Number of samples: 28450
Number of classes: 16
Times higher: 150.0
CoV: 0.631
1     10.544815
0     10.333919
10    10.123023
9      9.806678
12     9.701230
8      9.384886
3      8.224956
4      8.224956
11     6.959578
7      6.854130
5      4.463972
2      1.933216
6      1.898067
13     0.878735
14     0.597540
15     0.070299
Name: proportion, dtype: float64


# Save

### Basic

In [68]:
# Write the current X / y to DATASET_DIR without the index column.
# NOTE(review): these are the same paths the "Dataset loading" cell reads
# from, so the files that were loaded get overwritten.
X.to_csv(DATASET_DIR / "X.csv", index=False)
y.to_csv(DATASET_DIR / "y.csv", index=False)

In [69]:
# Store the class shares next to the data; the index (class label) is kept,
# the header row is omitted.
class_percentages_sorted.to_csv(DATASET_DIR / "class_percentages.csv", header=False)

### Balanced

In [288]:
# Create the balanced-variant folder if it does not exist yet, then write the
# current X / y there without the index column.
DATASET_DIR_BALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_BALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_BALANCED / "y.csv", index=False)

In [289]:
# Store the class shares in the balanced-variant folder; the index (class
# label) is kept, the header row is omitted.
class_percentages_sorted.to_csv(DATASET_DIR_BALANCED / "class_percentages.csv", header=False)

### Imbalanced

In [722]:
# Create the imbalanced-variant folder if it does not exist yet, then write
# the current X / y there without the index column.
DATASET_DIR_IMBALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_IMBALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_IMBALANCED / "y.csv", index=False)

In [723]:
# Store the class shares in the imbalanced-variant folder; the index (class
# label) is kept, the header row is omitted.
class_percentages_sorted.to_csv(DATASET_DIR_IMBALANCED / "class_percentages.csv", header=False)