# Setup

In [81]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np

In [82]:
# Put the directory three levels above the working directory on the import path
# (presumably the repository root, since the `src.*` imports below rely on it).
sys.path.append(os.path.abspath(os.path.join("..", "..", "..")))

In [83]:
from src.experiment.helpers.variables import dataset_root_dir
from src.experiment.helpers.utils import get_sorted_class_percentages_label_encoded

In [84]:
# Task type and dataset identifier; both are used as path components when the
# dataset folders are built.
CLASSIFICATION_TYPE = "multiclass"
NAME = "mfeat-karhunen"

In [85]:
# Folder of the base dataset: <dataset root>/<classification type>/<dataset name>.
DATASET_DIR = dataset_root_dir / CLASSIFICATION_TYPE / NAME
print(DATASET_DIR)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\mfeat-karhunen


In [86]:
# Sibling folder for the balanced variant: same layout, "_balanced" suffix on the name.
balanced_name = f"{NAME}_balanced"
DATASET_DIR_BALANCED = dataset_root_dir / CLASSIFICATION_TYPE / balanced_name
print(DATASET_DIR_BALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\mfeat-karhunen_balanced


In [87]:
# Sibling folder for the imbalanced variant: same layout, "_imbalanced" suffix on the name.
imbalanced_name = f"{NAME}_imbalanced"
DATASET_DIR_IMBALANCED = dataset_root_dir / CLASSIFICATION_TYPE / imbalanced_name
print(DATASET_DIR_IMBALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\mfeat-karhunen_imbalanced


# Dataset loading

In [88]:
# Read the feature table and the label table of the base dataset.
features_csv = DATASET_DIR / 'X.csv'
labels_csv = DATASET_DIR / 'y.csv'
X = pd.read_csv(features_csv)
y = pd.read_csv(labels_csv)

In [89]:
# Preview the loaded feature table.
X

Unnamed: 0,att1,att2,att3,att4,att5,att6,att7,att8,att9,att10,...,att55,att56,att57,att58,att59,att60,att61,att62,att63,att64
0,-10.297008,-11.666789,11.560669,-2.081316,4.044656,4.086815,-2.558072,-8.476935,2.138135,3.503082,...,1.078083,0.921927,0.496387,-0.643667,0.284104,0.286555,0.348625,1.814691,-1.351353,-0.473910
1,-5.036009,-12.885333,0.161155,0.592460,3.123534,4.220469,-6.411771,-6.335328,-0.244622,1.346073,...,0.942353,2.938791,1.429883,-2.336344,1.281628,-0.098321,0.582357,0.485792,0.642451,0.613107
2,-9.639157,-6.655898,0.388687,-1.717650,0.300346,3.400769,-7.240785,-1.659405,-0.874005,4.153403,...,-0.413174,-0.023028,-0.025265,1.259838,-0.441360,-0.960094,1.995843,1.097748,0.827182,-1.767840
3,-6.650375,-7.043851,4.104350,-2.342780,3.494658,3.924822,-9.874812,-6.556576,-1.364269,1.153308,...,-0.961236,-1.043815,-0.204508,-1.981150,0.982438,-0.144233,-1.449328,-0.913552,-0.771735,0.304992
4,-10.664524,-10.974133,0.194391,0.453885,2.193088,-3.304663,-8.376592,-4.241146,2.964818,-0.949622,...,0.152957,1.448160,-1.254907,-3.481295,-0.563889,1.529335,0.510399,0.298318,-0.943213,1.149847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-2.415248,-6.619806,5.053538,6.662300,12.136673,-1.447842,-2.321873,4.042169,-2.981806,-0.106785,...,-1.438355,-0.714285,0.017051,0.460572,-0.951763,0.241901,-0.399051,-0.304857,-0.068411,-1.049052
1996,5.892684,-8.185875,1.819305,6.871263,1.021332,-0.869375,-6.759738,-3.891993,-4.781352,3.355656,...,-0.672254,1.273016,0.227573,0.444086,1.439473,-0.405706,0.378187,-0.128056,0.925637,1.798053
1997,1.881613,-9.650881,0.317780,0.655888,7.882648,1.740497,0.026943,-4.412813,-3.403179,-0.614610,...,-0.121590,-1.622687,0.309964,0.473773,0.916683,0.971719,0.689472,-0.439637,0.287013,-0.420793
1998,-1.530886,-10.183775,-1.055864,4.956079,11.729954,1.480784,-2.806543,0.602515,-5.411981,-2.165543,...,-0.220936,-0.466334,0.128358,-0.888494,-0.014442,-0.780897,1.000286,1.405214,0.435514,-0.225426


In [90]:
# Preview the loaded label table.
y

Unnamed: 0,class
0,0
1,0
2,0
3,0
4,0
...,...
1995,9
1996,9
1997,9
1998,9


# Cut classes to match balanced dataset

In [100]:
# Load the labels stored in the balanced dataset folder; the file must already
# exist. Its set of classes is used below to filter the base dataset.
y_balanced = pd.read_csv(DATASET_DIR_BALANCED / 'y.csv')
y_balanced

Unnamed: 0,class
0,0
1,0
2,0
3,0
4,0
...,...
475,3
476,3
477,3
478,3


In [101]:
# Classes that occur in the balanced variant's label file.
classes_in_balanced = set(y_balanced['class'].unique())
print(f"Classes in balanced dataset: {classes_in_balanced}")

# Boolean row selector: True where the sample's class also occurs in y_balanced.
mask = y['class'].isin(classes_in_balanced)

# Keep only the selected rows in both tables and renumber them from 0 so that
# X and y share the same fresh index.
X, y = (table.loc[mask].reset_index(drop=True) for table in (X, y))

Classes in balanced dataset: {np.int64(0), np.int64(1), np.int64(2), np.int64(3)}


# Reduce class indexes by 1 (check whether label 0 is already present)

In [46]:
# Report whether the value 0 occurs anywhere in y (any row, any column).
message = "Label '0' is present in y" if (y == 0).any().any() else "Label '0' is not present in y"
print(message)

Label '0' is not present in y


# Remove gaps in labels

In [57]:
# Remove gaps in labels: re-encode every column of y as consecutive integers 0..k-1.
# sort=True assigns the codes by ascending label value, so the relative order of the
# classes is preserved (e.g. labels 0, 2, 5 become 0, 1, 2) regardless of row order.
# Without it, pd.factorize numbers labels by first appearance, which permutes the
# class ids whenever the rows are not already sorted by class.
y = y.apply(lambda x: pd.factorize(x, sort=True)[0])

# Dataset info

In [105]:
# Number of samples (rows of X) after the filtering steps above.
len(X)

800

In [106]:
# Number of classes: count of distinct values across all cells of y.
len(np.unique(y.values))

4

In [107]:
# Per-class share of samples, as computed by the project helper
# (presumably sorted from most to least represented - confirm in src.experiment.helpers.utils).
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
print(class_percentages_sorted)

0    25.0
1    25.0
2    25.0
3    25.0
Name: proportion, dtype: float64


# Functions

In [94]:
def get_least_represented_class_names(class_percentages_sorted, percent):
    """Return the index labels of the trailing share of a percentage series.

    Parameters
    ----------
    class_percentages_sorted : pandas.Series
        Class percentages indexed by class label; the caller is expected to pass
        them ordered so that the least represented classes come last.
    percent : float
        Fraction of the classes to select (e.g. ``0.5`` for half of them). The
        resulting count is rounded up.

    Returns
    -------
    list
        Labels of the last ``ceil(len(series) * percent)`` entries, in series order.
    """
    how_many = int(np.ceil(len(class_percentages_sorted) * percent))
    return class_percentages_sorted.tail(how_many).index.tolist()

In [95]:
def get_most_represented_class_names(class_percentages_sorted, percent):
    """Return the index labels of the leading share of a percentage series.

    Parameters
    ----------
    class_percentages_sorted : pandas.Series
        Class percentages indexed by class label; the caller is expected to pass
        them ordered so that the most represented classes come first.
    percent : float
        Fraction of the classes to select (e.g. ``0.5`` for half of them). The
        resulting count is rounded up.

    Returns
    -------
    list
        Labels of the first ``ceil(len(series) * percent)`` entries, in series order.
    """
    how_many = int(np.ceil(len(class_percentages_sorted) * percent))
    return class_percentages_sorted.head(how_many).index.tolist()

In [96]:
def show_summary(X, y):
    """Print a class-balance summary of a dataset and return the class percentages.

    Printed, in order: number of samples, number of distinct label values, the
    ratio between the first and last class percentage ("Times higher"), the
    coefficient of variation of the percentages ("CoV"), and the percentages.

    Parameters
    ----------
    X : sized container of samples
        Only ``len(X)`` is used.
    y : pandas.DataFrame or pandas.Series
        Labels; must expose ``.values`` and be accepted by
        ``get_sorted_class_percentages_label_encoded``.

    Returns
    -------
    The object returned by ``get_sorted_class_percentages_label_encoded(y)``
    (a percentage series, presumably sorted from most to least represented).
    """
    print("Number of samples:", len(X))
    print("Number of classes:", len(np.unique(y.values)))

    class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)

    # With no classes left (e.g. after every class has been removed) the positional
    # lookups below would raise IndexError, so report it and stop early.
    if len(class_percentages_sorted) == 0:
        print("No classes left, cannot calculate imbalance statistics.")
        return class_percentages_sorted

    highest_class_percentage = class_percentages_sorted.iloc[0]
    lowest_class_percentage = class_percentages_sorted.iloc[-1]

    # Guard against division by zero when the last percentage is exactly 0.
    if lowest_class_percentage != 0:
        ratio = highest_class_percentage / lowest_class_percentage
        print(f"Times higher: {round(ratio, 2)}")
    else:
        print("The lowest class percentage is 0, cannot calculate ratio.")

    # Coefficient of variation: standard deviation relative to the mean percentage.
    average_class_percentage = class_percentages_sorted.mean()
    class_percentages_std = class_percentages_sorted.std()
    cov = class_percentages_std / average_class_percentage
    print(f"CoV: {round(cov, 3)}")

    print(class_percentages_sorted)

    return class_percentages_sorted

In [97]:
def remove_class(X, y, chosen_class_index):
    """Drop every sample whose label equals ``chosen_class_index``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table sharing its index with ``y``.
    y : pandas.DataFrame or pandas.Series
        Labels. For a DataFrame, the first column is the one compared.
    chosen_class_index
        Label value of the class to remove.

    Returns
    -------
    tuple
        ``(X, y)`` restricted to the rows of other classes; the original index
        labels are kept (no reset).
    """
    # Work on a 1-D label vector: the first column of a DataFrame, or y itself.
    labels = y[y.columns[0]] if isinstance(y, pd.DataFrame) else y

    print(f"Removing class: {chosen_class_index}")

    # True for rows that survive the removal.
    keep = labels != chosen_class_index
    return X.loc[keep], y.loc[keep]


In [98]:
# removes SAMPLES
def remove_few_samples_of_class(X, y, class_index, step):
    """Drop up to ``step`` samples of one class from ``X`` and ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table sharing its index with ``y``.
    y : pandas.DataFrame or pandas.Series
        Labels; the flattened values are compared, so a DataFrame is expected
        to have a single column.
    class_index
        Label value of the class to thin out.
    step : int
        Maximum number of samples to remove; the first matching rows are taken.

    Returns
    -------
    tuple
        New ``(X, y)`` without the removed rows; the inputs are not modified.
    """
    in_class = y.values.flatten() == class_index
    # Index labels of the first `step` rows that belong to the class.
    doomed = y[in_class].index[:step]
    return X.drop(doomed), y.drop(doomed)

# Dataset trimming

In [23]:
# Labels of the leading 80% of classes (count rounded up) in class_percentages_sorted.
some_classes = get_most_represented_class_names(class_percentages_sorted, 0.8)
some_classes

[1, 0]

In [258]:
# remove top class
# Recompute the percentages from the current y and pick the first entry.
# NOTE: .index[0] raises IndexError when the percentages are empty (the recorded
# output of this cell), i.e. when y has no samples left.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[0]

# Rebinds X and y, so re-running this cell removes another class each time.
X,y = remove_class(X, y, chosen_class)
class_percentages_sorted = show_summary(X, y)

IndexError: index 0 is out of bounds for axis 0 with size 0

In [45]:
# remove bottom class
# Recompute the percentages from the current y and pick the last entry.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[-1]

# Rebinds X and y, so re-running this cell removes another class each time.
X,y = remove_class(X, y, chosen_class)
class_percentages_sorted = show_summary(X, y)

Removing class: 4
Number of samples: 800
Number of classes: 4
Times higher: 1.0
CoV: 0.0
0    25.0
1    25.0
2    25.0
3    25.0
Name: proportion, dtype: float64


In [77]:
# remove samples from top class
# Recompute the percentages from the current y and pick the first entry.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[0]

# Drops up to 10 samples of that class per run; X and y are rebound, so each
# re-run of this cell trims further.
X,y = remove_few_samples_of_class(X, y, chosen_class, step=10)
class_percentages_sorted = show_summary(X, y)

Number of samples: 480
Number of classes: 4
Times higher: 1.0
CoV: 0.0
0    25.0
1    25.0
2    25.0
3    25.0
Name: proportion, dtype: float64


In [141]:
# remove samples from a lower-ranked class
# NOTE: index[-3] selects the third entry from the end of the sorted percentages,
# not the very last one; change the position to target a different class.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[-3]

# Drops up to 10 samples of that class per run; X and y are rebound, so each
# re-run of this cell trims further.
X,y = remove_few_samples_of_class(X, y, chosen_class, step=10)
class_percentages_sorted = show_summary(X, y)

Number of samples: 480
Number of classes: 4
Times higher: 2.29
CoV: 0.354
0    33.333333
1    31.250000
2    20.833333
3    14.583333
Name: proportion, dtype: float64


# Trimmed dataset - info

In [140]:
# Print the summary of the current (trimmed) X and y and refresh the percentages
# that the save cells below write out.
class_percentages_sorted = show_summary(X, y)

Number of samples: 490
Number of classes: 4
Times higher: 2.29
CoV: 0.367
0    32.653061
1    32.653061
2    20.408163
3    14.285714
Name: proportion, dtype: float64


# Save

### Basic

In [68]:
# Write the current X and y to the base dataset folder, without the index column.
# NOTE(review): these are the same X.csv / y.csv paths that were read when the
# dataset was loaded, so running this cell replaces the original input files.
X.to_csv(DATASET_DIR / "X.csv", index=False)
y.to_csv(DATASET_DIR / "y.csv", index=False)

In [69]:
# Store the class percentages next to the data; index labels are written, the header row is not.
class_percentages_sorted.to_csv(DATASET_DIR / "class_percentages.csv", header=False)

### Balanced

In [79]:
# Create the balanced dataset folder if needed (no error when it already exists),
# then write the current X and y there without the index column.
DATASET_DIR_BALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_BALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_BALANCED / "y.csv", index=False)

In [80]:
# Store the class percentages next to the balanced data; index labels are written, the header row is not.
class_percentages_sorted.to_csv(DATASET_DIR_BALANCED / "class_percentages.csv", header=False)

### Imbalanced

In [142]:
# Create the imbalanced dataset folder if needed (no error when it already exists),
# then write the current X and y there without the index column.
DATASET_DIR_IMBALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_IMBALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_IMBALANCED / "y.csv", index=False)

In [143]:
# Store the class percentages next to the imbalanced data; index labels are written, the header row is not.
class_percentages_sorted.to_csv(DATASET_DIR_IMBALANCED / "class_percentages.csv", header=False)