# Setup

In [341]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np

In [342]:
# Put the directory three levels above the current working directory on the
# import path so the `src.*` imports below resolve
# (assumes the notebook is launched from its own folder — TODO confirm).
sys.path.append(os.path.abspath("../../.."))

In [343]:
from src.experiment.helpers.variables import dataset_root_dir
from src.experiment.helpers.utils import get_sorted_class_percentages_label_encoded

In [344]:
# Dataset identifier and task type; both are used as path components when the
# dataset directories are built below.
NAME = "colon-0.5"
CLASSIFICATION_TYPE = "binary"

In [345]:
# Directory of the base dataset: <dataset_root_dir>/<CLASSIFICATION_TYPE>/<NAME>
DATASET_DIR = dataset_root_dir / CLASSIFICATION_TYPE / NAME
print(DATASET_DIR)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\binary\colon-0.5


In [346]:
# Directory of the balanced variant, a sibling of DATASET_DIR named "<NAME>_balanced"
balanced_name = NAME + "_balanced"
DATASET_DIR_BALANCED = dataset_root_dir / CLASSIFICATION_TYPE / balanced_name
print(DATASET_DIR_BALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\binary\colon-0.5_balanced


In [347]:
# Directory of the imbalanced variant, a sibling of DATASET_DIR named "<NAME>_imbalanced"
imbalanced_name = NAME + "_imbalanced"
DATASET_DIR_IMBALANCED = dataset_root_dir / CLASSIFICATION_TYPE / imbalanced_name
print(DATASET_DIR_IMBALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\binary\colon-0.5_imbalanced


# Dataset loading

In [540]:
# Load the feature table (X) and the label table (y) of the base dataset.
# Both names are rebound by later cells, so re-run this cell to start over.
X = pd.read_csv(DATASET_DIR / 'X.csv')
y = pd.read_csv(DATASET_DIR / 'y.csv')

In [541]:
# Preview the loaded features (pandas truncates the rendering of large frames)
X

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x53,x54,x55,x56,x57,x58,x59,x60,x61,x62
0,0.529017,0.324412,0.436142,0.672407,0.385966,0.591629,0.647329,-0.218339,1.391723,0.575459,...,-0.402377,0.244848,-0.257350,-0.694400,-0.108642,-0.510170,-1.907441,0.226254,-0.902652,0.305356
1,1.026207,1.166457,0.071509,1.005009,0.419500,0.977392,0.700449,0.399531,-0.249785,0.301207,...,-0.629257,-0.344001,-0.235164,1.164913,-0.325309,0.079989,-0.786775,-0.415554,-0.020382,-0.471993
2,-0.496673,-0.440553,-0.761838,-1.244818,-1.134828,-0.384280,-0.409239,-0.685483,-0.164766,-0.440333,...,-1.049680,-0.069838,-0.652821,-0.643622,-0.607118,-0.733420,-0.560074,-0.299195,-0.562998,-1.007440
3,0.608465,0.130809,-0.649260,-1.028468,0.006053,0.102326,-0.066962,-0.782990,-0.431098,-0.117270,...,0.301158,-0.392406,-0.351681,0.189710,0.538381,0.803642,-0.261229,0.988765,0.545211,0.226809
4,-0.412081,-0.866666,-0.104197,0.021353,-0.534906,-0.403575,0.391729,-0.438812,0.013909,0.416097,...,-0.126862,0.623683,0.188660,0.227028,-0.416755,-0.242779,0.318200,0.199640,-0.407931,0.580741
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2549995,0.034323,-0.672288,0.257126,0.111580,0.107512,0.682698,-0.552540,-1.472325,0.110694,0.323488,...,-0.873854,-0.288949,-0.285236,0.576399,-0.118355,0.459106,-0.163083,-0.536511,0.745296,1.059606
2549996,-0.755059,-0.579062,-0.518623,-0.961513,-0.459879,-0.580180,-1.142283,-0.911493,-0.888238,-0.965574,...,-1.493009,-0.578043,-0.438491,-0.642800,-0.261973,-0.190847,-0.640267,-0.289657,-1.134306,-0.244575
2549997,-0.433831,-0.620781,-0.934634,-0.639863,-0.373569,-0.504319,-0.824410,-0.247597,-0.659062,-0.766601,...,-1.260071,-1.542358,-0.650161,-1.098483,-0.596796,-1.676064,-1.765397,-0.796059,-1.109716,-1.093553
2549998,-0.305396,-0.304284,-0.373917,-0.239466,0.102176,0.099624,-0.745985,-0.564119,0.616056,0.182470,...,-0.462267,0.075264,-0.894071,-0.466849,-0.620570,-0.065862,0.056138,0.152047,0.168283,0.374373


In [542]:
# Preview the loaded labels (pandas truncates the rendering of large frames)
y

Unnamed: 0,y
0,1
1,0
2,1
3,0
4,1
...,...
2549995,1
2549996,1
2549997,1
2549998,1


# Cut classes to match balanced dataset

In [100]:
# Load the labels of the balanced variant; only the set of classes it contains
# is used to filter X and y in the next cell.
y_balanced = pd.read_csv(DATASET_DIR_BALANCED / 'y.csv')
y_balanced

Unnamed: 0,class
0,0
1,0
2,0
3,0
4,0
...,...
475,3
476,3
477,3
478,3


In [101]:
# The label column is selected by position (first column) instead of by the
# hard-coded name 'class': the header differs between datasets ("y" vs.
# "class"), and remove_class() already treats the first column as the labels.

# Get the set of classes present in y_balanced
classes_in_balanced = set(y_balanced.iloc[:, 0].unique())
print(f"Classes in balanced dataset: {classes_in_balanced}")

# Create mask for samples in y whose class is present in y_balanced
mask = y.iloc[:, 0].isin(classes_in_balanced)

# Filter X and y with the same mask so their rows stay aligned, then renumber
X = X.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)

Classes in balanced dataset: {np.int64(0), np.int64(1), np.int64(2), np.int64(3)}


# Reduce class indexes by 1

In [46]:
# Report whether the value 0 occurs anywhere in y
print(
    "Label '0' is present in y"
    if y.eq(0).to_numpy().any()
    else "Label '0' is not present in y"
)

Label '0' is not present in y


# Remove gaps in labels

In [57]:
# Remove gaps in labels while keeping their numeric order, e.g. labels
# {0, 2, 5} become {0, 1, 2}.
# sort=True is required: without it pd.factorize numbers values by order of
# first appearance, so a column starting 1, 0, ... would get its classes
# swapped instead of merely compacted.
y = y.apply(lambda col: pd.factorize(col, sort=True)[0])

# Dataset info

In [351]:
# sample count = number of rows in X
X.shape[0]

2550000

In [352]:
# class count = how many distinct values occur anywhere in y
np.unique(y.values).size

2

In [353]:
# Per-class share of the samples in y (percent), computed by the project helper.
# Presumably ordered from most to least frequent class — confirm in the helper.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
print(class_percentages_sorted)

1    50.03149
0    49.96851
Name: proportion, dtype: float64


# Functions

In [354]:
def get_least_represented_class_names(class_percentages_sorted, percent):
    """Return the labels of the least represented classes.

    Args:
        class_percentages_sorted: pandas Series indexed by class label. The
            labels are taken from the END of the Series, so it is expected
            to be ordered from most to least represented.
        percent: Fraction of the classes to return (e.g. 0.25 for a quarter
            of them). The resulting count is rounded up, so any positive
            fraction selects at least one class.

    Returns:
        list: Index labels of the selected classes, in the Series' order.
    """
    num_classes = len(class_percentages_sorted)
    # Round before ceil: a product such as 100 * 0.07 evaluates to
    # 7.000000000000001 in floating point and would otherwise be counted as 8.
    bottom_percent_count = int(np.ceil(round(num_classes * percent, 9)))
    # Clamp to [0, num_classes]: a negative count would make tail() return
    # "everything except the first n" instead of nothing.
    bottom_percent_count = max(0, min(num_classes, bottom_percent_count))
    least_represented_classes = class_percentages_sorted.tail(bottom_percent_count).index.tolist()
    return least_represented_classes

In [355]:
def get_most_represented_class_names(class_percentages_sorted, percent):
    """Return the labels of the most represented classes.

    Args:
        class_percentages_sorted: pandas Series indexed by class label. The
            labels are taken from the START of the Series, so it is expected
            to be ordered from most to least represented.
        percent: Fraction of the classes to return (e.g. 0.25 for a quarter
            of them). The resulting count is rounded up, so any positive
            fraction selects at least one class.

    Returns:
        list: Index labels of the selected classes, in the Series' order.
    """
    num_classes = len(class_percentages_sorted)
    # Round before ceil: a product such as 100 * 0.07 evaluates to
    # 7.000000000000001 in floating point and would otherwise be counted as 8.
    top_percent_count = int(np.ceil(round(num_classes * percent, 9)))
    # Clamp to [0, num_classes]: a negative count would make head() return
    # "everything except the last n" instead of nothing.
    top_percent_count = max(0, min(num_classes, top_percent_count))
    most_represented_classes = class_percentages_sorted.head(top_percent_count).index.tolist()
    return most_represented_classes

In [356]:
def show_summary(X, y):
    """Print size and class-balance statistics for a dataset.

    Prints the number of samples, the number of distinct label values, the
    ratio between the first and the last entry of the class-percentage
    Series ("Times higher"), the coefficient of variation of the class
    percentages (std / mean) and the percentages themselves.

    Args:
        X: Feature table; only its length is used.
        y: Label table (pandas object exposing `.values`).

    Returns:
        The class-percentage Series returned by
        get_sorted_class_percentages_label_encoded(y).
    """
    print("Number of samples:", len(X))
    print("Number of classes:", len(np.unique(y.values)))

    class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)

    # With no class left (e.g. every sample was removed) there is nothing to
    # compare, and iloc[0] below would raise IndexError.
    if len(class_percentages_sorted) == 0:
        print("No classes left, cannot calculate imbalance statistics.")
        return class_percentages_sorted

    # First / last entry; assumes the helper sorts from most to least frequent.
    highest_class_percentage = class_percentages_sorted.iloc[0]
    lowest_class_percentage = class_percentages_sorted.iloc[-1]

    if lowest_class_percentage != 0:
        ratio = highest_class_percentage / lowest_class_percentage
        print(f"Times higher: {round(ratio, 2)}")
    else:
        print("The lowest class percentage is 0, cannot calculate ratio.")

    average_class_percentage = class_percentages_sorted.mean()
    class_percentages_std = class_percentages_sorted.std()

    # Coefficient of variation: spread of the class shares relative to their mean
    cov = class_percentages_std / average_class_percentage
    print(f"CoV: {round(cov, 3)}")

    print(class_percentages_sorted)

    return class_percentages_sorted

In [357]:
def remove_class(X, y, chosen_class_index):
    """Drop every sample labelled `chosen_class_index` from X and y.

    Args:
        X: Feature table sharing its index with y.
        y: Labels as a Series, or a DataFrame whose first column holds them.
        chosen_class_index: Label value of the class to remove.

    Returns:
        tuple: (X, y) restricted to the remaining samples; the original
        index labels are kept (no reset).
    """
    # A DataFrame of labels is reduced to its first column so the mask is 1D
    labels = y[y.columns[0]] if isinstance(y, pd.DataFrame) else y

    print(f"Removing class: {chosen_class_index}")

    # True for every sample that belongs to some other class
    keep = labels != chosen_class_index

    X_kept = X.loc[keep]
    y_kept = y.loc[keep]
    return X_kept, y_kept


In [358]:
# removes SAMPLES
def remove_few_samples_of_class(X, y, class_index, step):
    """Drop the first `step` samples of one class from X and y.

    Args:
        X: Feature table sharing its index with y.
        y: Label table; its values are flattened to find matching rows.
        class_index: Label value of the class to thin out.
        step: Maximum number of samples of that class to remove.

    Returns:
        tuple: (X, y) without the removed rows; the inputs are not modified.
    """
    # Rows whose label equals the requested class
    is_target = y.values.flatten() == class_index
    # Index labels of the first `step` such rows
    doomed = y[is_target].index[:step]
    return X.drop(doomed), y.drop(doomed)

# Dataset trimming

In [23]:
# Labels of the top 80% of classes (the count is rounded up by the helper)
some_classes = get_most_represented_class_names(class_percentages_sorted, 0.8)
some_classes

[1, 0]

In [258]:
# remove top class
# NOTE: rebinds X and y, so every re-run of this cell removes one more class.
# If y is already empty, `.index[0]` raises IndexError.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
# first entry = top class (assumes the helper sorts from most to least frequent — TODO confirm)
chosen_class = class_percentages_sorted.index[0]

X,y = remove_class(X, y, chosen_class)
class_percentages_sorted = show_summary(X, y)

IndexError: index 0 is out of bounds for axis 0 with size 0

In [45]:
# remove bottom class
# NOTE: rebinds X and y, so every re-run of this cell removes one more class.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
# last entry = bottom class (assumes the helper sorts from most to least frequent — TODO confirm)
chosen_class = class_percentages_sorted.index[-1]

X,y = remove_class(X, y, chosen_class)
class_percentages_sorted = show_summary(X, y)

Removing class: 4
Number of samples: 800
Number of classes: 4
Times higher: 1.0
CoV: 0.0
0    25.0
1    25.0
2    25.0
3    25.0
Name: proportion, dtype: float64


In [706]:
# remove samples from top class
# NOTE: rebinds X and y; each run drops up to `step` more samples of the class
# that is currently first in the percentage table.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[0]

X,y = remove_few_samples_of_class(X, y, chosen_class, step=1)
class_percentages_sorted = show_summary(X, y)

Number of samples: 1710
Number of classes: 2
Times higher: 4.96
CoV: 0.94
0    83.216374
1    16.783626
Name: proportion, dtype: float64


In [664]:
# remove samples from bottom class
# NOTE: rebinds X and y; each run drops up to `step` more samples of the class
# that is currently last in the percentage table.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[-1]

X,y = remove_few_samples_of_class(X, y, chosen_class, step=100)
class_percentages_sorted = show_summary(X, y)

Number of samples: 2274
Number of classes: 2
Times higher: 6.92
CoV: 1.057
0    87.379068
1    12.620932
Name: proportion, dtype: float64


# Trimmed dataset - info

In [707]:
# Summarise the trimmed dataset and keep its class percentages for the save cells below
class_percentages_sorted = show_summary(X, y)

Number of samples: 1710
Number of classes: 2
Times higher: 4.96
CoV: 0.94
0    83.216374
1    16.783626
Name: proportion, dtype: float64


# Save

### Basic

In [68]:
# NOTE: writes to the same X.csv / y.csv that the loading cell reads, so the
# source files in DATASET_DIR are overwritten with the current X and y.
# index=False keeps the row index out of the files.
X.to_csv(DATASET_DIR / "X.csv", index=False)
y.to_csv(DATASET_DIR / "y.csv", index=False)

In [69]:
# Store the class percentages next to the data; header=False omits the header row
class_percentages_sorted.to_csv(DATASET_DIR / "class_percentages.csv", header=False)

### Balanced

In [339]:
# Write the current X and y as the balanced variant, creating its directory
# if needed. index=False keeps the row index out of the files.
DATASET_DIR_BALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_BALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_BALANCED / "y.csv", index=False)

In [340]:
# Store the class percentages of the balanced variant; header=False omits the header row
class_percentages_sorted.to_csv(DATASET_DIR_BALANCED / "class_percentages.csv", header=False)

### Imbalanced

In [708]:
# Write the current X and y as the imbalanced variant, creating its directory
# if needed. index=False keeps the row index out of the files.
DATASET_DIR_IMBALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_IMBALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_IMBALANCED / "y.csv", index=False)

In [709]:
# Store the class percentages of the imbalanced variant; header=False omits the header row
class_percentages_sorted.to_csv(DATASET_DIR_IMBALANCED / "class_percentages.csv", header=False)