# Setup

In [595]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np

In [596]:
# Add the directory three levels up to the import path so that the `src.*` imports
# in the next cell can be resolved.
# NOTE(review): the relative path is resolved against the kernel's current working
# directory — this assumes the notebook is started from its own folder; confirm.
sys.path.append(os.path.abspath("../../.."))

In [597]:
from src.experiment.helpers.variables import dataset_root_dir
from src.experiment.helpers.utils import get_sorted_class_percentages_label_encoded

In [598]:
# Dataset identifier and task type; both are used as path components by the cells below.
NAME = "usps"
CLASSIFICATION_TYPE = "multiclass"

In [599]:
# Directory that the source X.csv / y.csv are read from in the "Dataset loading" section.
DATASET_DIR = dataset_root_dir / CLASSIFICATION_TYPE / NAME
print(DATASET_DIR)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\usps


In [600]:
# Directory of the balanced variant: same root and task type, name suffixed with "_balanced".
balanced_name = NAME + "_balanced"
DATASET_DIR_BALANCED = dataset_root_dir / CLASSIFICATION_TYPE / balanced_name
print(DATASET_DIR_BALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\usps_balanced


In [601]:
# Directory of the imbalanced variant: same root and task type, name suffixed with "_imbalanced".
imbalanced_name = NAME + "_imbalanced"
DATASET_DIR_IMBALANCED = dataset_root_dir / CLASSIFICATION_TYPE / imbalanced_name
print(DATASET_DIR_IMBALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\usps_imbalanced


# Dataset loading

In [1163]:
# Load the feature table (X) and the label table (y) from the source dataset directory.
X = pd.read_csv(DATASET_DIR / 'X.csv')
y = pd.read_csv(DATASET_DIR / 'y.csv')

In [1164]:
# Display the loaded feature table.
X

Unnamed: 0,double1,double2,double3,double4,double5,double6,double7,double8,double9,double10,...,double247,double248,double249,double250,double251,double252,double253,double254,double255,double256
0,-1.000000,-1.000000,-1.000000,-0.999997,-0.999729,-0.989537,-0.860165,-0.395924,0.048299,-0.282151,...,0.131298,0.384728,0.433073,0.193397,-0.237439,-0.649848,-0.906359,-0.988672,-0.999475,-0.999994
1,-0.999927,-0.993644,-0.900309,-0.632621,-0.443145,-0.454436,-0.474872,-0.431176,-0.494539,-0.583648,...,-0.388084,-0.273639,0.001094,0.294825,0.316915,0.113494,-0.092331,-0.329390,-0.584681,-0.868793
2,-1.000000,-1.000000,-1.000000,-0.999995,-0.999986,-0.999987,-0.999990,-0.999316,-0.976299,-0.725917,...,-0.999778,-0.988816,-0.821857,-0.277379,0.136501,-0.247816,-0.809498,-0.988116,-0.999864,-1.000000
3,-0.999998,-0.999800,-0.994136,-0.932288,-0.673685,-0.192056,0.218820,0.337224,0.193371,-0.065072,...,-0.373861,0.196530,0.002823,-0.695900,-0.976372,-0.999458,-0.999996,-1.000000,-1.000000,-1.000000
4,-1.000000,-0.999971,-0.997452,-0.957499,-0.801681,-0.510623,-0.107965,0.217262,0.216431,-0.008529,...,0.214362,0.339240,0.458489,0.473456,0.372240,0.173322,-0.144182,-0.542254,-0.863031,-0.981978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9293,-1.000000,-0.999994,-0.999321,-0.980459,-0.794210,-0.254973,0.280592,0.447993,0.219635,-0.381608,...,0.440799,0.424252,0.273223,0.045554,-0.271012,-0.624501,-0.891083,-0.986282,-0.999364,-0.999993
9294,-1.000000,-0.999999,-0.999929,-0.997031,-0.958108,-0.788782,-0.422594,0.050120,0.359004,0.317717,...,-0.723952,-0.045200,0.243265,-0.211727,-0.797285,-0.987273,-0.999854,-1.000000,-1.000000,-1.000000
9295,-0.999985,-0.998547,-0.966704,-0.723508,-0.155824,0.073144,-0.394682,-0.861495,-0.990816,-0.997802,...,0.313235,0.253947,-0.374854,-0.863926,-0.991874,-0.999908,-1.000000,-1.000000,-1.000000,-1.000000
9296,-0.999889,-0.996943,-0.962894,-0.772274,-0.303180,0.199492,0.377436,0.159035,-0.186853,-0.523479,...,0.422100,0.266813,0.097945,-0.142201,-0.559187,-0.867977,-0.978216,-0.998277,-0.999924,-0.999998


In [1165]:
# Display the loaded label table.
y

Unnamed: 0,int0
0,7
1,6
2,5
3,8
4,4
...,...
9293,4
9294,10
9295,5
9296,1


# Cut classes to match balanced dataset

In [1166]:
# Load the labels stored for the balanced variant; the next cell only uses the set of
# classes that occur in them.
y_balanced = pd.read_csv(DATASET_DIR_BALANCED / 'y.csv')
y_balanced

Unnamed: 0,int0
0,9
1,9
2,9
3,9
4,9
...,...
2200,6
2201,3
2202,1
2203,1


In [1167]:
# Get the set of classes present in y_balanced
classes_in_balanced = set(y_balanced['int0'].unique())
print(f"Classes in balanced dataset: {classes_in_balanced}")

# Boolean mask: True for rows of y whose class also occurs in y_balanced
mask = y['int0'].isin(classes_in_balanced)

# Filter X and y with the same mask and renumber the rows from 0 so both stay aligned.
# NOTE(review): X and y are overwritten here, so the unfiltered data is only
# recoverable by re-running the loading cell.
X = X.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)

Classes in balanced dataset: {np.int64(1), np.int64(2), np.int64(3), np.int64(6), np.int64(7), np.int64(9)}


# Dataset info

In [1168]:
# number of samples (rows of X) after the class filter above
len(X)

6009

In [1169]:
# number of classes (number of unique values in y; np.unique looks at every value in
# y.values, i.e. across all columns of y)
len(np.unique(y.values))

6

In [1170]:
# Per-class share of the samples, indexed by class label.
# Presumably ordered from most to least frequent (the printed output and the
# head/tail helpers below rely on that) — verify against the helper's implementation.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
print(class_percentages_sorted)

1    25.844566
2    21.118323
3    15.460143
7    13.879181
6    11.915460
9    11.782327
Name: proportion, dtype: float64


# Functions

In [1171]:
def get_least_represented_class_names(class_percentages_sorted, percent):
    """Return the labels of the least represented classes.

    Parameters
    ----------
    class_percentages_sorted : pandas.Series
        Per-class values indexed by class label. Entries are taken from the end,
        so the series is expected to be ordered from most to least represented.
    percent : float
        Fraction of the classes to return (e.g. 0.2 for the bottom 20 %).
        The resulting count is rounded up to a whole number of classes.

    Returns
    -------
    list
        Index labels of the last ``ceil(len(series) * percent)`` entries,
        in the order they appear in the series.
    """
    num_classes = len(class_percentages_sorted)
    # Strip binary floating-point noise before taking the ceiling: for example
    # 100 * 0.07 evaluates to 7.000000000000001, which would otherwise select
    # 8 classes instead of 7.
    exact_count = round(num_classes * percent, 9)
    bottom_percent_count = int(np.ceil(exact_count))
    return class_percentages_sorted.tail(bottom_percent_count).index.tolist()

In [1172]:
def get_most_represented_class_names(class_percentages_sorted, percent):
    """Return the labels of the most represented classes.

    Parameters
    ----------
    class_percentages_sorted : pandas.Series
        Per-class values indexed by class label. Entries are taken from the start,
        so the series is expected to be ordered from most to least represented.
    percent : float
        Fraction of the classes to return (e.g. 0.2 for the top 20 %).
        The resulting count is rounded up to a whole number of classes.

    Returns
    -------
    list
        Index labels of the first ``ceil(len(series) * percent)`` entries,
        in the order they appear in the series.
    """
    num_classes = len(class_percentages_sorted)
    # Strip binary floating-point noise before taking the ceiling: for example
    # 100 * 0.07 evaluates to 7.000000000000001, which would otherwise select
    # 8 classes instead of 7.
    exact_count = round(num_classes * percent, 9)
    top_percent_count = int(np.ceil(exact_count))
    return class_percentages_sorted.head(top_percent_count).index.tolist()

In [1173]:
def show_summary(X, y):
    """Print a short imbalance report for the dataset and return the class percentages.

    Prints the number of rows in ``X``, the number of distinct values in ``y``,
    the ratio between the first and last entry of the sorted class percentages
    (skipped with a message when the last entry is 0), the coefficient of
    variation (std / mean) of the percentages, and the percentages themselves.

    Parameters
    ----------
    X : object supporting ``len``
        Feature table; only its length is used.
    y : pandas.DataFrame or pandas.Series
        Labels; passed to ``get_sorted_class_percentages_label_encoded``.

    Returns
    -------
    The object returned by ``get_sorted_class_percentages_label_encoded(y)``.
    """
    print("Number of samples:", len(X))
    print("Number of classes:", len(np.unique(y.values)))

    percentages = get_sorted_class_percentages_label_encoded(y)

    # First entry vs. last entry of the sorted percentages.
    top_share, bottom_share = percentages.iloc[0], percentages.iloc[-1]
    if bottom_share == 0:
        print("The lowest class percentage is 0, cannot calculate ratio.")
    else:
        print(f"Times higher: {round(top_share / bottom_share, 2)}")

    # Coefficient of variation of the class percentages.
    cov = percentages.std() / percentages.mean()
    print(f"CoV: {round(cov, 3)}")

    print(percentages)
    return percentages

In [1174]:
def remove_class(X, y, chosen_class_index):
    """Drop every sample whose label equals ``chosen_class_index``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table, indexed like ``y``.
    y : pandas.DataFrame or pandas.Series
        Labels. For a DataFrame the first column is compared against the class.
    chosen_class_index
        Label value of the class to remove.

    Returns
    -------
    tuple
        ``(X, y)`` restricted to the remaining rows; the original index labels
        are kept (no re-indexing).
    """
    # Reduce the labels to one dimension: first column of a DataFrame, or y itself.
    labels = y[y.columns[0]] if isinstance(y, pd.DataFrame) else y

    print(f"Removing class: {chosen_class_index}")

    # True for the rows that stay.
    keep = labels != chosen_class_index
    X_kept = X.loc[keep]
    y_kept = y.loc[keep]
    return X_kept, y_kept


In [1175]:
# removes SAMPLES
def remove_few_samples_of_class(X, y, class_index, step):
    """Drop up to ``step`` samples of one class from both ``X`` and ``y``.

    The samples removed are the first ``step`` rows (in the current row order of
    ``y``) whose label equals ``class_index``; fewer are removed if the class has
    fewer rows.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table sharing index labels with ``y``.
    y : pandas.DataFrame or pandas.Series
        Labels; the flattened values are compared against ``class_index``.
    class_index
        Label value of the class to thin out.
    step : int
        Maximum number of rows to remove.

    Returns
    -------
    tuple
        ``(X, y)`` without the selected rows; remaining index labels are unchanged.
    """
    is_target = y.values.flatten() == class_index
    # Index labels of the first `step` matching rows.
    doomed = y[is_target].index[:step]
    return X.drop(doomed), y.drop(doomed)

# Dataset trimming

In [23]:
# Labels of the top 80 % of classes (count of classes rounded up) in the sorted ranking.
# NOTE(review): `some_classes` is not used by any later cell visible in this notebook.
some_classes = get_most_represented_class_names(class_percentages_sorted, 0.8)
some_classes

[1, 0]

In [None]:
# remove top class: drop every sample of the class listed first in the sorted percentages
# NOTE(review): X and y are overwritten, so every run of this cell removes one more class.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[0]

X,y = remove_class(X, y, chosen_class)
class_percentages_sorted = show_summary(X, y)

Removing class: 1
Number of samples: 600
Number of classes: 1
0    100.0
Name: proportion, dtype: float64


In [53]:
# remove a low-frequency class: drop every sample of the class third from the end
# of the sorted percentages
# NOTE(review): the comment here used to say "remove bottom class", but index[-3] is
# not the last entry (that would be index[-1]) — confirm which one is intended.
# NOTE(review): X and y are overwritten, so every run of this cell removes one more class.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[-3]

X,y = remove_class(X, y, chosen_class)
class_percentages_sorted = show_summary(X, y)

Removing class: 8
Number of samples: 6009
Number of classes: 6
1    25.844566
2    21.118323
3    15.460143
7    13.879181
6    11.915460
9    11.782327
Name: proportion, dtype: float64


In [1310]:
# remove samples from top class: drop up to 10 rows of the class listed first in the
# sorted percentages
# NOTE(review): X and y are overwritten, so the cell is meant to be re-run until the
# summary shows the desired distribution; each run removes up to `step` more rows.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[0]

X,y = remove_few_samples_of_class(X, y, chosen_class, step=10)
class_percentages_sorted = show_summary(X, y)

Number of samples: 2419
Number of classes: 6
Times higher: 5.86
CoV: 0.433
1    26.167838
6    18.850765
7    18.768086
2    18.147995
3    13.600661
9     4.464655
Name: proportion, dtype: float64


In [1365]:
# remove samples from bottom class: drop one row of the class listed last in the
# sorted percentages
# NOTE(review): X and y are overwritten, so each run of this cell removes one more row.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[-1]

X,y = remove_few_samples_of_class(X, y, chosen_class, step=1)
class_percentages_sorted = show_summary(X, y)

Number of samples: 2127
Number of classes: 6
Times higher: 7.03
CoV: 0.488
1    29.760226
7    17.207334
2    17.160320
6    17.113305
3    14.527504
9     4.231312
Name: proportion, dtype: float64


# Trimmed dataset - info

In [1366]:
# Final report for the trimmed X / y; the returned percentages are what the
# "Save" cells below write to class_percentages.csv.
class_percentages_sorted = show_summary(X, y)

Number of samples: 2127
Number of classes: 6
Times higher: 7.03
CoV: 0.488
1    29.760226
7    17.207334
2    17.160320
6    17.113305
3    14.527504
9     4.231312
Name: proportion, dtype: float64


# Save

### Balanced

In [None]:
# Write the current X / y to the balanced dataset directory (the row index is not written).
# NOTE(review): this cell and the "Imbalanced" save cell write the same X / y objects —
# run only the one that matches the current trimming state. This cell also overwrites
# the y.csv that the "Cut classes to match balanced dataset" cell reads.
DATASET_DIR_BALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_BALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_BALANCED / "y.csv", index=False)

In [373]:
# Store the class percentages next to the balanced dataset; header=False omits the header row,
# so the file contains only "label,percentage" lines.
class_percentages_sorted.to_csv(DATASET_DIR_BALANCED / "class_percentages.csv", header=False)

### Imbalanced

In [1367]:
# Write the current X / y to the imbalanced dataset directory (the row index is not written).
# NOTE(review): this cell and the "Balanced" save cell write the same X / y objects —
# run only the one that matches the current trimming state.
DATASET_DIR_IMBALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_IMBALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_IMBALANCED / "y.csv", index=False)

In [1368]:
# Store the class percentages next to the imbalanced dataset; header=False omits the header row,
# so the file contains only "label,percentage" lines.
class_percentages_sorted.to_csv(DATASET_DIR_IMBALANCED / "class_percentages.csv", header=False)