# Setup

In [248]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np

In [249]:
# Make the directory three levels above the working directory importable so
# that the `src.*` imports in the next cell resolve.
project_root = os.path.abspath("../../..")
# Re-running this cell must not keep appending the same entry to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [250]:
from src.experiment.helpers.variables import dataset_root_dir
from src.experiment.helpers.utils import get_sorted_class_percentages_label_encoded

In [251]:
# Dataset folder name and task type; together they select the dataset
# directory under dataset_root_dir (see DATASET_DIR below).
NAME = "stocks_imbalanced"
CLASSIFICATION_TYPE = "multiclass"

In [252]:
# Directory of the dataset selected by CLASSIFICATION_TYPE and NAME.
DATASET_DIR = dataset_root_dir / CLASSIFICATION_TYPE / NAME
print(DATASET_DIR)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\stocks_imbalanced


In [253]:
# Sibling directory for the balanced variant: same root and task type,
# folder name is NAME with a "_balanced" suffix.
balanced_name = f"{NAME}_balanced"
DATASET_DIR_BALANCED = dataset_root_dir / CLASSIFICATION_TYPE / balanced_name
print(DATASET_DIR_BALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\stocks_imbalanced_balanced


In [254]:
# Sibling directory for the imbalanced variant: same root and task type,
# folder name is NAME with an "_imbalanced" suffix.
imbalanced_name = f"{NAME}_imbalanced"
DATASET_DIR_IMBALANCED = dataset_root_dir / CLASSIFICATION_TYPE / imbalanced_name
print(DATASET_DIR_IMBALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\stocks_imbalanced_imbalanced


# Dataset loading

In [255]:
# Load features (X) and labels (y) from the dataset directory.
# NOTE(review): X and y are assumed to be row-aligned by position — confirm.
X = pd.read_csv(DATASET_DIR / 'X.csv')
y = pd.read_csv(DATASET_DIR / 'y.csv')

In [256]:
# Show the loaded feature matrix (pandas truncates the display).
X

Unnamed: 0,1347,2u,aar,actinium,actuant,acxiom,adcare,adt,advent,aecom,...,xunlei,xylem,yuchai,yume,yy,zayo,zeltiq,zendesk,ziff,zogenix
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35535,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35536,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35537,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35538,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [257]:
# Show the loaded labels (pandas truncates the display).
y

Unnamed: 0,stock
0,0
1,0
2,0
3,0
4,0
...,...
35535,1410
35536,1410
35537,1410
35538,1410


# Cut classes to match balanced dataset

In [13]:
# Load the labels stored in the balanced dataset directory; the next cell uses
# them to decide which classes to keep in X and y.
y_balanced = pd.read_csv(DATASET_DIR_BALANCED / 'y_2.csv')
y_balanced

Unnamed: 0,stock
0,0
1,0
2,0
3,0
4,0
...,...
14695,1410
14696,1410
14697,1410
14698,1410


In [14]:
# Collect the distinct class labels that occur in y_balanced
balanced_labels = y_balanced['stock'].unique()
classes_in_balanced = set(balanced_labels)
print(f"Classes in balanced dataset: {classes_in_balanced}")

# Boolean row mask: True where the sample's class also occurs in y_balanced
mask = y['stock'].isin(classes_in_balanced)

# Keep only the matching rows of X and y and renumber them from 0
X, y = (frame.loc[mask].reset_index(drop=True) for frame in (X, y))

Classes in balanced dataset: {np.int64(0), np.int64(1410), np.int64(147), np.int64(532), np.int64(1310), np.int64(675), np.int64(425), np.int64(1079), np.int64(1086), np.int64(74), np.int64(850), np.int64(1362), np.int64(1370), np.int64(986), np.int64(477), np.int64(608), np.int64(611), np.int64(488), np.int64(496), np.int64(499)}


# Reduce class indexes by 1

In [11]:
# Report whether any cell of y equals 0 (i.e. whether label 0 is in use)
has_zero_label = (y == 0).any().any()
print("Label '0' is present in y" if has_zero_label else "Label '0' is not present in y")

Label '0' is not present in y


In [13]:
# Reduce labels by 1 so that 1-based labels become 0-based.
# The label column is looked up rather than hard-coded as 'Class': the column
# name differs per dataset (it is 'stock' here), and a wrong name raises
# KeyError. Like remove_class, this takes the first column of y as the labels.
label_col = y.columns[0]
# Only shift when the smallest label is at least 1; shifting labels that
# already include 0 would produce a -1 label.
if y[label_col].min() >= 1:
    y[label_col] = y[label_col] - 1
    print(f"Shifted labels in column '{label_col}' down by 1")
else:
    print(f"Labels in column '{label_col}' are not 1-based; left unchanged")

# Remove gaps in labels

In [446]:
# Remove gaps in labels: if the labels are 0 and 2, the 2 becomes 1.
# sort=True numbers the labels in their sorted order. The default numbers them
# in order of first appearance, so the mapping would depend on row order
# (e.g. a 2 that appears before a 0 would become 0).
y = y.apply(lambda col: pd.factorize(col, sort=True)[0])

# Dataset info

In [15]:
# Number of samples (rows of X)
len(X)

42842

In [16]:
# Number of classes: count of distinct values found anywhere in y
len(np.unique(y.values))

20

In [17]:
# Count how many samples each class has (no dtype conversion happens here)
class_counts = y.value_counts()
print(class_counts)

stock
1362     2929
496      2820
532      2570
675      2517
147      2400
1086     2365
850      2299
986      2270
74       1992
488      1990
425      1946
1370     1920
611      1901
608      1887
1410     1866
477      1852
1079     1848
1310     1845
0        1829
499      1796
Name: count, dtype: int64


In [18]:
# Per-class percentages from the project helper; the trimming cells further
# down read this variable to pick which classes to remove.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
print(class_percentages_sorted)

1362    6.836749
496     6.582326
532     5.998786
675     5.875076
147     5.601979
1086    5.520284
850     5.366229
986     5.298539
74      4.649643
488     4.644975
425     4.542272
1370    4.481583
611     4.437234
608     4.404556
1410    4.355539
477     4.322861
1079    4.313524
1310    4.306522
0       4.269175
499     4.192148
Name: proportion, dtype: float64


# Functions

In [21]:
def get_least_represented_class_names(class_percentages_sorted, percent):
    """Return the labels of the least represented classes.

    Parameters
    ----------
    class_percentages_sorted : pandas.Series
        Per-class share indexed by class label. Entries are taken from the
        tail, so the series is expected to be ordered from most to least
        represented.
    percent : float
        Fraction of the classes to return (e.g. 0.5 for the bottom half).
        The resulting count is rounded up and clamped to
        ``[0, number of classes]``.

    Returns
    -------
    list
        Index labels of the selected classes, in the order they appear in
        ``class_percentages_sorted``.
    """
    num_classes = len(class_percentages_sorted)
    # Round before ceil: float products such as 100 * 0.07 evaluate to
    # 7.000000000000001, which ceil alone would turn into 8.
    bottom_percent_count = int(np.ceil(round(num_classes * percent, 9)))
    # Clamp: a negative count would make .tail() drop rows from the front
    # instead of returning nothing.
    bottom_percent_count = min(max(bottom_percent_count, 0), num_classes)
    return class_percentages_sorted.tail(bottom_percent_count).index.tolist()

In [22]:
def get_most_represented_class_names(class_percentages_sorted, percent):
    """Return the labels of the most represented classes.

    Parameters
    ----------
    class_percentages_sorted : pandas.Series
        Per-class share indexed by class label. Entries are taken from the
        head, so the series is expected to be ordered from most to least
        represented.
    percent : float
        Fraction of the classes to return (e.g. 0.2 for the top fifth).
        The resulting count is rounded up and clamped to
        ``[0, number of classes]``.

    Returns
    -------
    list
        Index labels of the selected classes, in the order they appear in
        ``class_percentages_sorted``.
    """
    num_classes = len(class_percentages_sorted)
    # Round before ceil: float products such as 100 * 0.07 evaluate to
    # 7.000000000000001, which ceil alone would turn into 8.
    top_percent_count = int(np.ceil(round(num_classes * percent, 9)))
    # Clamp: a negative count would make .head() return all but the last
    # rows instead of returning nothing.
    top_percent_count = min(max(top_percent_count, 0), num_classes)
    return class_percentages_sorted.head(top_percent_count).index.tolist()

In [23]:
def show_summary(X, y):
    """Print size, class-count and imbalance statistics for a dataset.

    Prints the number of rows in ``X``, the number of distinct values in
    ``y``, the ratio between the first and the last entry of the class
    percentages returned by ``get_sorted_class_percentages_label_encoded``
    (skipped when the last entry is 0), the coefficient of variation
    (std / mean) of those percentages, and the percentages themselves.

    Returns the class percentages so the caller can reuse them.
    """
    print("Number of samples:", len(X))
    distinct_labels = np.unique(y.values)
    print("Number of classes:", len(distinct_labels))

    shares = get_sorted_class_percentages_label_encoded(y)
    top_share, bottom_share = shares.iloc[0], shares.iloc[-1]

    if bottom_share == 0:
        # The ratio is undefined here, so only explain why it is missing.
        print("The lowest class percentage is 0, cannot calculate ratio.")
    else:
        print(f"Times higher: {round(top_share / bottom_share, 2)}")

    # Coefficient of variation: spread of the class shares relative to their mean.
    cov = shares.std() / shares.mean()
    print(f"CoV: {round(cov, 3)}")

    print(shares)
    return shares

In [24]:
def remove_class(X, y, chosen_class_index):
    """Drop every sample whose label equals ``chosen_class_index``.

    ``y`` may be a DataFrame (its first column is used as the labels) or a
    1D object such as a Series. Returns the filtered ``(X, y)`` pair; the
    original row index is kept, not reset.
    """
    # Reduce y to a 1D label vector so the comparison yields a 1D mask.
    labels = y[y.columns[0]] if isinstance(y, pd.DataFrame) else y

    print(f"Removing class: {chosen_class_index}")

    # True for the rows that survive the removal.
    keep_rows = labels != chosen_class_index
    X_kept = X.loc[keep_rows]
    y_kept = y.loc[keep_rows]
    return X_kept, y_kept


In [25]:
# removes SAMPLES (not whole classes)
def remove_few_samples_of_class(X, y, class_index, step):
    """Drop the first ``step`` samples labelled ``class_index``.

    Rows are located through the flattened values of ``y`` and removed from
    both ``X`` and ``y`` by index label. If the class has fewer than ``step``
    rows, all of them are removed. Returns the trimmed ``(X, y)`` pair.
    """
    is_target = y.values.flatten() == class_index
    # Index labels of the first `step` rows that belong to the class.
    doomed = y[is_target].index[:step]
    return X.drop(doomed), y.drop(doomed)

In [26]:
def remove_samples_from_all_classes(X, y, step):
    """Drop up to ``step`` samples from every class found in ``y``.

    The class list is taken once, from the index of
    ``get_sorted_class_percentages_label_encoded(y)``, before any rows are
    removed. Returns the trimmed ``(X, y)`` pair.
    """
    class_labels = get_sorted_class_percentages_label_encoded(y).index
    X_out, y_out = X, y
    for label in class_labels:
        X_out, y_out = remove_few_samples_of_class(X_out, y_out, label, step=step)
    return X_out, y_out

# Dataset trimming

In [12]:
# Keep only the first half of the rows (by position, not a random sample)
# and renumber them from 0.
half_index = len(X) // 2
X, y = (frame.head(half_index).reset_index(drop=True) for frame in (X, y))

In [23]:
# Labels of the top 80% of classes (count rounded up), taken from the head of
# class_percentages_sorted.
some_classes = get_most_represented_class_names(class_percentages_sorted, 0.8)
some_classes

[1, 0]

In [None]:
# remove 50% of least represented classes
# Uses class_percentages_sorted as last computed by an earlier cell, so
# recompute it first if X/y have changed since.
least_represented_classes = get_least_represented_class_names(class_percentages_sorted, 0.5)
print(f"Least represented classes: {least_represented_classes}")
# Drop the selected classes one at a time from X and y
for chosen_class in least_represented_classes:
    X, y = remove_class(X, y, chosen_class)

In [None]:
# remove top class: the first entry of the freshly computed class percentages
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[0]

# Drop all samples of that class, then print and store the new summary
X,y = remove_class(X, y, chosen_class)
class_percentages_sorted = show_summary(X, y)

In [None]:
# remove bottom class: the last entry of the freshly computed class percentages
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[-1]

# Drop all samples of that class, then print and store the new summary
X,y = remove_class(X, y, chosen_class)
class_percentages_sorted = show_summary(X, y)

Removing class: 1148
Number of samples: 42842
Number of classes: 20
Times higher: 1.63
CoV: 0.165
1362    6.836749
496     6.582326
532     5.998786
675     5.875076
147     5.601979
1086    5.520284
850     5.366229
986     5.298539
74      4.649643
488     4.644975
425     4.542272
1370    4.481583
611     4.437234
608     4.404556
1410    4.355539
477     4.322861
1079    4.313524
1310    4.306522
0       4.269175
499     4.192148
Name: proportion, dtype: float64


In [910]:
# remove up to 15 samples from the class at position 1 of the sorted
# percentages, i.e. the SECOND entry, not the top one (index[0])

class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[1]
X,y = remove_few_samples_of_class(X, y, chosen_class, step=15)

class_percentages_sorted = show_summary(X, y)

Number of samples: 14662
Number of classes: 20
Times higher: 40.0
CoV: 0.47
1362    13.640704
675      5.081162
608      5.081162
425      5.074342
147      5.067521
1086     5.067521
532      5.033420
74       5.012959
611      5.006138
1410     5.006138
1370     4.999318
496      4.999318
488      4.999318
986      4.999318
850      4.992498
477      4.978857
0        4.412768
1079     3.860319
1310     2.346201
499      0.341018
Name: proportion, dtype: float64


In [285]:
# remove up to 30 samples from the class at position -2 of the sorted
# percentages, i.e. the SECOND-TO-LAST entry, not the bottom one (index[-1])
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[-2]

X,y = remove_few_samples_of_class(X, y, chosen_class, step=30)
class_percentages_sorted = show_summary(X, y)

Number of samples: 35246
Number of classes: 20
Times higher: 40.65
CoV: 0.489
1362    8.304488
496     7.995234
532     7.285933
675     7.135561
147     6.803609
1086    6.704307
850     6.517052
986     6.434773
74      5.646031
488     5.640356
425     5.515519
1370    5.441752
611     5.387845
608     5.348125
1410    2.791806
477     2.071157
0       1.835669
1079    1.832832
1310    1.103671
499     0.204278
Name: proportion, dtype: float64


In [259]:
# Remove one sample from every class, then print and store the new summary
X,y = remove_samples_from_all_classes(X, y, step=1)
class_percentages_sorted = show_summary(X, y)

Number of samples: 35500
Number of classes: 20
Times higher: 40.65
CoV: 0.474
1362    8.245070
496     7.938028
532     7.233803
675     7.084507
147     6.754930
1086    6.656338
850     6.470423
986     6.388732
74      5.605634
488     5.600000
425     5.476056
1370    5.402817
611     5.349296
608     5.309859
1410    2.771831
477     2.056338
0       1.822535
1079    1.819718
1310    1.811268
499     0.202817
Name: proportion, dtype: float64


# Trimmed dataset - info

In [911]:
# Print the summary of the trimmed dataset and keep its class percentages
# for the save cells below.
class_percentages_sorted = show_summary(X, y)

Number of samples: 14662
Number of classes: 20
Times higher: 40.0
CoV: 0.47
1362    13.640704
675      5.081162
608      5.081162
425      5.074342
147      5.067521
1086     5.067521
532      5.033420
74       5.012959
611      5.006138
1410     5.006138
1370     4.999318
496      4.999318
488      4.999318
986      4.999318
850      4.992498
477      4.978857
0        4.412768
1079     3.860319
1310     2.346201
499      0.341018
Name: proportion, dtype: float64


# Save

### Basic

In [1847]:
# Write the current X and y to DATASET_DIR. This is the directory they were
# loaded from, so the original X.csv / y.csv are overwritten.
# Create the directory first, as the Balanced and Imbalanced save cells do.
DATASET_DIR.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR / "X.csv", index=False)
y.to_csv(DATASET_DIR / "y.csv", index=False)

In [1848]:
# Store the class percentages next to the data, as label,percentage rows without a header line
class_percentages_sorted.to_csv(DATASET_DIR / "class_percentages.csv", header=False)

### Balanced

In [448]:
# Write the current X and y as the balanced variant, creating its directory if needed.
# The row index is not written (index=False).
DATASET_DIR_BALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_BALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_BALANCED / "y.csv", index=False)

In [449]:
# Store the class percentages next to the balanced data, without a header line
class_percentages_sorted.to_csv(DATASET_DIR_BALANCED / "class_percentages.csv", header=False)

### Imbalanced

In [912]:
# Write the current X and y as the imbalanced variant, creating its directory if needed.
# The row index is not written (index=False).
DATASET_DIR_IMBALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_IMBALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_IMBALANCED / "y.csv", index=False)

In [913]:
# Store the class percentages next to the imbalanced data, without a header line
class_percentages_sorted.to_csv(DATASET_DIR_IMBALANCED / "class_percentages.csv", header=False)