# Setup

In [340]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np

In [341]:
sys.path.append(os.path.abspath("../../.."))

In [342]:
from src.experiment.helpers.variables import dataset_root_dir
from src.experiment.helpers.utils import get_sorted_class_percentages_label_encoded

In [343]:
NAME = "irish-times_imbalanced"
CLASSIFICATION_TYPE = "multiclass"

In [344]:
DATASET_DIR = dataset_root_dir / CLASSIFICATION_TYPE / NAME
print(DATASET_DIR)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\irish-times_imbalanced


In [345]:
# Directory of the balanced variant of this dataset.
# NAME already ends in "_imbalanced", so this resolves to ".../irish-times_imbalanced_balanced".
balanced_name = NAME + "_balanced"
DATASET_DIR_BALANCED = dataset_root_dir / CLASSIFICATION_TYPE / balanced_name
print(DATASET_DIR_BALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\irish-times_imbalanced_balanced


In [346]:
# Output directory for the (further) imbalanced variant written by the "Imbalanced" save cells.
# NOTE(review): NAME already ends in "_imbalanced", so this resolves to
# ".../irish-times_imbalanced_imbalanced" - confirm the doubled suffix is intended.
imbalanced_name = NAME + "_imbalanced"
DATASET_DIR_IMBALANCED = dataset_root_dir / CLASSIFICATION_TYPE / imbalanced_name
print(DATASET_DIR_IMBALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\irish-times_imbalanced_imbalanced


# Dataset loading

In [347]:
# Load the feature matrix (X) and the labels (y) of the dataset selected by NAME.
# NOTE: the "Basic" save cell writes X.csv / y.csv back into this same DATASET_DIR.
X = pd.read_csv(DATASET_DIR / 'X.csv')
y = pd.read_csv(DATASET_DIR / 'y.csv')

In [348]:
X

Unnamed: 0,000,19,2016,2017,2019,2020,20s,3arena,5g,5m,...,yahoo,year,you,your,zealand,zebo,zebre,zone,zuckerberg,éireann
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48115,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48116,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48117,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48118,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [349]:
y

Unnamed: 0,headline_category
0,1
1,1
2,1
3,1
4,1
...,...
48115,19
48116,1
48117,18
48118,6


# Cut classes to match balanced dataset

In [None]:
# Load the labels of the balanced variant; the next cell only uses the set of classes it contains.
# NOTE(review): the file name 'y_2.csv' differs from the 'y.csv' written by the save cells - confirm it exists.
y_balanced = pd.read_csv(DATASET_DIR_BALANCED / 'y_2.csv')
y_balanced

Unnamed: 0,stock
0,0
1,0
2,0
3,0
4,0
...,...
14695,1410
14696,1410
14697,1410
14698,1410


In [None]:
# Keep only the samples whose class also occurs in y_balanced.
# The label column is looked up by position (first column) - the same convention
# remove_class() uses - instead of the hard-coded 'stock' name: the y loaded for
# this dataset exposes 'headline_category', so y['stock'] raises KeyError.
balanced_label_col = y_balanced.columns[0]
label_col = y.columns[0]

# Get the set of classes present in y_balanced
classes_in_balanced = set(y_balanced[balanced_label_col].unique())
print(f"Classes in balanced dataset: {classes_in_balanced}")

# Create mask for samples in y that are present in y_balanced
mask = y[label_col].isin(classes_in_balanced)

# Filter X and y with the same mask and renumber the rows from 0
X = X.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)

Classes in balanced dataset: {np.int64(0), np.int64(1410), np.int64(147), np.int64(532), np.int64(1310), np.int64(675), np.int64(425), np.int64(1079), np.int64(1086), np.int64(74), np.int64(850), np.int64(1362), np.int64(1370), np.int64(986), np.int64(477), np.int64(608), np.int64(611), np.int64(488), np.int64(496), np.int64(499)}


# Reduce class indexes by 1

In [None]:
# Report whether the label 0 occurs anywhere in y (decides if labels are already 0-based)
print("Label '0' is present in y" if (y == 0).any().any() else "Label '0' is not present in y")

Label '0' is not present in y


In [None]:
# Reduce y by 1 so that the labels start at 0.
# The label column is taken by position (first column) rather than the hard-coded
# 'Class' name, which does not exist in this dataset's y ('headline_category').
label_col = y.columns[0]
# Only shift while no 0 label exists, so re-running this cell cannot decrement twice.
if not (y[label_col] == 0).any():
    y[label_col] = y[label_col] - 1

# Remove gaps in labels

In [None]:
# Remove gaps in labels: remap the values of each column to consecutive integers 0..k-1,
# e.g. if there is a label "0" and then a label "2", the 2 becomes a 1.
# sort=True makes the new codes follow the ascending order of the original labels;
# without it pd.factorize numbers values by first appearance, which would reorder
# the classes whenever the rows are not already sorted by label.
y = y.apply(lambda x: pd.factorize(x, sort=True)[0])

# Dataset info

In [15]:
# number of samples
len(X)

1314054

In [16]:
# number of classes (number of unique numbers in y)
len(np.unique(y.values))

20

In [635]:
# Count the number of samples per class (no type conversion happens here)
class_counts = y.value_counts()
print(class_counts)

headline_category
10                   5583
9                    1430
2                    1426
19                   1421
13                   1381
3                    1356
6                    1338
18                    445
0                     418
7                     415
14                    404
1                     379
16                    376
15                    374
5                     362
11                    338
8                     315
4                     215
17                    193
12                    141
Name: count, dtype: int64


In [None]:
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
print(class_percentages_sorted)

# Functions

In [19]:
def get_least_represented_class_names(class_percentages_sorted, percent):
    """Return the labels found at the tail of the sorted class percentages.

    Args:
        class_percentages_sorted: pandas object indexed by class label; callers
            in this notebook pass percentages ordered from most to least
            represented, so the tail holds the rarest classes.
        percent: fraction of all classes to return (e.g. 0.5 for half of
            them); the resulting count is rounded up.

    Returns:
        list of index labels of the last ``ceil(n_classes * percent)`` entries.
    """
    how_many = int(np.ceil(percent * len(class_percentages_sorted)))
    trailing = class_percentages_sorted.tail(how_many)
    return trailing.index.tolist()

In [20]:
def get_most_represented_class_names(class_percentages_sorted, percent):
    """Return the labels found at the head of the sorted class percentages.

    Args:
        class_percentages_sorted: pandas object indexed by class label; callers
            in this notebook pass percentages ordered from most to least
            represented, so the head holds the most frequent classes.
        percent: fraction of all classes to return (e.g. 0.5 for half of
            them); the resulting count is rounded up.

    Returns:
        list of index labels of the first ``ceil(n_classes * percent)`` entries.
    """
    how_many = int(np.ceil(percent * len(class_percentages_sorted)))
    leading = class_percentages_sorted.head(how_many)
    return leading.index.tolist()

In [21]:
def show_summary(X, y):
    """Print a short imbalance report for the dataset and return the class percentages.

    Printed, in order: the number of samples, the number of distinct label
    values, the ratio between the first and the last class percentage
    ("Times higher"), the coefficient of variation of the percentages (std /
    mean) and finally the percentages themselves.

    Args:
        X: feature table; only its length is used.
        y: labels; passed to get_sorted_class_percentages_label_encoded.

    Returns:
        The object returned by get_sorted_class_percentages_label_encoded(y)
        (presumably a Series ordered from most to least represented class -
        confirm against the helper).
    """
    print("Number of samples:", len(X))
    print("Number of classes:", len(np.unique(y.values)))

    percentages = get_sorted_class_percentages_label_encoded(y)

    top_share = percentages.iloc[0]
    bottom_share = percentages.iloc[-1]

    # A zero share at the bottom would make the ratio a division by zero.
    if bottom_share == 0:
        print("The lowest class percentage is 0, cannot calculate ratio.")
    else:
        print(f"Times higher: {round(top_share / bottom_share, 2)}")

    # Coefficient of variation: spread of the class shares relative to their mean.
    cov = percentages.std() / percentages.mean()
    print(f"CoV: {round(cov, 3)}")

    print(percentages)

    return percentages

In [22]:
def remove_class(X, y, chosen_class_index):
    """Drop every sample whose label equals ``chosen_class_index``.

    Args:
        X: feature table, row-aligned with ``y``.
        y: labels, either a Series or a DataFrame; for a DataFrame the first
            column is treated as the label column.
        chosen_class_index: label value of the class to remove.

    Returns:
        Tuple ``(X_kept, y_kept)`` with the remaining rows; the original index
        labels are preserved (no reset).
    """
    # Reduce y to a 1-D label series so the comparison yields a row mask.
    labels = y[y.columns[0]] if isinstance(y, pd.DataFrame) else y

    print(f"Removing class: {chosen_class_index}")

    keep_rows = labels != chosen_class_index
    return X.loc[keep_rows], y.loc[keep_rows]


In [23]:
# removes SAMPLES
def remove_few_samples_of_class(X, y, class_index, step):
    """Remove up to ``step`` SAMPLES of one class (the class itself is kept if rows remain).

    Args:
        X: feature table, sharing its index with ``y``.
        y: labels; its values are flattened and compared with ``class_index``.
        class_index: label value of the class to thin out.
        step: maximum number of samples to drop; the first matching rows in
            row order are taken.

    Returns:
        Tuple ``(X_trimmed, y_trimmed)`` without the dropped rows; the inputs
        are not modified.
    """
    belongs_to_class = y.values.flatten() == class_index
    doomed = y.index[belongs_to_class][:step]
    return X.drop(doomed), y.drop(doomed)

In [24]:
def remove_samples_from_all_classes(X, y, step):
    """Remove up to ``step`` samples from every class present in ``y``.

    The classes are taken from the index of
    get_sorted_class_percentages_label_encoded(y), computed once before any
    row is dropped. Each class loses at most ``step`` rows, so a class with
    ``step`` or fewer samples disappears completely.

    Args:
        X: feature table, sharing its index with ``y``.
        y: labels.
        step: maximum number of samples to drop per class.

    Returns:
        Tuple ``(X, y)`` after all removals.
    """
    for label in get_sorted_class_percentages_label_encoded(y).index:
        X, y = remove_few_samples_of_class(X, y, label, step=step)
    return X, y

# Dataset trimming

In [None]:
# Keep only the first half of the rows (by position, not a random sample) and renumber the index
half_index = len(X) // 2
X = X.iloc[:half_index].reset_index(drop=True)
y = y.iloc[:half_index].reset_index(drop=True)

In [None]:
some_classes = get_most_represented_class_names(class_percentages_sorted, 0.8)
some_classes

[1, 0]

In [None]:
# remove 50% of least represented classes
# Recompute the percentages from the current y first (as the cells below do), so the
# selection does not rely on a class_percentages_sorted left over from before trimming.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
least_represented_classes = get_least_represented_class_names(class_percentages_sorted, 0.5)
print(f"Least represented classes: {least_represented_classes}")
for chosen_class in least_represented_classes:
    X, y = remove_class(X, y, chosen_class)

In [None]:
# remove top class: index[0] of the freshly computed sorted percentages is dropped entirely,
# then the summary is printed and its result becomes the new class_percentages_sorted
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[0]

X,y = remove_class(X, y, chosen_class)
class_percentages_sorted = show_summary(X, y)

In [None]:
# remove bottom class: index[-1] of the freshly computed sorted percentages is dropped entirely,
# then the summary is printed and its result becomes the new class_percentages_sorted
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[-1]

X,y = remove_class(X, y, chosen_class)
class_percentages_sorted = show_summary(X, y)

In [660]:
# remove up to 10 samples from the class at position 1 of the sorted percentages
# (i.e. the second entry; index[0] would be the top class)

class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[1]
X,y = remove_few_samples_of_class(X, y, chosen_class, step=10)

class_percentages_sorted = show_summary(X, y)

Number of samples: 15900
Number of classes: 20
Times higher: 39.6
CoV: 1.471
10    35.113208
2      6.452830
19     6.421384
9      6.415094
13     6.169811
3      6.012579
6      5.899371
18     2.798742
0      2.628931
7      2.610063
14     2.540881
1      2.383648
16     2.364780
15     2.352201
5      2.276730
11     2.125786
8      1.981132
4      1.352201
17     1.213836
12     0.886792
Name: proportion, dtype: float64


In [465]:
# remove up to 100 samples from the class at position -7 of the sorted percentages
# (i.e. the 7th entry counted from the end; index[-1] would be the bottom class)
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[-7]

X,y = remove_few_samples_of_class(X, y, chosen_class, step=100)
class_percentages_sorted = show_summary(X, y)

Number of samples: 43310
Number of classes: 20
Times higher: 39.77
CoV: 0.805
10    14.784115
1     10.387901
13     9.699838
19     9.099515
2      7.725698
16     7.379358
9      7.273147
6      6.829831
5      4.345417
3      4.100670
18     4.075271
0      4.012930
7      3.775110
15     2.064188
14     1.440776
11     0.826599
8      0.773493
4      0.542600
17     0.491803
12     0.371739
Name: proportion, dtype: float64


In [581]:
X,y = remove_samples_from_all_classes(X, y, step=10)
class_percentages_sorted = show_summary(X, y)

Number of samples: 24310
Number of classes: 20
Times higher: 45.27
CoV: 1.096
10    26.256684
6      7.560675
9      7.527766
3      6.400658
13     6.092143
2      5.865899
19     5.845331
18     5.532703
1      4.027149
16     4.014809
5      3.957219
15     3.595228
0      2.953517
7      2.941176
14     2.484574
11     1.390374
8      1.295763
4      0.884410
17     0.793912
12     0.580008
Name: proportion, dtype: float64


# Trimmed dataset - info

In [661]:
class_percentages_sorted = show_summary(X, y)

Number of samples: 15900
Number of classes: 20
Times higher: 39.6
CoV: 1.471
10    35.113208
2      6.452830
19     6.421384
9      6.415094
13     6.169811
3      6.012579
6      5.899371
18     2.798742
0      2.628931
7      2.610063
14     2.540881
1      2.383648
16     2.364780
15     2.352201
5      2.276730
11     2.125786
8      1.981132
4      1.352201
17     1.213836
12     0.886792
Name: proportion, dtype: float64


# Save

### Basic

In [664]:
# WARNING: this overwrites the X.csv / y.csv that the "Dataset loading" cell reads from DATASET_DIR,
# so a later re-run of the notebook starts from the already trimmed data.
X.to_csv(DATASET_DIR / "X.csv", index=False)
y.to_csv(DATASET_DIR / "y.csv", index=False)

In [665]:
class_percentages_sorted.to_csv(DATASET_DIR / "class_percentages.csv", header=False)

### Balanced

In [None]:
# DATASET_DIR_BALANCED.mkdir(parents=True, exist_ok=True)
# X.to_csv(DATASET_DIR_BALANCED / "X.csv", index=False)
# y.to_csv(DATASET_DIR_BALANCED / "y.csv", index=False)

In [None]:
# class_percentages_sorted.to_csv(DATASET_DIR_BALANCED / "class_percentages.csv", header=False)

### Imbalanced

In [662]:
DATASET_DIR_IMBALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_IMBALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_IMBALANCED / "y.csv", index=False)

In [663]:
class_percentages_sorted.to_csv(DATASET_DIR_IMBALANCED / "class_percentages.csv", header=False)