# Setup

In [1697]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np

In [1698]:
# Add the directory three levels above the current working directory to the
# import path so that the `src.*` imports in the following cell can be resolved.
sys.path.append(os.path.abspath("../../.."))

In [1699]:
from src.experiment.helpers.variables import dataset_root_dir
from src.experiment.helpers.utils import get_sorted_class_percentages_label_encoded

In [1700]:
# Dataset folder name and task type; together they select the sub-directory of
# dataset_root_dir that this notebook reads from and writes to.
# NOTE(review): NAME already ends in "_imbalanced", so the derived directories
# built from it become "..._imbalanced_balanced" and "..._imbalanced_imbalanced"
# - confirm this is intended.
NAME = "news-category_imbalanced"
CLASSIFICATION_TYPE = "multiclass"

In [1701]:
# Directory of the dataset selected by CLASSIFICATION_TYPE and NAME
DATASET_DIR = dataset_root_dir / CLASSIFICATION_TYPE / NAME
print(DATASET_DIR)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\news-category_imbalanced


In [1702]:
# Directory of the balanced variant: same location, NAME with "_balanced" appended
balanced_name = NAME + "_balanced"
DATASET_DIR_BALANCED = dataset_root_dir / CLASSIFICATION_TYPE / balanced_name
print(DATASET_DIR_BALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\news-category_imbalanced_balanced


In [1703]:
# Directory of the imbalanced variant: same location, NAME with "_imbalanced" appended
imbalanced_name = NAME + "_imbalanced"
DATASET_DIR_IMBALANCED = dataset_root_dir / CLASSIFICATION_TYPE / imbalanced_name
print(DATASET_DIR_IMBALANCED)

c:\VisualStudioRepositories\MUSIC_DATA\datasets\multiclass\news-category_imbalanced_imbalanced


# Dataset loading

In [1704]:
# Load the feature matrix (X) and the labels (y) from the dataset directory
X = pd.read_csv(DATASET_DIR / 'X.csv')
y = pd.read_csv(DATASET_DIR / 'y.csv')

In [1705]:
# Preview the feature matrix
X

Unnamed: 0,00,000,07,08,10,100,1000,100th,101,102,...,caesar,cafe,cafes,cafeteria,caffeine,café,cage,cain,cairo,caitlyn
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61675,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
61676,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
61677,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
61678,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [1706]:
# Preview the labels
y

Unnamed: 0,category
0,0
1,0
2,0
3,0
4,0
...,...
61675,5
61676,6
61677,6
61678,6


# Cut classes to match balanced dataset

In [100]:
# Load the labels of the balanced variant; the classes found in it decide
# which classes are kept in the filtering cell that follows.
y_balanced = pd.read_csv(DATASET_DIR_BALANCED / 'y.csv')
y_balanced

Unnamed: 0,class
0,0
1,0
2,0
3,0
4,0
...,...
475,3
476,3
477,3
478,3


In [101]:
# Classes that are present in the balanced variant of the dataset
classes_in_balanced = set(y_balanced['class'].unique())
print(f"Classes in balanced dataset: {classes_in_balanced}")

# Keep only the samples whose label is one of those classes. The label column
# is taken by position (as remove_class does) because its name differs between
# datasets: the y loaded in this notebook has a 'category' column, so the
# hard-coded y['class'] lookup raises KeyError.
mask = y.iloc[:, 0].isin(classes_in_balanced)

# Filter X and y with the same mask and renumber the rows from 0
X = X.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)

Classes in balanced dataset: {np.int64(0), np.int64(1), np.int64(2), np.int64(3)}


# Reduce class indexes by 1

In [11]:
# Report whether any entry of y is equal to 0
print("Label '0' is present in y" if (y == 0).any().any() else "Label '0' is not present in y")

Label '0' is not present in y


In [13]:
# Reduce every label by 1 so that labels which start at 1 become 0-based.
# The label column is taken by position because its name varies between
# datasets (the y loaded in this notebook has no 'Class' column).
# NOTE: not idempotent - re-running this cell shifts the labels again.
y[y.columns[0]] = y[y.columns[0]] - 1

# Remove gaps in labels

In [57]:
# Remove gaps in the labels, e.g. labels {0, 2, 5} become {0, 1, 2}.
# sort=True numbers the codes by sorted label value. The default numbers them
# by order of first appearance, which would permute the class ids whenever the
# rows are not already ordered by label.
y = y.apply(lambda column: pd.factorize(column, sort=True)[0])

# Dataset info

In [228]:
# Row count of the feature matrix (one row per sample)
X.shape[0]

209527

In [229]:
# How many distinct label values occur in y
np.unique(y.values).size

42

In [368]:
# Count how many samples carry each label and print the counts
# (no type conversion happens here).
class_counts = y.value_counts()
print(class_counts)

category
14          6465
16          6464
7           6462
2           6461
23          6460
8           6422
35          5814
19          5467
17          5460
15          5112
1           4520
6           4197
22          3703
20          3440
30          3075
36          2784
28          2773
21          2692
12          2682
27          2604
41          2546
3           2419
18          2064
9           1897
33          1742
38          1699
25          1697
32          1374
13          1326
5           1224
34          1216
24           876
40           629
10           564
39           521
37           518
0            497
31           459
29           264
26           250
4            194
11            74
Name: count, dtype: int64


In [None]:
# Per-class share of the dataset as computed by the project helper
# (presumably percentages sorted largest first, as in the printed summaries
# elsewhere in this notebook - verify against the helper).
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
print(class_percentages_sorted)

# Functions

In [231]:
def get_least_represented_class_names(class_percentages_sorted, percent):
    """Return the labels of the least represented classes.

    Args:
        class_percentages_sorted: pandas Series indexed by class label. The
            last rows are taken as the least represented classes (assumes the
            Series is sorted largest first - TODO confirm against the helper
            that produces it).
        percent: Fraction of the classes to select, e.g. ``0.2`` for the
            bottom 20 %.

    Returns:
        list: Index labels of the last ``ceil(len(series) * percent)`` rows,
        in the order they have in the Series.
    """
    num_classes = len(class_percentages_sorted)
    # Round before ceil so float noise (100 * 0.07 == 7.000000000000001) does
    # not push the count up by one.
    raw_count = int(np.ceil(round(num_classes * percent, 9)))
    # Clamp to [0, num_classes]: a negative count would make ``tail`` return
    # everything except the first rows instead of nothing.
    bottom_count = min(max(raw_count, 0), num_classes)
    return class_percentages_sorted.tail(bottom_count).index.tolist()

In [232]:
def get_most_represented_class_names(class_percentages_sorted, percent):
    """Return the labels of the most represented classes.

    Args:
        class_percentages_sorted: pandas Series indexed by class label. The
            first rows are taken as the most represented classes (assumes the
            Series is sorted largest first - TODO confirm against the helper
            that produces it).
        percent: Fraction of the classes to select, e.g. ``0.8`` for the
            top 80 %.

    Returns:
        list: Index labels of the first ``ceil(len(series) * percent)`` rows,
        in the order they have in the Series.
    """
    num_classes = len(class_percentages_sorted)
    # Round before ceil so float noise (100 * 0.07 == 7.000000000000001) does
    # not push the count up by one.
    raw_count = int(np.ceil(round(num_classes * percent, 9)))
    # Clamp to [0, num_classes]: a negative count would make ``head`` return
    # everything except the last rows instead of nothing.
    top_count = min(max(raw_count, 0), num_classes)
    return class_percentages_sorted.head(top_count).index.tolist()

In [233]:
def show_summary(X, y):
    """Print size and class-balance statistics and return the class shares.

    Prints the number of rows in ``X``, the number of distinct values in
    ``y``, the ratio between the first and last entry of the sorted class
    percentages ("Times higher"), their coefficient of variation and finally
    the percentages themselves.

    Args:
        X: Feature table; only its length is used.
        y: Label table passed to ``get_sorted_class_percentages_label_encoded``.

    Returns:
        The object returned by ``get_sorted_class_percentages_label_encoded(y)``
        (presumably a Series of class percentages sorted largest first -
        verify against the helper).
    """
    print("Number of samples:", len(X))
    print("Number of classes:", len(np.unique(y.values)))

    shares = get_sorted_class_percentages_label_encoded(y)
    top_share, bottom_share = shares.iloc[0], shares.iloc[-1]

    # The ratio is undefined when the last share is exactly zero.
    if bottom_share == 0:
        print("The lowest class percentage is 0, cannot calculate ratio.")
    else:
        print(f"Times higher: {round(top_share / bottom_share, 2)}")

    # Coefficient of variation: spread of the shares relative to their mean.
    cov = shares.std() / shares.mean()
    print(f"CoV: {round(cov, 3)}")

    print(shares)

    return shares

In [234]:
def remove_class(X, y, chosen_class_index):
    """Drop every sample that belongs to one class.

    Args:
        X: Feature table, indexed like ``y``.
        y: Labels, either a Series or a DataFrame whose first column holds
            the labels.
        chosen_class_index: Label value of the class to remove.

    Returns:
        tuple: ``(X, y)`` restricted to the rows whose label differs from
        ``chosen_class_index``; the original index values are kept.
    """
    # A DataFrame is reduced to its first column so the mask is one-dimensional.
    labels = y[y.columns[0]] if isinstance(y, pd.DataFrame) else y

    print(f"Removing class: {chosen_class_index}")

    keep = labels != chosen_class_index
    return X.loc[keep], y.loc[keep]


In [235]:
# removes SAMPLES
def remove_few_samples_of_class(X, y, class_index, step):
    """Drop the first ``step`` samples of one class from ``X`` and ``y``.

    Args:
        X: Feature table, indexed like ``y``.
        y: Label table; its flattened values are compared with ``class_index``.
        class_index: Label value of the class to thin out.
        step: Maximum number of samples to remove; fewer are removed when the
            class has fewer rows.

    Returns:
        tuple: New ``(X, y)`` without the selected rows. The inputs are not
        modified.
    """
    belongs_to_class = y.values.flatten() == class_index
    # Rows are taken in their existing order, so the earliest ones go first.
    doomed = y[belongs_to_class].index[:step]
    return X.drop(doomed), y.drop(doomed)

In [236]:
def remove_samples_from_all_classes(X, y, step):
    """Remove up to ``step`` samples from every class.

    The class labels are read once from the index of
    ``get_sorted_class_percentages_label_encoded(y)``; each class is then
    thinned with ``remove_few_samples_of_class``.

    Args:
        X: Feature table, indexed like ``y``.
        y: Label table.
        step: Number of samples to remove per class.

    Returns:
        tuple: The trimmed ``(X, y)``.
    """
    class_labels = get_sorted_class_percentages_label_encoded(y).index
    for label in class_labels:
        X, y = remove_few_samples_of_class(X, y, label, step=step)
    return X, y

# Dataset trimming

In [23]:
# Labels of the first 80 % of the entries in class_percentages_sorted
# (count rounded up).
some_classes = get_most_represented_class_names(class_percentages_sorted, 0.8)
some_classes

[1, 0]

In [None]:
# Remove the class at position 0 of the sorted class percentages (the top
# class) and print the new summary. X and y are overwritten, so every re-run
# of this cell removes one more class.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[0]

X,y = remove_class(X, y, chosen_class)
class_percentages_sorted = show_summary(X, y)

In [166]:
# Remove the class at the last position of the sorted class percentages (the
# bottom class) and print the new summary. X and y are overwritten, so every
# re-run of this cell removes one more class.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[-1]

X,y = remove_class(X, y, chosen_class)
class_percentages_sorted = show_summary(X, y)

Removing class: 7
Number of samples: 175000
Number of classes: 7
Times higher: 1.0
CoV: 0.0
0    14.285714
1    14.285714
2    14.285714
3    14.285714
4    14.285714
5    14.285714
6    14.285714
Name: proportion, dtype: float64


In [1845]:
# Remove up to 200 samples from the class at position 1 of the sorted class
# percentages. NOTE(review): index[1] is the second entry, not the top class
# (index[0]) - confirm which one is meant.

class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[1]
X,y = remove_few_samples_of_class(X, y, chosen_class, step=200)

class_percentages_sorted = show_summary(X, y)

Number of samples: 26980
Number of classes: 42
Times higher: 69.9
CoV: 1.151
16    15.285397
35     5.240919
15     5.233506
7      5.196442
2      5.192735
19     5.066716
17     5.040771
23     5.040771
14     4.903632
8      4.899926
1      4.521868
6      3.695330
30     2.057079
28     1.975537
12     1.971831
36     1.942179
21     1.934766
27     1.868050
22     1.864344
41     1.838399
3      1.812454
18     1.793921
20     1.630838
33     0.822832
9      0.730170
38     0.663454
24     0.652335
13     0.652335
5      0.644922
32     0.644922
34     0.615271
40     0.607858
10     0.589325
37     0.585619
25     0.581913
39     0.467013
0      0.452187
29     0.329874
11     0.274277
31     0.237213
26     0.222387
4      0.218681
Name: proportion, dtype: float64


In [1695]:
# Remove up to 100 samples from the class at position -41 of the sorted class
# percentages. NOTE(review): with the 42 classes reported in this cell's
# output, index[-41] is position 1 from the top, not the bottom class
# (index[-1]) - confirm which one is meant.
class_percentages_sorted = get_sorted_class_percentages_label_encoded(y)
chosen_class = class_percentages_sorted.index[-41]

X,y = remove_few_samples_of_class(X, y, chosen_class, step=100)
class_percentages_sorted = show_summary(X, y)

Number of samples: 27191
Number of classes: 42
Times higher: 69.05
CoV: 1.484
16    14.982899
14     8.285830
17     8.274797
7      8.208598
2      8.204921
8      8.171822
35     8.142400
23     8.127689
19     5.395168
30     2.041117
28     1.960207
36     1.596116
21     1.588761
12     1.515207
15     1.441653
27     1.412232
1      1.360744
22     0.967232
41     0.831157
3      0.639918
18     0.511199
20     0.459711
33     0.411901
6      0.375124
38     0.364091
9      0.356736
5      0.345703
32     0.345703
13     0.316281
34     0.316281
24     0.316281
40     0.272149
37     0.268471
0      0.264793
39     0.261116
10     0.253760
29     0.253760
25     0.246405
11     0.239050
31     0.235372
26     0.220661
4      0.216984
Name: proportion, dtype: float64


In [323]:
# Remove up to 500 samples from every class, then print the new summary
X,y = remove_samples_from_all_classes(X, y, step=500)
class_percentages_sorted = show_summary(X, y)

Number of samples: 160529
Number of classes: 42
Times higher: 236.69
CoV: 1.264
8     14.154452
14    10.630478
7     10.267304
23     5.618922
16     5.565350
2      4.928082
35     3.621776
19     3.405615
17     3.401255
15     3.184471
1      2.815691
6      2.614481
22     2.306748
20     2.142915
30     1.915542
36     1.734266
28     1.727414
21     1.676956
12     1.670726
27     1.622137
41     1.586006
3      1.506893
18     1.285749
9      1.181718
33     1.085162
38     1.058376
25     1.057130
32     0.855920
13     0.826019
5      0.762479
34     0.757496
24     0.545696
40     0.391830
10     0.351338
39     0.324552
37     0.322683
0      0.309601
31     0.285930
29     0.164456
26     0.155735
4      0.120850
11     0.059802
Name: proportion, dtype: float64


# Trimmed dataset - info

In [1846]:
# Summary of the dataset after trimming; the returned percentages are the ones
# written to class_percentages.csv in the save cells.
class_percentages_sorted = show_summary(X, y)

Number of samples: 26980
Number of classes: 42
Times higher: 69.9
CoV: 1.151
16    15.285397
35     5.240919
15     5.233506
7      5.196442
2      5.192735
19     5.066716
17     5.040771
23     5.040771
14     4.903632
8      4.899926
1      4.521868
6      3.695330
30     2.057079
28     1.975537
12     1.971831
36     1.942179
21     1.934766
27     1.868050
22     1.864344
41     1.838399
3      1.812454
18     1.793921
20     1.630838
33     0.822832
9      0.730170
38     0.663454
24     0.652335
13     0.652335
5      0.644922
32     0.644922
34     0.615271
40     0.607858
10     0.589325
37     0.585619
25     0.581913
39     0.467013
0      0.452187
29     0.329874
11     0.274277
31     0.237213
26     0.222387
4      0.218681
Name: proportion, dtype: float64


# Save

### Basic

In [1847]:
# Write the current X / y to DATASET_DIR without the row index.
# NOTE(review): these are the same X.csv / y.csv paths the notebook loaded
# from, so the files that were read are overwritten.
X.to_csv(DATASET_DIR / "X.csv", index=False)
y.to_csv(DATASET_DIR / "y.csv", index=False)

In [1848]:
# Store the class percentages next to the data, without a header row
class_percentages_sorted.to_csv(DATASET_DIR / "class_percentages.csv", header=False)

### Balanced

In [216]:
# Create the balanced-dataset directory if it does not exist yet and write the
# current X / y there without the row index.
DATASET_DIR_BALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_BALANCED / "X.csv", index=False)
y.to_csv(DATASET_DIR_BALANCED / "y.csv", index=False)

In [217]:
# Store the class percentages in the balanced-dataset directory, without a header row
class_percentages_sorted.to_csv(DATASET_DIR_BALANCED / "class_percentages.csv", header=False)

### Imbalanced

In [1696]:
# Create the imbalanced-dataset directory if it does not exist yet and write
# the current X / y there without the row index.
# NOTE(review): the files are named X_2.csv / y_2.csv here, unlike the
# X.csv / y.csv used by the other save cells - confirm this is intended.
DATASET_DIR_IMBALANCED.mkdir(parents=True, exist_ok=True)
X.to_csv(DATASET_DIR_IMBALANCED / "X_2.csv", index=False)
y.to_csv(DATASET_DIR_IMBALANCED / "y_2.csv", index=False)

In [1170]:
# Store the class percentages in the imbalanced-dataset directory, without a header row
class_percentages_sorted.to_csv(DATASET_DIR_IMBALANCED / "class_percentages.csv", header=False)