<a href="https://colab.research.google.com/github/julianthr/ML_Assignments/blob/main/Final_Project/Music/temporary_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries
!pip install scikit-learn
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

### Transform to single-label dataset

In [None]:
# Load the previously exported splits from the working directory.
# Every split consists of a feature array (X_*) and its label array (y_*).

# training split
X_train, y_train = np.load("X_train.npy"), np.load("y_train.npy")

# validation split
X_val, y_val = np.load("X_val.npy"), np.load("y_val.npy")

# test split
X_test, y_test = np.load("X_test.npy"), np.load("y_test.npy")

In [None]:
# Pool the three splits back into one dataset (order: train, validation, test)
# so the filtering below sees every sample; the split is redone afterwards.
X = np.concatenate((X_train, X_val, X_test), axis=0)
y = np.concatenate((y_train, y_val, y_test), axis=0)

In [None]:
# Shapes of the pooled data: only the leading (sample) dimension is allowed to
# change in the steps below, all trailing dimensions have to stay the same.
for pooled in (X, y):
    print(pooled.shape)

(7065, 1292, 20, 1)
(7065, 10)


In [None]:
# Build a new dataset without multilabel cases: keep only the rows whose label
# vector sums to exactly 1 (i.e. single-label samples, assuming 0/1 indicators).
single_label_idx = [idx for idx, label_row in enumerate(y) if label_row.sum() == 1]

# Collect the matching feature / label rows as Python lists.
X_new = [X[idx] for idx in single_label_idx]
y_new = [y[idx] for idx in single_label_idx]

In [None]:
# Turn the collected row lists back into NumPy arrays.
X_final, y_final = np.asarray(X_new), np.asarray(y_new)

In [None]:
# Re-establish train:validation:test in the ratio 60:20:20.
# NOTE(review): neither split passes `stratify`, so the per-class proportions may
# differ between the splits — consider stratifying given the class imbalance.

# Step 1: hold out 20% of the MFCC samples as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X_final, y_final, test_size=0.2, random_state=42
)

# Step 2: 25% of the remaining 80% (= 20% overall) becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42
)

# Drop the intermediate names so the train+validation arrays are not kept alive.
del X_trainval, y_trainval

### SMOTE


In [None]:
# Count the occurrences of each label in the single-label training set.
# Every label row is first collapsed to the index of its largest entry.
index_positions = y_train.argmax(axis=1)

# Distinct class indices together with how often each one occurs.
unique_values, counts = np.unique(index_positions, return_counts=True)

# Mapping: class index -> number of training observations.
count_dict = dict(zip(unique_values, counts))

In [None]:
# label distribution of the training set: class index -> number of observations
count_dict

{0: 161, 1: 631, 2: 588, 3: 62, 4: 211, 5: 431, 6: 147, 7: 657, 8: 358, 9: 147}

In [None]:
# Observation count of the majority class - the benchmark every SMOTE target
# count is derived from.
max_val = max(count_dict.values())

In [None]:
# Target number of observations per label after SMOTE.
# Upsampling every class all the way to the majority count invites overfitting
# (emotion 7 has roughly 10 times the observations of emotion 3), so each class
# is only lifted by a fixed share of its gap to the majority class.
# NOTE(review): the outputs stored in this notebook (e.g. 309 for class 0) match
# a share of 0.3, not 0.6 — confirm which value is intended and re-run.
SMOTE_GAP_SHARE = 0.6


def smote_target_count(current, majority, gap_share=SMOTE_GAP_SHARE):
    """Return the post-SMOTE target count for a single class.

    Args:
        current: number of observations the class has right now.
        majority: number of observations of the largest class.
        gap_share: fraction of the gap between ``current`` and ``majority``
            to close (0 leaves the count unchanged, 1 targets roughly the
            majority count).

    Returns:
        The target count, truncated towards zero to an ``int``.
    """
    # Same arithmetic order as the original in-place statements, so the float
    # truncation yields exactly the same integers.
    growth_factor = (majority / current - 1) * gap_share + 1
    return int(growth_factor * current)


# weight_dict keeps the original counts; count_dict is overwritten with the targets.
weight_dict = count_dict.copy()
for label, current in weight_dict.items():
    count_dict[label] = smote_target_count(current, max_val)

In [None]:
# the desired observation counts after SMOTE: class index -> target count
count_dict

{0: 309,
 1: 638,
 2: 608,
 3: 240,
 4: 344,
 5: 498,
 6: 299,
 7: 657,
 8: 447,
 9: 299}

In [None]:
# The key with the largest target count is the majority class; no SMOTE has to
# be conducted for it, so it is removed from the sampling strategy.
majority_label = max(count_dict, key=count_dict.get)
count_dict.pop(majority_label)
count_dict

In [None]:
# SMOTE requires a 2D array, so every sample is flattened into a single row
# (this is reversed again after resampling).
n_train = len(X_train)
X_train = X_train.reshape(n_train, -1)
X_train.shape

In [None]:
# Run SMOTE: oversample the minority classes up to the per-class target counts
# stored in count_dict (fixed seed for reproducibility).
smote = SMOTE(random_state=42, sampling_strategy=count_dict)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [None]:
# shape after SMOTE: the row count now includes the added synthetic observations
X_smote.shape

(4339, 25840)

In [None]:
# Per-class observation counts after SMOTE, to be compared against the desired
# target counts defined above.
index_positions = y_smote.argmax(axis=1)
unique_values, counts = np.unique(index_positions, return_counts=True)

# One "<class>: <count>" line per class.
for value, count in zip(unique_values, counts):
    print(f"{value}: {count}")

0: 309
1: 638
2: 608
3: 240
4: 344
5: 498
6: 299
7: 657
8: 447
9: 299


In [None]:
# shape of the resampled, still flattened feature matrix
X_smote.shape

(4339, 25840)

In [None]:
# Reshape the resampled training set back to the original per-sample layout.
# The trailing dimensions are read from the untouched validation split instead of
# being hard-coded, so the cell keeps working if the feature layout or the number
# of labels changes.
X_train = X_smote.reshape((X_smote.shape[0], *X_val.shape[1:]))
y_train = y_smote.reshape((y_smote.shape[0], *y_val.shape[1:]))

In [None]:
# Final shapes of the resampled training set (features first, then labels).
for resampled in (X_train, y_train):
    print(resampled.shape)

(4339, 1292, 20, 1)

In [None]:
# The labels stay one-hot encoded for the training.
# For reference only: the snippet below would transform them to sparse (integer)
# encoding. It is wrapped in a string literal, so none of it is executed.
"""
def reshape_y(y):
  y_reshaped = []
  for index,values in enumerate(y):
      for inner_index, inner_value in enumerate(y[index]):
          if inner_value==1:
              y_reshaped.append(inner_index)
  y_reshaped = np.array(y_reshaped).reshape((-1, 1))
  return y_reshaped

y_train_reshaped = reshape_y(y_train)
y_val_reshaped = reshape_y(y_val)
y_test_reshaped = reshape_y(y_test)
"""

In [None]:
# Persist the train, validation and test sets as .npy files.
# NOTE(review): these are the same file names the notebook loads at the top, so
# the inputs get overwritten — re-running from the start in the same working
# directory would pool the SMOTE-augmented training data; confirm this is intended.
arrays_by_file = {
    'X_train.npy': X_train,
    'X_val.npy': X_val,
    'X_test.npy': X_test,
    'y_train.npy': y_train,
    'y_val.npy': y_val,
    'y_test.npy': y_test,
}

# Written in the order listed above.
for file_name, file in arrays_by_file.items():
    np.save(file_name, file)