## Notebook for multimodal learning


### 0.0 Import packages and load data

In [1]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import ast
from joblib import dump, load

In [3]:
df_train = pd.read_csv('database/LaA_train.csv')
df_test = pd.read_csv('database/LaA_test.csv')
df_train.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,valence,arousal,lyrics,audio_url,search_method,emotion,lyrics_embedding,audio_embedding,label
0,6481,Alexz Johnson,White Lines,0.678952,-2.333604,I tried to tell you\nI've got to get away\nI t...,https://p.scdn.co/mp3-preview/eae97329ac7135a5...,artist_and_song,Relaxed,"[0.5272476077079773, 1.168580174446106, 0.1721...","[0.7431232416583579, 0.20046921661336187, 0.23...",2
1,9075,Turntablerocker,No Melody,0.373325,-0.923151,We've got the song\nBut they got no melody\nNo...,https://p.scdn.co/mp3-preview/4f397176ee912edf...,artist_and_song,Relaxed,"[-0.5302870273590088, -1.6064175367355347, 1.8...","[2.089026585978773, 1.0356926605543044, 0.7353...",2
2,9082,Lamb,Zero,-0.367547,-0.939283,LAMB: ZERO\n\nThere's no one here today\n'Caus...,https://p.scdn.co/mp3-preview/f65a72ecfbf7c304...,artist_and_song,Sad,"[0.054062437266111374, 0.38927221298217773, -0...","[0.8801736065962609, 1.9399530531306048, 2.596...",3
3,10587,Einstuerzende Neubauten,Youme & Meyou,0.526139,-1.628377,They build a ship each wintertime\nFor launch ...,https://p.scdn.co/mp3-preview/a2acca0ac29d3a0a...,artist_and_song,Relaxed,"[-1.2014904022216797, -0.5136592388153076, -0....","[0.5157966639886593, 1.5658099646756507, 1.259...",2
4,9177,Mouse On Mars,Wipe That Sound,0.815393,0.662457,"Kick the can\nI kick, kick kick the can\nI kic...",https://p.scdn.co/mp3-preview/a803171c426e144d...,artist_and_song,Happy,"[-0.42865195870399475, 0.8299823999404907, 1.1...","[2.3368670630266033, 1.3798174751404781, 0.647...",1


In [4]:
embeddings_audio_train = df_train['audio_embedding'].apply(ast.literal_eval).apply(np.array)
x_audio_train = np.stack(embeddings_audio_train.values)
embeddings_audio_test = df_test['audio_embedding'].apply(ast.literal_eval).apply(np.array)
x_audio_test = np.stack(embeddings_audio_test.values)

embeddings_lyrics_train = df_train['lyrics_embedding'].apply(ast.literal_eval).apply(np.array)
x_lyrics_train = np.stack(embeddings_lyrics_train.values)
embeddings_lyrics_test = df_test['lyrics_embedding'].apply(ast.literal_eval).apply(np.array)
x_lyrics_test = np.stack(embeddings_lyrics_test.values)

y_train = df_train['label']
y_test = df_test['label']

#### 0.1 Train or load the models and get predictions
Run the first cell to train the models, run the second one to load the models

In [5]:
use_pretrained_svms = True
if use_pretrained_svms: 
    name_svm_audio = 'models/SVM_audio.joblib'
    name_svm_lyrics = 'models/SVM_lyrics.joblib'
    svm_classifier_audio = load(name_svm_audio)
    svm_classifier_lyrics = load(name_svm_lyrics)
else:
    svm_classifier_lyrics = SVC(kernel='rbf', C=1, gamma='auto', probability=True)
    svm_classifier_lyrics.fit(x_lyrics_train, y_train)
    print("Lyrics SVM trained")

    svm_classifier_audio = SVC(kernel='rbf', C=1, gamma='auto', probability=True)
    svm_classifier_audio.fit(x_audio_train, y_train)
    print("Audio SVM trained")

In [6]:
# Predict the probabilities

y_pred_test_prob_lyrics = svm_classifier_lyrics.predict_proba(x_lyrics_test)
y_pred_test_prob_audio = svm_classifier_audio.predict_proba(x_audio_test)
y_pred_train_prob_lyrics = svm_classifier_lyrics.predict_proba(x_lyrics_train)
y_pred_train_prob_audio = svm_classifier_audio.predict_proba(x_audio_train)

In [7]:
#get labels
y_pred_lyrics = np.argmax(y_pred_test_prob_lyrics, axis=1)
y_pred_audio = np.argmax(y_pred_test_prob_audio, axis=1)

acc_lyrics, acc_audio = accuracy_score(y_test, y_pred_lyrics), accuracy_score(y_test, y_pred_audio)
print(f"Lyrics accuracy: {acc_lyrics:.2f}")
print(f"Audio accuracy: {acc_audio:.2f}")

Lyrics accuracy: 0.43
Audio accuracy: 0.42


### 1. Models taking only the probabilities into account

### 1.1 (Un)weighted average

In [8]:
# First model adds confidence scores of the two models and then takes the argmax
y_pred_max_combined = np.argmax(y_pred_test_prob_lyrics + y_pred_test_prob_audio, axis=1)
acc_max_combined = accuracy_score(y_test, y_pred_max_combined)
print(f"Accuracy first model: {acc_max_combined:.2f}")

# Variation: Use the weighted average. Weights are the accuracies of the individual models.
y_pred_weighted_combined = np.argmax(acc_lyrics * y_pred_test_prob_lyrics + acc_audio * y_pred_test_prob_audio, axis=1)
acc_weighted_combined = accuracy_score(y_test, y_pred_weighted_combined)
print(f"Accuracy modified first model: {acc_weighted_combined:.2f}")

# The scores for the evenly weighted models and the combined model are very similar. This is likely due to the fact that the accuracies of the two models are very similar.

Accuracy first model: 0.45
Accuracy modified first model: 0.45


### 1.2 Confidence-based selection

####  1.2.1 Proper confidence values

In [9]:
# Second model takes the choice of the model that's more certain of its decision (i.e. the maximum probability)

y_pred_max = np.argmax(np.maximum(y_pred_test_prob_lyrics, y_pred_test_prob_audio), axis=1)
acc_max = accuracy_score(y_test, y_pred_max)
print(f"Accuracy: {acc_max:.2f}")

Accuracy: 0.45


### 1.2.2 Adjusted confidence values

Here, we don't just take the choice of the model that's more confident, but we weight the confidence values with the resepctive model's performance in the given emotion class.

 **Example** : If model 1 chooses happy, has a mid confidence value while model 2 chooses sad with a higher confidence value, we still use model 1's choice in the case that model 1 is significantly better in classifying happy songs than model 2 is in classifying sad songs.

**First step:** Calculate the accuracies of the models in the given classes

In [10]:
unique_labels = np.unique(y_train)

accuracies_train_audio = {}
accuracies_train_lyrics = {}

# Convert y_train to a numpy array
y_train_np = y_train.to_numpy()

for label in unique_labels:
    idx = np.where(y_train_np == label)

    y_train_label = y_train_np[idx]
    y_pred_train_prob_audio_label = np.argmax(y_pred_train_prob_audio[idx], axis=1)
    y_pred_train_prob_lyrics_label = np.argmax(y_pred_train_prob_lyrics[idx], axis=1)

    accuracies_train_audio[label] = accuracy_score(y_train_label, y_pred_train_prob_audio_label)
    accuracies_train_lyrics[label] = accuracy_score(y_train_label, y_pred_train_prob_lyrics_label)

**Second step:** Weight the confidence values with the accuracies of the given model in the given emotion class

In [11]:
# Modification: Weight the confidence values with the accuracy of the model in the given class

def calculate_combined_accuracy(y_test, y_pred_test_prob_audio, y_pred_test_prob_lyrics, accuracies_train_audio, accuracies_train_lyrics):
    combined_predictions = []
    
    for audio_confidences, lyrics_confidences in zip(y_pred_test_prob_audio, y_pred_test_prob_lyrics):

        adjusted_confidences_audio = {label: audio_confidences[label] * accuracies_train_audio[label] for label in range(len(audio_confidences))}
        adjusted_confidences_lyrics = {label: lyrics_confidences[label] * accuracies_train_lyrics[label] for label in range(len(lyrics_confidences))}

        combined_confidences = {label: max(adjusted_confidences_audio[label], adjusted_confidences_lyrics[label]) for label in adjusted_confidences_audio}        

        final_prediction = max(combined_confidences, key=combined_confidences.get)
        combined_predictions.append(final_prediction)
    
    total_accuracy = accuracy_score(y_test, combined_predictions)
    return total_accuracy

y_test_np = np.array(y_test)

total_accuracy = calculate_combined_accuracy(y_test_np, y_pred_test_prob_audio, y_pred_test_prob_lyrics, accuracies_train_audio, accuracies_train_lyrics)

print(f"Total accuracy of the combined model: {total_accuracy:.2f}")

Total accuracy of the combined model: 0.44


**Conclusion:** Weighting the confidence values doesn't give us a better prediction!

### 2. Second kind of models take probabilities as new input

#### 2.1 RandomForestClassifier

To add (If we want to add it in the presentation): How does it work?

In [12]:
x_meta_train = np.hstack((y_pred_train_prob_lyrics, y_pred_train_prob_audio))
x_meta_test = np.hstack((y_pred_test_prob_lyrics, y_pred_test_prob_audio))

In [13]:
from sklearn.ensemble import RandomForestClassifier

meta_classifier_forest = RandomForestClassifier()
meta_classifier_forest.fit(x_meta_train, y_train)
y_pred_forest = meta_classifier_forest.predict(x_meta_test)
acc_forest = accuracy_score(y_test, y_pred_forest)
print(f"Accuracy: {acc_forest:.2f}")

Accuracy: 0.44


#### 2.2 Logistic regression with the confidence values

Yet to be explained

In [14]:
from sklearn.linear_model import LogisticRegression

# Using the same meta features as before
meta_classifier_lr = LogisticRegression(max_iter=1000)
meta_classifier_lr.fit(x_meta_train, y_train)

# Predictions
y_pred_lr = meta_classifier_lr.predict(x_meta_test)
acc_meta_lr = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy: {acc_meta_lr:.2f}")


Accuracy: 0.45


#### 2.3 Neural Network

In [15]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

x_meta_train_tensor = torch.tensor(x_meta_train, dtype=torch.float)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
x_meta_test_tensor = torch.tensor(x_meta_test, dtype=torch.float)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(x_meta_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataset = TensorDataset(x_meta_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

class MetaClassifierNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(MetaClassifierNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)  # First hidden layer
        self.fc2 = nn.Linear(128, 64)  # Second hidden layer
        self.fc3 = nn.Linear(64, num_classes)  # Output layer
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialize the model
model = MetaClassifierNN(input_size=8, num_classes=4)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()  # Clear gradients for this training step
        outputs = model(inputs)  # Forward pass
        loss = criterion(outputs, labels)  # Calculate loss
        loss.backward()  # Backward pass
        optimizer.step()  # Apply gradients
    
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

Epoch 1/20, Loss: 0.7346339821815491
Epoch 2/20, Loss: 0.6938502788543701
Epoch 3/20, Loss: 0.49238133430480957
Epoch 4/20, Loss: 0.7426857948303223
Epoch 5/20, Loss: 0.589238703250885
Epoch 6/20, Loss: 0.6927602291107178
Epoch 7/20, Loss: 0.5901066660881042
Epoch 8/20, Loss: 0.682794988155365
Epoch 9/20, Loss: 0.6886836886405945
Epoch 10/20, Loss: 0.7383273243904114
Epoch 11/20, Loss: 0.47385212779045105
Epoch 12/20, Loss: 0.5663787126541138
Epoch 13/20, Loss: 0.43598952889442444
Epoch 14/20, Loss: 0.43212762475013733
Epoch 15/20, Loss: 0.4314401149749756
Epoch 16/20, Loss: 0.49361732602119446
Epoch 17/20, Loss: 0.6350902915000916
Epoch 18/20, Loss: 0.6237216591835022
Epoch 19/20, Loss: 0.6353707313537598
Epoch 20/20, Loss: 0.6628891825675964


In [16]:
model.eval()

correct = 0
total = 0
with torch.no_grad(): 
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy_NN = correct / total
print(f'Accuracy on the test set: {accuracy_NN * 100:.2f}%')

Accuracy on the test set: 44.19%


In [17]:
print(x_audio_train.shape)
print(x_lyrics_train.shape)

(10870, 498)
(10870, 768)


### 3. Training an SVM on the combined feature space

Try to train the SVM on the combined featurespace.

In [18]:
# Combine the two models
x_train_combined = np.concatenate((x_audio_train, x_lyrics_train), axis = 1)
x_test_combined = np.concatenate((x_audio_test, x_lyrics_test), axis = 1)

x_train_combined.shape

svm_classifier_combined = SVC(kernel='rbf', C=1, gamma='auto', probability=True)
svm_classifier_combined.fit(x_train_combined, y_train)

In [None]:
y_pred_combined = svm_classifier_combined.predict(x_test_combined)
acc_combined = accuracy_score(y_test, y_pred_combined)
print(f"Accuracy: {acc_combined:.2f}")

Accuracy: 0.45


### Ideas for data analysis

### Other things to mention in the presentation

- We checked that the dataset is balanced
- We checked that the models are not horribly over fitting (hopefully)
- Explanation of audio features and how we acquired the data
- Explanation of lyrical features and how we acquired it 
- Explanation of the arousal/valence scale
- Short explanation SVM, mention scaling
- Analysis of the two independent models
- Introduction to ensemble learning, present different techniques
- Discussion, what went wrong, what could be done better (both for the simple models and the bimodal model)
- Perhaps comparison with Deezer paper