In [23]:
import torch
from torch.utils.data import DataLoader
from miditok import REMI
from miditok.pytorch_data import DatasetMIDI, DataCollator
from pathlib import Path
import json
import pandas as pd
import numpy as np
import random
import importlib

In [24]:
import features_vectors as utils

In [25]:
# Re-import the already-loaded `features_vectors` module so that edits made to
# the file on disk take effect without restarting the kernel.
importlib.reload(utils)

<module 'features_vectors' from '/mnt/nfs_share_magnet1/lafuente/symbolic_music/author-profiling/experiments/e13/features_vectors.py'>

In [4]:
# Load the train / validation / test score tables from CSV.
# NOTE(review): test_data is read from the *validation* CSV, so the "test"
# split is identical to validation_data and any "test" metric computed from it
# is really a validation metric — confirm whether a separate test-set file was
# intended here.
train_data=pd.read_csv('../../train data/piano_merged_scores_train_set.csv')
validation_data=pd.read_csv('../../train data/piano_merged_scores_validation_set.csv')
test_data=pd.read_csv('../../train data/piano_merged_scores_validation_set.csv')

## 1. Get feature vectors

### 1.1 Create dataloader

In [26]:
# Build one loader per split with the project helper `utils.create_data_loader`.
# The helper is told to use the 'paths_type0_piano' column of each table
# (presumably the per-score file paths — see features_vectors.py to confirm).
val_dataloader=utils.create_data_loader(scores_df=validation_data,paths_column_name='paths_type0_piano')
train_dataloader=utils.create_data_loader(scores_df=train_data,paths_column_name='paths_type0_piano')
test_dataloader=utils.create_data_loader(scores_df=test_data,paths_column_name='paths_type0_piano')

0      ../../train data/merged piano scores/Bizet,_Ge...
1      ../../train data/merged piano scores/Browne,_A...
2      ../../train data/merged piano scores/Boulanger...
3      ../../train data/merged piano scores/Bizet,_Ge...
4      ../../train data/merged piano scores/Boulanger...
                             ...                        
104    ../../train data/merged piano scores/Abrams,_H...
105    ../../train data/merged piano scores/Debussy,_...
106    ../../train data/merged piano scores/Burleigh,...
107    ../../train data/merged piano scores/Debussy,_...
108    ../../train data/merged piano scores/Gonzaga,_...
Name: paths_type0_piano, Length: 109, dtype: object
0      ../../train data/merged piano scores/Kinkel,_J...
1      ../../train data/merged piano scores/Jaëll,_Ma...
2      ../../train data/merged piano scores/Schröter,...
3      ../../train data/merged piano scores/Jaëll,_Ma...
4      ../../train data/merged piano scores/Lehmann,_...
                             ...    

### 1.2 Get feature vectors from pre-trained model

In [27]:
# Compute the feature vectors for the training split with the project helper.
# NOTE(review): the meaning of `set_type` and `feature_tensors=False` is defined
# in features_vectors.py (not visible here) — confirm before changing them.
train_feature_vectors=utils.get_feature_vectors(dataloader=train_dataloader,
                                                dataframe=train_data,
                                                set_type='train', 
                                                feature_tensors=False)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
computing feature tensors: 100%|███████████████| 154/154 [09:31<00:00,  3.71s/it]


In [28]:
# Compute the feature vectors for the validation split with the project helper.
# NOTE(review): the meaning of `set_type` and `feature_tensors=False` is defined
# in features_vectors.py (not visible here) — confirm before changing them.
val_feature_vectors=utils.get_feature_vectors(dataloader=val_dataloader,
                                                dataframe=validation_data,
                                                set_type='val', 
                                                feature_tensors=False)




computing feature tensors: 100%|█████████████████| 22/22 [01:32<00:00,  4.20s/it]


## 2. Train MLP

In [29]:
from torch.utils.data import Dataset,DataLoader

# Seed every random number generator used in this notebook (torch CPU/CUDA,
# Python's `random`, numpy) and force cuDNN into deterministic mode so that
# repeated runs are comparable. Set `seed = None` to skip all of this.
seed = 42
if seed is not None:
    for seed_fn in (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        random.seed,
        np.random.seed,
    ):
        seed_fn(seed)
    # Disable the cuDNN autotuner and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
class DatasetMLP(Dataset):
    """Labelled dataset over a 2-D table whose last column is the target.

    Each item is the pair `(features, label)` obtained by splitting row `ind`
    of `data` into everything but its last element and the last element.
    """

    def __init__(self, data):
        # `data` must expose `.shape` and support row indexing (e.g. a numpy array).
        self.data = data

    def __len__(self):
        """Number of rows (samples) in the table."""
        n_rows = self.data.shape[0]
        return n_rows

    def __getitem__(self, ind):
        """Return `(features, label)` for row `ind`."""
        row = self.data[ind]
        return row[:-1], row[-1]

class TestDataset(DatasetMLP):
    """Unlabelled variant of `DatasetMLP`: every row is returned whole."""

    def __getitem__(self, ind):
        """Return row `ind` of `data` without splitting off a label."""
        return self.data[ind]

# Mini-batch size shared by every MLP loader in this notebook.
batch_size = 20

# Wrap the extracted feature vectors (converted to numpy arrays) as datasets.
train_set_mlp = DatasetMLP(np.array(train_feature_vectors))
val_set_mlp = DatasetMLP(np.array(val_feature_vectors))

# Only the training loader shuffles; validation keeps its original order.
train_dataloder_mlp = DataLoader(train_set_mlp, batch_size=batch_size, shuffle=True)
val_dataloder_mlp = DataLoader(val_set_mlp, batch_size=batch_size, shuffle=False)

In [30]:
import torch
import torch.nn as nn

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#class MLP(nn.Module):
#    def __init__(self, input_dim):
#        super(MLP, self).__init__()
#        self.linear = nn.Linear(input_dim, 2)  # Output 2 classes
    
#    def forward(self, x):
#        out = self.linear(x)
#        return out

class MLP(nn.Module):
    """Fully connected classifier: three ReLU hidden layers and a linear head.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim1: width of the first hidden layer.
        hidden_dim2: width of the second hidden layer.
        hidden_dim3: width of the third hidden layer.
        output_dim: number of output scores (one per class).
    """

    def __init__(self, input_dim, hidden_dim1=128, hidden_dim2=64, hidden_dim3=32, output_dim=2):
        super().__init__()
        # Attribute names fix the state_dict keys (fc1.weight, ...), so they
        # must stay as they are for saved checkpoints to load.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim1)
        self.fc2 = nn.Linear(in_features=hidden_dim1, out_features=hidden_dim2)
        self.fc3 = nn.Linear(in_features=hidden_dim2, out_features=hidden_dim3)
        self.fc4 = nn.Linear(in_features=hidden_dim3, out_features=output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw (un-normalised) class scores for a batch `x`."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.relu(layer(hidden))
        # No activation on the head: the scores are returned as-is.
        return self.fc4(hidden)

from sklearn.metrics import balanced_accuracy_score

def evaluate(model, dataloader, criterion):
    """Score `model` on every batch of `dataloader` without tracking gradients.

    The model is switched to evaluation mode for the duration of the call and
    put back into whatever mode (train / eval) it was in beforehand, so calling
    this from inside a training loop does not leave the model in eval mode for
    the following epochs.

    Args:
        model: network whose forward pass returns one row of class scores per
            sample; the predicted class is the index of the largest score.
        dataloader: iterable yielding `(x, y)` batches. `x` is cast to float
            and `y` to long, and both are moved to the global `device`.
        criterion: loss callable invoked as `criterion(output, y)`; its result
            must support `.item()`.

    Returns:
        Tuple `(balanced_accuracy, avg_loss, predictions, true_labels)`:
        the balanced accuracy over all samples, the unweighted mean of the
        per-batch losses, and numpy arrays with the predicted and true labels.

    Raises:
        ValueError: if `dataloader` yields no batches.
    """
    was_training = model.training
    model.eval()  # Set model to evaluation mode
    predictions = []
    true_labels = []
    losses = []

    try:
        with torch.no_grad():
            for x, y in dataloader:
                x = x.to(device).float()
                y = y.to(device).long()

                output = model(x)
                _, predicted = torch.max(output, 1)

                predictions.extend(predicted.tolist())
                true_labels.extend(y.tolist())

                # Compute loss
                losses.append(criterion(output, y).item())
    finally:
        # Restore the caller's mode even if a batch raised.
        model.train(was_training)

    if not losses:
        raise ValueError('evaluate() received a dataloader that yielded no batches')

    # Compute average loss (mean of the per-batch losses)
    avg_loss = sum(losses) / len(losses)

    # Compute balanced accuracy
    predictions = np.array(predictions)
    true_labels = np.array(true_labels)
    balanced_accuracy = balanced_accuracy_score(true_labels, predictions)

    return balanced_accuracy, avg_loss, predictions, true_labels


In [36]:
from sklearn.metrics import balanced_accuracy_score
import torch.optim.lr_scheduler as lr_scheduler



# Re-seed every random number generator (torch CPU/CUDA, Python's `random`,
# numpy) right before the model is built, and force cuDNN into deterministic
# mode. Set `seed = None` to skip all of this.
seed = 42
if seed is not None:
    for seed_fn in (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        random.seed,
        np.random.seed,
    ):
        seed_fn(seed)
    # Disable the cuDNN autotuner and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
# Length of one input feature vector fed to the MLP.
# NOTE(review): hard-coded; it has to equal the length of the `x` part of the
# items served by train_dataloder_mlp — confirm against the extracted features.
input_dim = 523264
model = MLP(input_dim).to(device)

initial_lr = 0.001
# Pass the learning rate explicitly so `initial_lr` is actually what Adam uses.
optimizer = torch.optim.Adam(model.parameters(), lr=initial_lr)
criterion = nn.CrossEntropyLoss()
# The epoch loop calls scheduler.step(), so the scheduler must be created in
# this cell and bound to *this* optimizer. Without this line the cell raises
# NameError on a fresh kernel, or silently steps a stale scheduler left over
# from an earlier run. Set gamma=1.0 to keep the learning rate constant.
scheduler = lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

print(model)

epochs = 10

# Per-epoch history, later written to the metrics CSV.
train_avg_loss_list = []
val_avg_loss_list = []
train_balanced_accuracy_list = []
val_balanced_accuracy_list = []

for epoch in range(epochs):
    # evaluate() switches the model to eval mode, so training mode has to be
    # re-enabled at the start of every epoch rather than once before the loop.
    model.train()

    losses = []
    predictions = []
    true_labels = []
    for batch_num, input_data in enumerate(train_dataloder_mlp):
        optimizer.zero_grad()
        x, y = input_data
        x = x.to(device).float()
        y = y.to(device).long()  # Ensure y is of type long for CrossEntropyLoss

        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        losses.append(loss.item())

        optimizer.step()

        # Convert the class scores to predicted class labels (index of the max score)
        _, predicted = torch.max(output, 1)
        predictions.extend(predicted.tolist())
        true_labels.extend(y.tolist())

    # Advance the learning-rate schedule once per epoch, after the optimizer steps
    scheduler.step()

    train_balanced_accuracy = balanced_accuracy_score(true_labels, predictions)
    train_avg_loss = sum(losses) / len(losses)
    print('Epoch %d | Train Loss %6.2f| Train Balanced Accuracy %6.2f' % (epoch, train_avg_loss ,train_balanced_accuracy))

    val_balanced_accuracy, val_avg_loss, val_predictions, val_true_labels = evaluate(model, val_dataloder_mlp, criterion)
    print('Epoch %d | Validation Loss %6.2f| Validation Balanced Accuracy: %6.2f' % (epoch, val_avg_loss, val_balanced_accuracy))

    train_avg_loss_list.append(train_avg_loss)
    val_avg_loss_list.append(val_avg_loss)

    train_balanced_accuracy_list.append(train_balanced_accuracy)
    val_balanced_accuracy_list.append(val_balanced_accuracy)


# Convert the last epoch's training predictions and true labels to numpy arrays
predictions = np.array(predictions)
true_labels = np.array(true_labels)

# Show the last epoch's training predictions next to their true labels
print("Predictions:", predictions)
print("True Labels:", true_labels)


MLP(
  (fc1): Linear(in_features=523264, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (fc4): Linear(in_features=32, out_features=2, bias=True)
  (relu): ReLU()
)


OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 

In [32]:
# Balanced accuracy of `predictions` against `true_labels` as currently held in
# the kernel; the training cell leaves the last epoch's training-set values in
# these two names, so this is a training metric.
balanced_accuracy_score(y_true=true_labels,y_pred=predictions)

0.9970326409495549

In [33]:
# Compute the feature vectors for the test split with the project helper.
test_feature_vectors = utils.get_feature_vectors(
    dataloader=test_dataloader,
    dataframe=test_data,
    set_type='test',
    feature_tensors=False,
)

# Wrap them as a dataset (numpy array) and serve them in their original order.
test_set_mlp = DatasetMLP(np.array(test_feature_vectors))
test_dataloder_mlp = DataLoader(test_set_mlp, batch_size=batch_size, shuffle=False)




computing feature tensors: 100%|█████████████████| 22/22 [01:25<00:00,  3.90s/it]


In [34]:
# Score the current model on the test loader; `loss` receives the average loss
# returned by evaluate().
test_balanced_accuracy, loss, test_predictions, test_true_labels = evaluate(model, test_dataloder_mlp, criterion)

In [35]:
# Test balanced accuracy expressed as a percentage.
test_balanced_accuracy*100

63.6734693877551

In [37]:
# One row per epoch: train / validation loss and balanced accuracy, written to
# CSV without the index column.
metrics_df = pd.DataFrame({
    'train_avg_loss': train_avg_loss_list,
    'train_balanced_accuracy': train_balanced_accuracy_list,
    'val_avg_loss': val_avg_loss_list,
    'val_balanced_accuracy': val_balanced_accuracy_list,
})
metrics_df.to_csv('metrics_df_e13.csv', index=False)

In [15]:
# Checkpoint the current training state to disk: the last epoch index plus the
# model, optimizer and scheduler state dicts.
# NOTE(review): 'best_balanced_accuracy' stores the *train* balanced accuracy of
# the most recent epoch, not a tracked best value — confirm intent.
# NOTE(review): the file name is tagged e11 while the other artefacts written by
# this notebook are tagged e13 — confirm this is the intended checkpoint path.
torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'best_balanced_accuracy': train_balanced_accuracy
        }, 'best_model_e11.pth')

In [16]:
# Restore model, optimizer and scheduler state from the saved checkpoint.
# map_location='cpu' deserialises the stored tensors into host memory instead of
# re-creating them on the GPU they were saved from; without it a second copy of
# the weights and optimizer state is allocated on the GPU, which is what made
# this cell run out of CUDA memory. The load_state_dict calls then copy the
# values into the existing model parameters and optimizer state.
checkpoint = torch.load('best_model_e11.pth', map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
best_balanced_accuracy = checkpoint['best_balanced_accuracy']


OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 

In [39]:
# Pair the true labels with the predicted labels returned by the test evaluation.
predictions_df_test_e13=pd.DataFrame(data={'labels':test_true_labels,'predictions':test_predictions})

In [40]:
# Write the test predictions to CSV (the DataFrame index is included, since
# index=False is not passed).
predictions_df_test_e13.to_csv('predictions_df_test_e13.csv')

In [41]:
# Pair the validation true labels with the predictions from the most recent
# evaluate() call on the validation loader, then write them to CSV (the
# DataFrame index is included).
predictions_df_val_e13 = pd.DataFrame({
    'labels': val_true_labels,
    'predictions': val_predictions,
})
predictions_df_val_e13.to_csv('predictions_df_val_e13.csv')