In [1]:
import torch
from torch.utils.data import DataLoader
from miditok import REMI
from miditok.pytorch_data import DatasetMIDI, DataCollator
from pathlib import Path
import json
import pandas as pd
import numpy as np
import random

In [2]:
train_data=pd.read_csv('../../dataframes/train_set_1.csv')
validation_data=pd.read_csv('../../dataframes/validation_set_1.csv')
test_set=pd.read_csv('../../dataframes/test_set_1.csv')

### 1. Dataloader

In [3]:
seed = 42
if seed is not None:
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [4]:
torch.set_default_device("cuda")

# Define a custom DatasetMIDI subclass
class CustomDatasetMIDI(DatasetMIDI):
    def __init__(self, files_paths, labels, tokenizer, max_seq_len, bos_token_id, eos_token_id):
        super().__init__(files_paths, tokenizer, max_seq_len, bos_token_id, eos_token_id)
        self.labels = labels

    def __getitem__(self, idx):
        item = super().__getitem__(idx)
        item["labels"] = self.labels[idx]  # Add labels to the item dictionary
        return item


def create_data_loader(piano_scores_df):
  
  midi_paths = []
  labels=[]
  paths=piano_scores_df['piano_scores_paths']
  paths=paths.apply(lambda x: x.replace('../author-profiling-in-symbolic-music/','../../'))

  for i,score in enumerate(paths):

      midi_paths.append(Path(score))

      integer_label= 0 if piano_scores_df['composer_gender'][i] == 'Male' else 1

      labels.append(torch.tensor(integer_label))  # Modify this line to extract the label

  # Initialize the tokenizer
  tokenizer = REMI.from_pretrained("Natooz/Maestro-REMI-bpe20k")


  # Initialize the dataset
  dataset = CustomDatasetMIDI(
      files_paths=midi_paths,
      labels=labels,  # Pass the labels to the dataset
      tokenizer=tokenizer,
      max_seq_len=1024,
      bos_token_id=tokenizer.pad_token_id,
      eos_token_id=tokenizer["BOS_None"]
  )

  # Initialize the collator
  collator = DataCollator(tokenizer.pad_token_id)

  data_loader = DataLoader(dataset=dataset, collate_fn=collator, batch_size=5)

  return data_loader


eval_dataloader=create_data_loader(validation_data)
train_dataloader=create_data_loader(train_data)

  self.config = TokenizerConfig()
  return cls(**input_dict, **kwargs)
  return cls(**input_dict, **kwargs)


### 2. Trainning loop

In [32]:
import pandas as pd
from accelerate import Accelerator
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
import torch
from tqdm.auto import tqdm

def evaluate(model, eval_dl, accelerator):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0
    pred_list_val_temp=[]
    labels_list_val_temp=[]
    
    for batch in eval_dl:
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_correct += (predictions == batch['labels']).sum().item()
        total_samples += len(batch['labels'])
        pred_list_val_temp.append(predictions)
        labels_list_val_temp.append(batch['labels'])
    
    avg_loss = total_loss / len(eval_dl)
    accuracy = total_correct / total_samples

    predictions_df_val_temp=pd.DataFrame({'predictions':[tensor.tolist() for tensor in pred_list_val_temp],'labels':[tensor.tolist() for tensor in labels_list_val_temp]})
    predictions_df_val_temp=predictions_df_val_temp.explode(['predictions', 'labels'])
    predictions_df_val_temp['epoch']=epoch
    
    return avg_loss, accuracy, predictions_df_val_temp

# Initialize the accelerator
accelerator = Accelerator()
torch.cuda.empty_cache()

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    "Natooz/Maestro-REMI-bpe20k",
    trust_remote_code=True,
    torch_dtype="auto",
    num_labels=2
)
model.config.pad_token_id = model.config.eos_token_id

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=3e-5)

# Prepare the data and model
train_dl, eval_dl, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

# Set up the learning rate scheduler
num_epochs = 10

num_training_steps = num_epochs * len(train_dl)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Progress bar
progress_bar = tqdm(range(num_training_steps))

# Initialize lists to store metrics
metrics = {
    "epoch": [],
    "batch": [],
    "train_loss": [],
    "train_accuracy": [],
    "val_loss": [],
    "val_accuracy": [],
    "total_train_loss": [],  # New column for total training loss
    "total_train_accuracy": []  # New column for total training accuracy
}

# Training loop
model.train()
batch_number = 0

predictions_df_val=pd.DataFrame(columns=['predictions','labels','epoch'])
predictions_df_train=pd.DataFrame(columns=['predictions','labels','epoch'])

for epoch in range(num_epochs):
    total_train_loss = 0
    total_correct = 0
    total_samples = 0
    pred_list_train_temp=[]
    labels_list_train_temp=[]
    
    for batch in train_dl:

    
        outputs = model(**batch)
        loss = outputs.loss
        total_train_loss += loss.item()
        
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_correct += (predictions == batch['labels']).sum().item()
        total_samples += len(batch['labels'])
        
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        
        # Calculate training accuracy
        train_accuracy = total_correct / total_samples
        
        # Log batch metrics
        metrics["epoch"].append(epoch + 1)
        metrics["batch"].append(batch_number + 1)
        metrics["train_loss"].append(loss.item())
        metrics["train_accuracy"].append(train_accuracy)
        batch_number += 1
        
        pred_list_train_temp.append(predictions)
        labels_list_train_temp.append(batch['labels'])
        
        predictions_df_train_temp=pd.DataFrame({'predictions':[tensor.tolist() for tensor in pred_list_train_temp],'labels':[tensor.tolist() for tensor in labels_list_train_temp]})
        predictions_df_train_temp=predictions_df_train_temp.explode(['predictions', 'labels'])
        predictions_df_train_temp['epoch']=epoch

        #print(f"Epoch {epoch + 1}, Batch {batch_number}, Loss: {loss.item()}, Accuracy: {train_accuracy}")
    predictions_df_train=pd.concat([predictions_df_train,predictions_df_train_temp])
   
    # Log epoch metrics
    avg_train_loss = total_train_loss / len(train_dl)
    avg_train_accuracy = total_correct / total_samples
    print(f"Epoch {epoch + 1} Average Training Loss: {avg_train_loss}, Average Training Accuracy: {avg_train_accuracy}")
    
    # Add total training loss and accuracy to metrics dictionary
    metrics["total_train_loss"].append(avg_train_loss)
    metrics["total_train_accuracy"].append(avg_train_accuracy)

    # Fill missing batch entries with NaN for consistency
    metrics["total_train_loss"] += [None] * (len(metrics["epoch"]) - len(metrics["total_train_loss"]))
    metrics["total_train_accuracy"] += [None] * (len(metrics["epoch"]) - len(metrics["total_train_accuracy"]))

    # Evaluate the model
    val_loss, val_accuracy, predictions_df_val_temp = evaluate(model, eval_dl, accelerator)
    print(f"Epoch {epoch + 1} Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}")
    
    predictions_df_val=pd.concat([predictions_df_val,predictions_df_val_temp])
    
    # Log epoch validation metrics
    metrics["val_loss"].append(val_loss)
    metrics["val_accuracy"].append(val_accuracy)

    # Fill missing batch entries with NaN for consistency
    metrics["val_loss"] += [None] * (len(metrics["epoch"]) - len(metrics["val_loss"]))
    metrics["val_accuracy"] += [None] * (len(metrics["epoch"]) - len(metrics["val_accuracy"]))
    
    # Switch back to training mode
    model.train()

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at Natooz/Maestro-REMI-bpe20k and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2320 [00:00<?, ?it/s]

Epoch 1 Average Training Loss: 0.9125905463186021, Average Training Accuracy: 0.657439446366782
Epoch 1 Validation Loss: 1.923276453188831, Validation Accuracy: 0.33986928104575165
Epoch 2 Average Training Loss: 0.743209915902016, Average Training Accuracy: 0.6401384083044983
Epoch 2 Validation Loss: 1.4389988163546208, Validation Accuracy: 0.33986928104575165
Epoch 3 Average Training Loss: 0.6925250604700554, Average Training Accuracy: 0.6868512110726643
Epoch 3 Validation Loss: 1.236229038526935, Validation Accuracy: 0.33986928104575165
Epoch 4 Average Training Loss: 0.6666846187175091, Average Training Accuracy: 0.7093425605536332
Epoch 4 Validation Loss: 1.129022327642287, Validation Accuracy: 0.33986928104575165
Epoch 5 Average Training Loss: 0.6557525243642258, Average Training Accuracy: 0.7076124567474048
Epoch 5 Validation Loss: 1.0602615574194538, Validation Accuracy: 0.33986928104575165
Epoch 6 Average Training Loss: 0.6466314100242895, Average Training Accuracy: 0.7024221453

In [None]:
#predictions_df_val=pd.DataFrame(columns=['predictions','labels','epoch'])
#predictions_df_train=pd.DataFrame(columns=['predictions','labels','epoch'])

In [None]:
#save model
model.save_pretrained('model_e3')

# Create a DataFrame from the metrics
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv('metrics_df_e3.csv', index=False)

#export predictions in trainning and validation set
predictions_df_val.to_csv('predictions_df_val_e3.csv')
predictions_df_train.to_csv('predictions_df_train_e3.csv')

### 3. Results

In [None]:
import plotly.express as px
import pandas as pd

metrics_df=pd.read_csv('metrics_df_e3.csv')

metrics_per_epoch=metrics_df[~metrics_df['val_loss'].isna()]

In [None]:
metrics_per_epoch

In [None]:
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Scatter(
    y=metrics_per_epoch['val_accuracy'],
    x=metrics_per_epoch['epoch'],
    mode='lines+markers+text',
    name='val_accuracy',
    text=metrics_per_epoch['val_accuracy'].round(2),  # Add labels to the points
    textposition='top center'  # Position the labels
))

fig.add_trace(go.Scatter(
    y=metrics_per_epoch['total_train_accuracy'],
    x=metrics_per_epoch['epoch'],
    mode='lines+markers+text',
    name='train_accuracy',
    text=metrics_per_epoch['total_train_accuracy'].round(2),  # Add labels to the points
    textposition='top center'  # Position the labels
))

fig.update_layout(
    xaxis=dict(range=[0.9, 10.1],dtick=1),
    yaxis=dict(range=[0.3, 1],dtick=0.05),
    xaxis_title='Epoch',
    yaxis_title='Average Accuracy'
)

fig.show()

In [None]:
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Scatter(
    y=metrics_per_epoch['val_loss'],
    x=metrics_per_epoch['epoch'],
    mode='lines+markers+text',
    name='val_loss',
    text=metrics_per_epoch['val_loss'].round(2),  # Add labels to the points
    textposition='top center'  # Position the labels
))

fig.add_trace(go.Scatter(
    y=metrics_per_epoch['total_train_loss'],
    x=metrics_per_epoch['epoch'],
    mode='lines+markers+text',
    name='train_loss',
    text=metrics_per_epoch['total_train_loss'].round(2),  # Add labels to the points
    textposition='top center'  # Position the labels
))

fig.update_layout(
    xaxis=dict(range=[0.9, 3.1],dtick=1),    
    yaxis=dict(range=[0.6, 2.3]),
    xaxis_title='Epoch',
    yaxis_title='Average Loss'
)

fig.show()

In [None]:
def evaluate_test_set(model, test_dataloader, accelerator):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0
    
    for batch in eval_dl:
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_correct += (predictions == batch['labels']).sum().item()
        total_samples += len(batch['labels'])
    
    avg_loss = total_loss / len(eval_dl)
    accuracy = total_correct / total_samples
    return avg_loss, accuracy
    
test_dataloader=create_data_loader(test_set)

avg_test_loss, test_accuracy=evaluate_test_set(model, test_dataloader, accelerator)

print('test accuracy: ',test_accuracy)

In [None]:
def evaluate_test_set(model, test_dataloader, accelerator):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0
    pred_list=[]
    labels_list=[]
    
    for i,batch in enumerate(test_dataloader):
        
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        
        predictions = torch.argmax(outputs.logits, dim=-1)
        total_correct += (predictions == batch['labels']).sum().item()
        total_samples += len(batch['labels'])
        
        pred_list.append(predictions)
        labels_list.append(batch['labels'])
            
    avg_loss = total_loss / len(eval_dl)
    accuracy = total_correct / total_samples
    print('number of batches',i)
    
    return avg_loss, accuracy, pred_list, labels_list


test_dataloader=create_data_loader(test_set)

# Prepare the data and model
train_dl, test_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, test_dataloader, model, optimizer
)
    
avg_test_loss, test_accuracy, pred_list, labels_list=evaluate_test_set(model, test_dataloader, accelerator)

print('test accuracy: ',test_accuracy)

predictions_e3=pd.DataFrame({'predictions':[tensor.tolist() for tensor in pred_list],'labels':[tensor.tolist() for tensor in labels_list]})
predictions_e3=predictions_e3.explode(['predictions','labels'])
predictions_e3.to_csv('predictions_e3.csv')

In [None]:
predictions_e3=pd.DataFrame({'predictions':[tensor.tolist() for tensor in pred_list],'labels':[tensor.tolist() for tensor in labels_list]})
predictions_e3=predictions_e3.explode(['predictions','labels'])
predictions_e3.to_csv('predictions_e3.csv')