# Tempo evaluation

### Import requirements

In [203]:
import pandas as pd
import librosa
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from IPython.display import clear_output
from AudioFeatureExtractor import AudioFeatureExtractor
from functools import reduce
import os
device = "mps"
sr=22050

### Utils

In [204]:
def slice_by_second(audio, segment_length_sec, sr=sr):
    # Convert segment length to samples
    segment_length_samples = int(segment_length_sec * sr)
    num_segments = len(audio) // segment_length_samples
    segments = []
    for i in range(num_segments):
        start_sample = i * segment_length_samples
        end_sample = (i + 1) * segment_length_samples
        segment = audio[start_sample:end_sample]
        segments.append(segment)
    return segments

def parse_for_local(path:str):
    is_drive = os.path.isdir('/content/drive/MyDrive/')
    if is_drive:
        return path
    else:
        return path.replace("/content/drive/MyDrive/", "/Users/yoonsookim/Library/CloudStorage/GoogleDrive-ml.laptise@gmail.com/マイドライブ/")

# Function to extract MFCC features from audio segment
def extract_melspecto(audio_segment):
    hop_length = int(1024/ 4)
    melspectrogram = librosa.feature.melspectrogram(
        y=audio_segment,
        sr=sr,
        n_fft=1024,
        hop_length=hop_length)
    return melspectrogram

### Load audios

In [201]:
sheet = pd.read_csv("https://docs.google.com/spreadsheets/d/1UDQxW1s2D6kUJuYOqQOC5WPmU6UlGb-GTCwoe52_3qw/export?format=csv")
rows = sheet.dropna(subset=["BPM", "AR"])
# List to store loaded audio data
loaded_data = []

dset = []
# Load audio files
for _,row in rows.iterrows():
    path = row["dir"]
    bpm = row["BPM"]
    ar_path = parse_for_local(f"{path}/ar.wav")
    audio, _ = librosa.load(ar_path, sr=sr, mono=True)
    metadata = {
        "bpm": bpm,
        "ar_path": ar_path,
        "title": row["Title"],
        'artist': row['Artist'],
    }
    segments = slice_by_second(audio, 10)
    for segment in segments:
        features = extract_melspecto(segment)
        feature_torch = torch.tensor(features)
        dset.append({
            "features": feature_torch,
            "bpm": bpm,
            "metadata": metadata
        })
print(len(dset))

372


### Extract Feature and build dataset

In [202]:
from numpy import float32


def pick_values(dset, key):
    return list(map(lambda d: d[key], dset))

class TempoDataset(Dataset):
    
    def get_input_size(self):
        shapes = list(self._dset[0]["feature"].shape)
        size = reduce(lambda x, y: x * y, shapes)
        return size
    
    def _parse_dict(self, d: dict):
        return {
            "feature": d["features"],
            "bpm": float32(d["bpm"]),
            "metadata": d["metadata"]
        }

    def _check_feature_length(self):
        fisrt_shape = self._dset[0]["feature"].shape
        shapes = []
        for d in self._dset:
            shape = d["feature"].shape
            shapes.append(shape)
            if shape != fisrt_shape:
                print(shapes)
                print(d['metadata'])
                raise ValueError("All features must have the same shape.")

    def __init__(self, dset: list[dict]):
        self._dset = list(map(self._parse_dict, dset))
        self._check_feature_length()
    def __len__(self):
        return len(self._dset)
    def __getitem__(self, idx):
        target = self._dset[idx]
        return target["feature"], target["bpm"], target["metadata"]

dataset = TempoDataset(dset)


[torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128, 862]), torch.Size([128

### Training

In [None]:
# Simple model definition
import random


class SimpleModel(nn.Module):
    def __init__(self, input_size:int, hidden_size:int):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)  # 出力層の数は10

    def forward(self, x):
        x = self.flattern(x)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)  # 出力層（活性化関数なし）
        return x
    
    def flattern(self, x):
        return x.view(x.size(0), -1)

    def predict(self, audio_path:str):
        y, *_ = librosa.load(audio_path, sr=sr, mono=True)
        sliced, *_ = slice_by_second(y, 5)
        melspecto = extract_melspecto(sliced)
        torch_melspecto = torch.tensor([melspecto]).to(device)
        result = self(torch_melspecto)
        return result.item()
    
    def train(self, dataset: TempoDataset, num_epochs:int,optimizer, criterion=nn.MSELoss()):
        # for plot
        losses = []  # 各エポックのロスを格納するリスト
        epoches = []
        # for log
        messages = ""

        dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
        step = 0
        # Training loop
        for epoch in range(num_epochs):
            # Each Step
            for feature, bpm, metadata in dataloader:
                # Zero the gradients
                optimizer.zero_grad()

                # Forward pass
                outputs = self(feature.to(device))
                # Calculate loss
                loss = criterion(outputs.to(device), bpm.to(device))
                # Backward pass and optimization
                loss.backward()
                optimizer.step()
                step += 1
            
            # Each Epoch
            new_epoch = epoch + 1
            epoches.append(new_epoch)
            losses.append(loss.item())
            if loss < lowest_loss_model["loss"]:
                lowest_loss_model["model"] = model
                lowest_loss_model["loss"] = loss
                print(f"Lowest loss so far: {lowest_loss_model['loss']}")
            clear_output()
            messages += f'Epoch [{new_epoch}/{num_epochs}], Loss: {loss.item()}, Answer: {bpm.item()}, Predicted: {outputs.item()}\n'
            #plot
            plt.plot(epoches, losses)
            plt.show()
            #log
            print(messages)
            
input_size = dataset.get_input_size()
hidden_size = 200 

model = SimpleModel(input_size, hidden_size).to(device)

lowest_loss_model = {
    "model": None,
    "loss": 1000
}

model.train(
    dataset=dataset,
    num_epochs=1000,
    criterion=nn.MSELoss(),
    optimizer=optim.Adam(model.parameters(), lr=0.0001)
    )

### Predict

In [None]:
model.eval()
file_path="/content/drive/MyDrive/audio/batch/Anarchy-hige/ar.wav"

# Perform prediction
with torch.no_grad():
    predicted = model.predict(parse_for_local(file_path))
    print(predicted)
