In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import librosa
import librosa.display
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm

# Reproducibility: seed NumPy's and PyTorch's random number generators
np.random.seed(42)
torch.manual_seed(42)

# Turn off cuDNN benchmark mode and request deterministic cuDNN behavior
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# All tensors and the model live on the CPU (forced, for consistency)
device = torch.device('cpu')

# Define the conv layer and ResBlock
def conv(ni, nf, ks=3, stride=1, bias=False):
    """Create a 2-D convolution padded by half the kernel size.

    Args:
        ni: Number of input channels.
        nf: Number of output channels.
        ks: Kernel size.
        stride: Convolution stride.
        bias: Whether the layer gets a learnable bias term.

    Returns:
        An ``nn.Conv2d`` configured with ``padding=ks // 2``.
    """
    half_kernel = ks // 2
    return nn.Conv2d(
        in_channels=ni,
        out_channels=nf,
        kernel_size=ks,
        stride=stride,
        padding=half_kernel,
        bias=bias,
    )

def conv_layer(ni, nf, ks=3, stride=1, act=True):
    """Assemble a convolution, an optional in-place ReLU, and a batch norm.

    The modules are chained in that order: convolution first, then the
    ReLU (only when ``act`` is true), then ``nn.BatchNorm2d(nf)``.

    Args:
        ni: Number of input channels.
        nf: Number of output channels.
        ks: Kernel size forwarded to ``conv``.
        stride: Stride forwarded to ``conv``.
        act: Whether to include the ReLU between convolution and batch norm.

    Returns:
        An ``nn.Sequential`` holding the two or three modules.
    """
    activation = [nn.ReLU(inplace=True)] if act else []
    return nn.Sequential(
        conv(ni, nf, ks, stride),
        *activation,
        nn.BatchNorm2d(nf),
    )

class ResBlock(nn.Module):
    """Residual block: the input plus the output of two stacked conv layers.

    Both layers keep the channel count at ``nf``; the second one is built
    with ``act=False``.
    """

    def __init__(self, nf):
        """Create the two ``conv_layer`` stages for ``nf`` channels."""
        super().__init__()
        self.conv1 = conv_layer(nf, nf)
        self.conv2 = conv_layer(nf, nf, act=False)

    def forward(self, x):
        """Return ``x`` added to ``conv2(conv1(x))``."""
        branch = self.conv1(x)
        branch = self.conv2(branch)
        return x + branch

def conv_layer_averpl(ni, nf):
    """Build a ``conv_layer`` followed by 2x2 average pooling with stride 2.

    Args:
        ni: Number of input channels.
        nf: Number of output channels.

    Returns:
        An ``nn.Sequential`` of the conv layer and an ``nn.AvgPool2d``.
    """
    stages = [
        conv_layer(ni, nf),
        nn.AvgPool2d(kernel_size=2, stride=2),
    ]
    return nn.Sequential(*stages)

# Load the recommendation model
def load_recommendation_model(model_path='best_model.pt'):
    """Build the CNN, load its weights from a checkpoint, and set eval mode.

    The network is five (conv + average-pool, residual block) stages with
    channel widths 1 -> 64 -> 64 -> 128 -> 256 -> 512, followed by global
    average pooling, flattening, and a 512 -> 40 linear layer.

    Args:
        model_path: Path of the checkpoint file passed to ``torch.load``.
            Defaults to ``'best_model.pt'``, the previously hard-coded path.

    Returns:
        The ``nn.Sequential`` model on the module-level ``device``, in
        evaluation mode.
    """
    model = nn.Sequential(
        conv_layer_averpl(1, 64),
        ResBlock(64),
        conv_layer_averpl(64, 64),
        ResBlock(64),
        conv_layer_averpl(64, 128),
        ResBlock(128),
        conv_layer_averpl(128, 256),
        ResBlock(256),
        conv_layer_averpl(256, 512),
        ResBlock(512),
        nn.AdaptiveAvgPool2d((1, 1)),
        nn.Flatten(),
        nn.Linear(512, 40)
    )
    # NOTE(security): torch.load unpickles the file, so only load checkpoints
    # from a trusted source; consider weights_only=True where the installed
    # PyTorch version supports it.
    state_dict = torch.load(model_path, map_location=device)

    # strict=False tolerates key mismatches, but a mismatch means some layers
    # keep their random initialisation. Report it instead of hiding it.
    load_result = model.load_state_dict(state_dict, strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        print(
            f"Warning: checkpoint {model_path} does not fully match the model "
            f"(missing keys: {list(load_result.missing_keys)}, "
            f"unexpected keys: {list(load_result.unexpected_keys)})."
        )

    model = model.to(device)
    model.eval()
    return model

# Function to extract features from spectrogram slices
def extract_features(image_tensor, model):
    """Run ``model`` on ``image_tensor`` with gradient tracking disabled.

    The input is cast to ``float32``. A 2-D tensor gets two leading
    singleton axes, a 3-D tensor gets one; any other rank is passed
    through with its shape unchanged.

    Args:
        image_tensor: Tensor to feed to the model.
        model: Callable applied to the prepared tensor.

    Returns:
        Whatever ``model`` returns for the prepared tensor.
    """
    batch = image_tensor.to(torch.float32)
    rank = batch.dim()
    if rank == 2:
        batch = batch[None, None]
    elif rank == 3:
        batch = batch[None]

    with torch.no_grad():
        features = model(batch)
    return features

# Function to calculate the average vector for each track
def calculate_average_vector(file_path, model, spec_folder, slice_size=128, duration=30, sr=22050):
    """Compute the mean model output over spectrogram slices of one track.

    Steps: load up to ``duration`` seconds of audio, pick a segment based
    on the highest-RMS frame, render its mel spectrogram (dB) to a
    grayscale JPEG in ``spec_folder``, cut the image into
    ``slice_size``-wide crops, run each crop through ``model`` and average
    the results.

    Args:
        file_path: Path of the audio file to analyse.
        model: Network passed to ``extract_features`` for every slice.
        spec_folder: Directory for the rendered spectrogram image; created
            if it does not exist.
        slice_size: Width and height in pixels of each crop taken from the
            spectrogram image (crops are then resized to 128x128).
        duration: Seconds of audio to load; also the RMS window length.
        sr: Sample rate used for loading and analysis.

    Returns:
        A NumPy array holding the element-wise mean of the per-slice model
        outputs, or ``None`` when no slice fits in the image or any step
        raises (the error is printed).
    """
    # exist_ok avoids the race between a separate existence check and creation.
    os.makedirs(spec_folder, exist_ok=True)

    try:
        y, _ = librosa.load(file_path, sr=sr, duration=duration)
        window_size = sr * duration
        hop_length = sr
        # NOTE(review): librosa.feature.rms defaults to center=True, so frame i
        # is centered on sample i * hop_length rather than starting there, and
        # only `duration` seconds are loaded above. The slice below can
        # therefore be shorter than window_size. Left unchanged so the vectors
        # stay comparable with previously generated ones - confirm the intent.
        rms = librosa.feature.rms(y=y, frame_length=window_size, hop_length=hop_length)[0]
        max_rms_index = np.argmax(rms)
        start_sample = max_rms_index * hop_length
        y_segment = y[start_sample:start_sample + window_size]

        mel_spectrogram = librosa.feature.melspectrogram(y=y_segment, sr=sr, n_mels=128, fmax=8000)
        mel_db = librosa.power_to_db(mel_spectrogram)

        title = os.path.splitext(os.path.basename(file_path))[0]
        spec_path = os.path.join(spec_folder, f"{title}.jpg")
        figure = plt.figure(figsize=(mel_db.shape[1] / 100, mel_db.shape[0] / 100))
        try:
            plt.axis('off')
            librosa.display.specshow(mel_db, sr=sr, cmap='gray_r')
            plt.savefig(spec_path, dpi=100, bbox_inches='tight', pad_inches=0)
        finally:
            # Always release the figure, even when rendering or saving fails;
            # otherwise figures pile up across a long batch run.
            plt.close(figure)

        # convert() returns an independent in-memory image, so the file
        # handle can be closed as soon as the conversion is done.
        with Image.open(spec_path) as spec_file:
            img = spec_file.convert('L')
        width = img.size[0]
        num_slices = width // slice_size

        vectors = []
        for i in range(num_slices):
            start = i * slice_size
            img_crop = img.crop((start, 0, start + slice_size, slice_size))
            img_crop = img_crop.resize((128, 128))
            # Scale 8-bit pixel values to the [0, 1] range.
            img_array = np.array(img_crop).astype(np.float32) / 255.0
            image_tensor = torch.from_numpy(img_array).unsqueeze(0).unsqueeze(0).to(device)
            feature_vector = extract_features(image_tensor, model)
            vectors.append(feature_vector.cpu().numpy())

        if not vectors:
            print(f"No slices extracted for {file_path}. Check the spectrogram dimensions.")
            return None

        return np.mean(vectors, axis=0)

    except Exception as e:
        # Best-effort batch processing: report the failure and skip this track.
        print(f"Error processing {file_path}: {e}")
        return None

# Function to create a CSV file with all tracks' vectors
def create_vectors_csv(audio_folder, model, output_csv, spec_folder):
    """Write one row of averaged feature values per audio track to a CSV.

    Every ``.mp3``/``.wav`` file (extension matched case-insensitively) in
    ``audio_folder`` is passed to ``calculate_average_vector``; tracks for
    which it returns ``None`` are skipped. Each row has columns
    ``vector_0`` ... ``vector_{n-1}`` followed by ``track_name`` (the file
    name without extension).

    Args:
        audio_folder: Directory containing the audio files.
        model: Network forwarded to ``calculate_average_vector``.
        output_csv: Path of the CSV file to write.
        spec_folder: Directory forwarded to ``calculate_average_vector``.

    Returns:
        None. When no track yields a vector, a message is printed and no
        file is written.
    """
    vectors_data = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the CSV reproducible between runs and machines.
    for file in tqdm(sorted(os.listdir(audio_folder)), desc="Processing Audio Files"):
        if not file.lower().endswith((".mp3", ".wav")):
            continue
        file_path = os.path.join(audio_folder, file)
        title = os.path.splitext(file)[0]
        avg_vector = calculate_average_vector(file_path, model, spec_folder)
        if avg_vector is None:
            continue
        # Flatten once instead of once per element.
        flat_vector = avg_vector.flatten()
        vector_dict = {f'vector_{i}': value for i, value in enumerate(flat_vector)}
        vector_dict['track_name'] = title
        vectors_data.append(vector_dict)

    if not vectors_data:
        print("No vectors were processed. Ensure audio files are available and correctly formatted.")
        return

    df = pd.DataFrame(vectors_data)
    df.to_csv(output_csv, index=False)
    print(f"CSV file saved: {output_csv}")

# Main execution
# Locations read and written by this run
output_csv = 'db_vectors.csv'
spec_folder = 'spec_folder'  # spectrogram images are saved here
audio_folder = 'audio'

# Build the network and load its weights
model = load_recommendation_model()

# Write the per-track average vectors to the CSV file
create_vectors_csv(audio_folder, model, output_csv, spec_folder)


  state_dict = torch.load('best_model.pt', map_location=device)
Processing Audio Files: 100%|█████████████████| 101/101 [00:41<00:00,  2.44it/s]

CSV file saved: db_vectors.csv



