In [1]:
import os
import librosa
import pandas as pd

def extract_features(file_path):
    y, sr = librosa.load(file_path)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr).mean()
    rmse = librosa.feature.rms(y=y).mean()
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr).mean()
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr).mean()
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr).mean()
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y).mean()
    mfcc = librosa.feature.mfcc(y=y, sr=sr).mean(axis=1)
    features = {'tempo': tempo, 'chroma_stft': chroma_stft, 'rmse': rmse, 'spectral_centroid': spectral_centroid,
                'spectral_bandwidth': spectral_bandwidth, 'rolloff': rolloff, 'zero_crossing_rate': zero_crossing_rate}
    for i, coef in enumerate(mfcc):
        features[f'mfcc_{i+1}'] = coef
    return features

def process_directory(directory):
    features_list = []
    for file_name in os.listdir(directory):
        if file_name.endswith('.mp3'):
            file_path = os.path.join(directory, file_name)
            features = extract_features(file_path)
            features['song'] = file_name  # Keep track of the song
            features_list.append(features)
    return features_list

root_dir = r'D:\MMC-Project\song-recommender\backend\data'
all_features = []
for i in range(156):  # 155 directories, assuming zero-indexed
    dir_name = f"{i:03d}"  # Formats the directory number with leading zeros
    full_dir_path = os.path.join(root_dir, dir_name)
    if os.path.exists(full_dir_path):
        print(f"Processing directory: {full_dir_path}")
        directory_features = process_directory(full_dir_path)
        all_features.extend(directory_features)

df = pd.DataFrame(all_features)
df.to_csv('song_features.csv', index=False)  # Save the features to a CSV file


Processing directory: D:\MMC-Project\song-recommender\backend\data\000


In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the data
df = pd.read_csv('song_features.csv')

# Separate numeric and non-numeric data
numeric_cols = df.select_dtypes(include=[np.number]).columns
non_numeric_cols = df.columns.difference(numeric_cols)

# Check for missing values in numeric columns and fill them with the column mean
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Normalize features in numeric columns only
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Now df is properly preprocessed, with the 'song' column unchanged

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

# Split the data
X = df.drop('song', axis=1)
y = df['song']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Train the model
model = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
model.fit(X_train)


In [23]:
# Find nearest neighbors for a test set
distances, indices = model.kneighbors(X_test)

In [24]:
def recommend(song_index, n_recommendations=5):
    #given song
    print(f"Recommendations for {y.iloc[song_index]}")
    distances, indices = model.kneighbors(X.loc[song_index:song_index])
    return y.iloc[indices[0]]

# Recommend songs similar to the first song in the test set
recommendations = recommend(0)
print(recommendations)

Recommendations for 000002.mp3
19    000255.mp3
34    000667.mp3
27    000574.mp3
15    000210.mp3
30    000620.mp3
Name: song, dtype: object
