# Big Data FundaMENTALS Project 2024
# By 
## Immad Shahid (Chief) 21I-1664
## Haroon Salim (2nd to Chief) 21I-1663
## Ahmed Luqman 22I-2018


### Importing Libraries

In [3]:
import os
import librosa
import numpy as np
import pandas as pd
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error



### Loading our audio files, extracting features with the Librosa library, and saving them to a .csv file

In [3]:
# Root directory of the fma_small audio collection (one sub-folder per group
# of .mp3 files). The original machine-specific location stays the default;
# set the FMA_SMALL_PATH environment variable to run the notebook against a
# copy stored elsewhere without editing this cell.
fma_small_path = os.environ.get("FMA_SMALL_PATH", r"C:\Users\Dell Pc\Downloads\fma_small")


# Turn one audio file into a fixed-size, one-dimensional feature vector
def extract_features(audio_path, max_length=1000):
    """Return a flat feature vector for the audio file at ``audio_path``.

    The vector is built from three librosa features (MFCCs, spectral centroid,
    zero-crossing rate). Each one is padded or truncated to ``max_length``
    entries along its last axis, the three are stacked row-wise, and the stack
    is flattened so that every successfully read file yields a vector of the
    same length.

    Args:
        audio_path: Path of the audio file to read.
        max_length: Number of entries kept per feature row after padding or
            truncation.

    Returns:
        A 1-D NumPy array, or ``None`` when the file could not be loaded (the
        error is printed rather than raised).
    """
    print(f"Reading audio file: {audio_path}")

    # Any failure while decoding the file is reported and signalled with None
    try:
        signal, sample_rate = librosa.load(audio_path)
    except Exception as err:
        print(f"Error reading {audio_path}: {err}")
        return None

    # Compute each feature and immediately bring it to the fixed length
    mfcc_fixed = librosa.util.fix_length(
        librosa.feature.mfcc(y=signal, sr=sample_rate), size=max_length, axis=1
    )
    centroid_fixed = librosa.util.fix_length(
        librosa.feature.spectral_centroid(y=signal, sr=sample_rate), size=max_length
    )
    zcr_fixed = librosa.util.fix_length(
        librosa.feature.zero_crossing_rate(signal), size=max_length
    )

    # Stack the feature rows and collapse them into a single vector
    stacked = np.concatenate((mfcc_fixed, centroid_fixed, zcr_fixed), axis=0)
    return stacked.flatten()

# Features and track IDs of the batch currently being collected
features_list = []
track_ids = []

# Number of audio files whose features were extracted successfully
file_count = 0

# All batches written so far, concatenated into one DataFrame
all_features_df = None

# Number of successfully processed files between two CSV checkpoints
FLUSH_EVERY = 50


def _flush_batch(all_df, batch_features, batch_ids):
    """Append one batch to ``all_df`` and rewrite 'audio_features_partial.csv'.

    Args:
        all_df: DataFrame holding every previously flushed batch, or ``None``
            when nothing has been flushed yet.
        batch_features: Non-empty list of 1-D feature arrays of equal length.
        batch_ids: Track IDs, one per entry of ``batch_features``.

    Returns:
        The combined DataFrame (previous rows followed by the new batch).
    """
    batch_df = pd.DataFrame(
        data=batch_features,
        columns=[f"feature_{i}" for i in range(batch_features[0].shape[0])],
    )
    batch_df['track_id'] = batch_ids

    if all_df is None:
        combined = batch_df
    else:
        combined = pd.concat([all_df, batch_df], ignore_index=True)

    # The whole table is rewritten each time so the CSV is always complete
    combined.to_csv('audio_features_partial.csv', index=False)
    return combined


# Iterate through each folder in fma_small
for folder in os.listdir(fma_small_path):
    folder_path = os.path.join(fma_small_path, folder)
    if not os.path.isdir(folder_path):
        continue

    for filename in os.listdir(folder_path):
        if not filename.endswith(".mp3"):
            continue

        # Extract features from the audio file; None means it was unreadable
        audio_path = os.path.join(folder_path, filename)
        audio_features = extract_features(audio_path)
        if audio_features is None:
            continue

        # Store features and track ID
        features_list.append(audio_features)
        track_ids.append(filename.split('.')[0])  # Assuming the filename is in the format 'trackID.mp3'
        file_count += 1

        # Checkpoint the CSV after every FLUSH_EVERY processed files
        if file_count % FLUSH_EVERY == 0:
            all_features_df = _flush_batch(all_features_df, features_list, track_ids)
            features_list = []
            track_ids = []
            print(f"Partial audio features saved. Total processed: {file_count}")

# The number of processed files is generally not a multiple of FLUSH_EVERY, so
# whatever is still pending after the last folder must be written explicitly;
# otherwise the final (shorter) batch would never reach the CSV file.
if features_list:
    all_features_df = _flush_batch(all_features_df, features_list, track_ids)
    features_list = []
    track_ids = []
    print(f"Partial audio features saved. Total processed: {file_count}")

print("Audio features extracted and saved to a partial CSV file.")


Reading audio file: C:\Users\Dell Pc\Downloads\fma_small\000\000002.mp3
Reading audio file: C:\Users\Dell Pc\Downloads\fma_small\000\000005.mp3
Reading audio file: C:\Users\Dell Pc\Downloads\fma_small\000\000010.mp3
Reading audio file: C:\Users\Dell Pc\Downloads\fma_small\000\000140.mp3
Reading audio file: C:\Users\Dell Pc\Downloads\fma_small\000\000141.mp3
Reading audio file: C:\Users\Dell Pc\Downloads\fma_small\000\000148.mp3
Reading audio file: C:\Users\Dell Pc\Downloads\fma_small\000\000182.mp3
Reading audio file: C:\Users\Dell Pc\Downloads\fma_small\000\000190.mp3
Reading audio file: C:\Users\Dell Pc\Downloads\fma_small\000\000193.mp3
Reading audio file: C:\Users\Dell Pc\Downloads\fma_small\000\000194.mp3
Reading audio file: C:\Users\Dell Pc\Downloads\fma_small\000\000197.mp3
Reading audio file: C:\Users\Dell Pc\Downloads\fma_small\000\000200.mp3
Reading audio file: C:\Users\Dell Pc\Downloads\fma_small\000\000203.mp3
Reading audio file: C:\Users\Dell Pc\Downloads\fma_small\000\000

### Establishing a MongoDB Collection using pymongo

In [1]:
import pymongo

# Open a client against the MongoDB server addressed by the URI below
client = pymongo.MongoClient("mongodb://localhost:27017/")

# Select the project database by name
db = client.get_database("bda_db")

# Select the collection inside that database
collection = db.get_collection("bda_collection")

# `client`, `db` and `collection` are the handles used by the following cells



### Inserting Data into MongoDB Collection

In [3]:
import pymongo
import csv


# Number of CSV rows sent to MongoDB per bulk write. Kept small because each
# row carries one field per feature column.
INSERT_BATCH_SIZE = 25

# Read CSV file. newline='' is what the csv module documents for files passed
# to its readers; utf-8 matches the default encoding pandas uses for to_csv.
with open('audio_features_partial.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    # Each row holds the data of one audio file (all values are strings, as
    # produced by DictReader). Rows are sent in small batches so that there is
    # one round-trip per batch instead of one per row.
    pending_rows = []
    for row in reader:
        pending_rows.append(row)
        if len(pending_rows) >= INSERT_BATCH_SIZE:
            collection.insert_many(pending_rows)
            pending_rows = []

    # insert_many rejects an empty list, so only send a non-empty remainder
    if pending_rows:
        collection.insert_many(pending_rows)

print("Data inserted successfully.")

Data inserted successfully.


### Loading Data from MongoDB for Model Training

In [15]:
# Connect again and resolve the same database and collection by name
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client.get_database("bda_db")
collection = db.get_collection("bda_collection")

In [16]:
# Materialise every document of the collection (an empty filter matches all)
data = list(collection.find(filter={}))


### Deep Learning Model using PyTorch, torch and Tensors

### Extracting the Data and Splitting It into Features and Targets

In [52]:
# Step 2: Extract Data and Split into Features and Labels
import warnings

# Name of the document field that holds the regression target.
# NOTE(review): "target_field" is a placeholder. The documents inserted from
# the feature CSV only carry feature_* columns and track_id, so unless a real
# target column is added every target below falls back to 0.
TARGET_FIELD = "target_field"

features = []
targets = []
missing_target_count = 0

# Iterate through each document
for entry in data:
    # Keep the "feature_<i>" fields in document order, converted to float
    # (values read back from the CSV import are strings)
    feature_values = [
        float(value) for key, value in entry.items() if key.startswith("feature_")
    ]

    # Count the fallbacks instead of hiding them behind the default
    if TARGET_FIELD not in entry:
        missing_target_count += 1
    target_value = float(entry.get(TARGET_FIELD, 0))

    features.append(feature_values)
    targets.append(target_value)

# A constant target makes the regression meaningless, so say so loudly
if missing_target_count:
    warnings.warn(
        f"{missing_target_count} of {len(data)} documents have no "
        f"'{TARGET_FIELD}' field; their target defaulted to 0. "
        "Set TARGET_FIELD to the real label column before training."
    )

# Convert features and targets to PyTorch tensors (float32, matching the
# dtype used for the train/test tensors later in the notebook)
X_train_tensor = torch.tensor(features, dtype=torch.float32)
y_train_tensor = torch.tensor(targets, dtype=torch.float32)


Defining the Model to train

In [66]:
# Convert the training split to float32 tensors. The feature tensor must be
# bound to `X_train_tensor`: that is the name the model instantiation below
# reads its input size from.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)

# Define your model
class MyModel(nn.Module):
    """Feed-forward regressor: Linear(input_size, 64) -> ReLU -> Linear(64, 1).

    Args:
        input_size: Number of input features per sample.
    """

    def __init__(self, input_size):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=1)

    def forward(self, x):
        """Map a (batch, input_size) tensor to a (batch, 1) prediction."""
        # Without a non-linearity between them the two Linear layers would
        # collapse into a single linear map; ReLU keeps the hidden layer
        # useful and matches the later definition of this class.
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Build the network with one input unit per column of the training tensor
model = MyModel(input_size=X_train_tensor.size(1))

Training the Model

In [81]:
def train(model, criterion, optimizer, train_loader):
    """Run one training epoch and return the mean per-sample loss.

    Args:
        model: Module called on each input batch.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        train_loader: Iterable of ``(inputs, labels)`` batches exposing a
            ``dataset`` attribute with a length.

    Returns:
        Sum of the batch losses, each weighted by its batch size, divided by
        ``len(train_loader.dataset)``.
    """
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        # Labels of shape (N,) against outputs of shape (N, 1) would be
        # broadcast to (N, N) inside an element-wise loss. Align the shapes
        # whenever both hold the same number of elements; anything else
        # (e.g. class indices vs. logits) is passed through untouched.
        if labels.shape != outputs.shape and labels.numel() == outputs.numel():
            labels = labels.reshape(outputs.shape)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # loss is a batch mean; weight it by the batch size before summing
        running_loss += loss.item() * inputs.size(0)
    return running_loss / len(train_loader.dataset)

def evaluate(model, criterion, data_loader):
    """Return the mean per-sample loss of ``model`` over ``data_loader``.

    Args:
        model: Module called on each input batch (switched to eval mode).
        criterion: Loss callable taking ``(outputs, labels)``.
        data_loader: Iterable of ``(inputs, labels)`` batches exposing a
            ``dataset`` attribute with a length.

    Returns:
        Sum of the batch losses, each weighted by its batch size, divided by
        ``len(data_loader.dataset)``. No gradients are computed.
    """
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for inputs, labels in data_loader:
            outputs = model(inputs)
            # Labels of shape (N,) against outputs of shape (N, 1) would be
            # broadcast to (N, N) inside an element-wise loss. Align the
            # shapes whenever both hold the same number of elements.
            if labels.shape != outputs.shape and labels.numel() == outputs.numel():
                labels = labels.reshape(outputs.shape)
            loss = criterion(outputs, labels)
            # loss is a batch mean; weight it by the batch size before summing
            running_loss += loss.item() * inputs.size(0)
    return running_loss / len(data_loader.dataset)


In [82]:
from torch.utils.data import DataLoader, TensorDataset

# Pair the training features with their targets and serve them in shuffled
# mini-batches of 64 samples
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

# Same wrapping for the test tensors; batches keep the dataset order
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

Training and Testing

In [87]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=42
)

# Convert the four splits to float32 tensors in one pass
X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor = (
    torch.tensor(split, dtype=torch.float32)
    for split in (X_train, y_train, X_test, y_test)
)

Defining the MyModel class

In [88]:
class MyModel(nn.Module):
    """Two-layer regressor: Linear(input_size, 64) -> ReLU -> Linear(64, 1)."""

    def __init__(self, input_size):
        super().__init__()
        # Hidden layer with 64 units, then a single-value output head
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        """Return one prediction per row of ``x``."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

In [89]:
# One input unit per feature column of the training tensor
model = MyModel(input_size=X_train_tensor.size(1))


In [90]:
# Mean-squared-error objective (mean reduction is the default, spelled out here)
criterion = nn.MSELoss(reduction='mean')
# Adam over all parameters of the model with a learning rate of 1e-3
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)


In [91]:
# Full-batch training: every epoch is one forward/backward pass over the
# complete training tensor.
epochs = 100

model.train()  # Set model to training mode (nothing in the loop changes it)
for epoch in range(epochs):
    optimizer.zero_grad()  # Zero the gradients

    # Forward pass
    outputs = model(X_train_tensor)
    # The model emits one column per sample while y_train_tensor is 1-D.
    # Passing both as-is makes the loss broadcast them against each other
    # (PyTorch warns about the size mismatch), so the targets are given the
    # shape of the outputs first. This is a no-op if they already match.
    loss = criterion(outputs, y_train_tensor.reshape(outputs.shape))

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    # Report the loss every tenth epoch
    if epoch % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item()}')


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/100], Loss: 1384.70458984375
Epoch [11/100], Loss: 30989.873046875
Epoch [21/100], Loss: 0.012051478959619999
Epoch [31/100], Loss: 0.011855214834213257
Epoch [41/100], Loss: 0.011778025887906551
Epoch [51/100], Loss: 0.011747525073587894
Epoch [61/100], Loss: 0.011735320091247559
Epoch [71/100], Loss: 0.01173012051731348
Epoch [81/100], Loss: 0.011727587319910526
Epoch [91/100], Loss: 0.011726006865501404


Performance Metrics 

In [95]:
from sklearn.metrics import r2_score

model.eval()  # Set model to evaluation mode
with torch.no_grad():
    y_pred = model(X_test_tensor)
    # y_pred has one column per sample while y_test_tensor is 1-D; give the
    # targets the shape of the predictions so the loss compares them
    # element-wise instead of broadcasting one against the other.
    test_loss = criterion(y_pred, y_test_tensor.reshape(y_pred.shape))

    # Convert once and reuse the arrays for all three scikit-learn metrics
    y_true_np = y_test_tensor.numpy()
    y_pred_np = y_pred.numpy()
    mse = mean_squared_error(y_true_np, y_pred_np)
    mae = mean_absolute_error(y_true_np, y_pred_np)
    r2 = r2_score(y_true_np, y_pred_np)

print(f'Test Loss: {test_loss.item()}')
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R^2 Score: {r2}')

Test Loss: 0.011724754236638546
Mean Squared Error: 0.011724752373993397
Mean Absolute Error: 0.10828089714050293
R^2 Score: 0.0


  return F.mse_loss(input, target, reduction=self.reduction)


Save the Torch Model in model.pth

In [2]:
# Persist only the learned parameters (the state dict), not the module object
torch.save(obj=model.state_dict(), f='music_recommendation_model.pth')


## This completes the whole pipeline: feature extraction with Librosa, storage in MongoDB, and model training with PyTorch