In [None]:
!pip install -r ../requirements.txt

In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import sys
from pathlib import Path

# Make the project's `src` package importable without a machine-specific path.
# The notebook addresses project files as "../data", "../artifacts", ..., so the
# project root is taken to be the parent of the working directory
# (assumes the kernel was started in the notebook's own folder — TODO confirm).
PROJECT_ROOT = Path.cwd().resolve().parent
if str(PROJECT_ROOT) not in sys.path:  # keeps the cell idempotent when re-run
    sys.path.append(str(PROJECT_ROOT))

In [5]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split


from src.components import data_ingestion
from src.components import data_transformation
from src.components import data_loading
from src import utils

INFO: [rank: 0] Seed set to 7


In [None]:
data = np.load("../data/raw_data/npz_files_10377.npz", allow_pickle=True)

In [None]:
dataset = data_ingestion.DataIngestion().initiate_data_ingestion()

In [None]:
_, transformed_data = data_transformation.DataTransformation().initiate_data_transformation()

In [None]:
# torch is first needed here; no earlier cell imports it, so import it in this
# cell to keep "Restart Kernel -> Run All" working.
import torch

# Last three columns of the transformed data, cast to float, as a tensor.
torch.tensor(transformed_data[:, -3:].astype(float))

In [None]:
dataset["genre"]

In [None]:
z = np.column_stack((transformed_data, dataset["genre"].values))

In [None]:
z[:,14]

In [None]:
import numpy as np
import torch

# Example numpy array with dtype=object and some 2D arrays
# data = np.array([
#     [1, 2, 3, np.array([[1, 2], [3, 4]])],
#     [4, 5, 6, np.array([[5, 6], [7, 8]])],
#     [7, 8, 9, np.array([[9, 10], [11, 12]])]
# ], dtype=object)

# Steps 1 + 2: build one flat numeric row per sample.
# Every ndarray cell is unrolled element by element; any other cell is kept as a
# single value. The nested rows are then packed into a float32 matrix.
flattened_data = np.array(
    [
        [
            element
            for value in row
            for element in (value.flatten() if isinstance(value, np.ndarray) else (value,))
        ]
        for row in transformed_data
    ],
    dtype=np.float32,
)

# Step 3: hand the matrix over to PyTorch.
tensor_data = torch.tensor(flattened_data)

print(tensor_data)


In [None]:
a = np.random.randn(8, 13, 937)  # Example shape: (8 samples, 13, 937)
b = np.random.randn(8, 13, 937)
c = np.random.randn(8, 13, 937)

In [None]:
np.array(["hi"]).reshape(-1, 1)

In [None]:
row = np.array([c, a, b])
rows = np.array([row, row])

In [None]:
rows[:,2]

In [None]:
features = {
    'mel_spectrogram': np.random.randn(10, 128, 937),
    'mfccs': np.random.randn(10, 13, 937),
    'chroma': np.random.randn(10, 12, 937),
    'spectral_contrast': np.random.randn(10, 7, 937),
    'zcr': np.random.randn(10, 1, 937),
    'spectral_centroid': np.random.randn(10, 1, 937),
    'spectral_bandwidth': np.random.randn(10, 1, 937),
    'rms_energy': np.random.randn(10, 1, 937),
    'tonnetz': np.random.randn(10, 6, 937),
}

# Bit rate and duration as scalar values
bit_rate = np.random.rand(10)
duration = np.random.rand(10)


In [None]:


# Sample categorical data (10 entries each)
genres = ['pop', 'rock', 'jazz', 'classical'] * 2 + ['pop', 'classical']  # 10 samples
song_success = ['hit', 'flop', "can't say"] * 3 + ['hit']  # 10 samples

# One-hot encode genres.
# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4. Keeping the encoder's default (sparse) output and densifying the
# result with `.toarray()` works on every version.
# NOTE: a later `genre_encoder.transform(...)` therefore returns a sparse matrix too.
genre_encoder = OneHotEncoder()
genres_encoded = genre_encoder.fit_transform(
    np.array(genres).reshape(-1, 1)
).toarray()  # Shape: (10, 4) — one column per distinct genre

# One-hot encode song success (same approach as for genres)
success_encoder = OneHotEncoder()
success_encoded = success_encoder.fit_transform(
    np.array(song_success).reshape(-1, 1)
).toarray()  # Shape: (10, 3)


In [None]:
scaler_2d = StandardScaler()
scaler_scalar = StandardScaler()


def _scale_over_last_axis(scaler, values):
    """Fit-transform `values` viewed as rows of length `values.shape[-1]`, then restore its shape."""
    as_rows = values.reshape(-1, values.shape[-1])  # Shape: (N*D, T)
    return scaler.fit_transform(as_rows).reshape(values.shape)


# Standardize every time-series feature.
# The single `scaler_2d` is refitted for each feature, so after this cell it only
# holds the statistics of the last feature in `features`.
features_scaled = {
    name: _scale_over_last_axis(scaler_2d, values)
    for name, values in features.items()
}

# Standardize the scalar features; `scaler_scalar` is likewise refitted by each call
# (bit_rate first, then duration).
bit_rate_scaled = scaler_scalar.fit_transform(bit_rate.reshape(-1, 1)).flatten()
duration_scaled = scaler_scalar.fit_transform(duration.reshape(-1, 1)).flatten()

In [None]:
from torch.utils.data import DataLoader, Dataset

class MusicDataset(Dataset):
    """Map-style dataset that serves each sample as a dict of float32 tensors.

    Args:
        scaled_features: mapping from feature name to an array indexed by sample;
            must provide every key listed in ``_FEATURE_KEYS``.
        genre_one_hot: array with one genre row per sample.
        bit_rate_scaled: array with one bit-rate value per sample.
        duration_scaled: array with one duration value per sample.
        targets_one_hot: array with one target row per sample; its length is the
            dataset length.
    """

    # Feature names read from `scaled_features`, in the order they appear in a sample.
    _FEATURE_KEYS = (
        'mel_spectrogram',
        'mfccs',
        'chroma',
        'spectral_contrast',
        'zcr',
        'spectral_centroid',
        'spectral_bandwidth',
        'rms_energy',
        'tonnetz',
    )

    def __init__(self, scaled_features, genre_one_hot, bit_rate_scaled, duration_scaled, targets_one_hot):
        self.features = scaled_features
        # Side inputs and labels are cast to float32 once, up front.
        self.genre_one_hot, self.bit_rate, self.duration, self.targets = (
            array.astype(np.float32)
            for array in (genre_one_hot, bit_rate_scaled, duration_scaled, targets_one_hot)
        )

    def __len__(self):
        """Number of samples, taken from the target array."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict: all features, then bit_rate, duration, genre, target."""

        def as_float_tensor(value):
            return torch.tensor(value, dtype=torch.float32)

        sample = {key: as_float_tensor(self.features[key][idx]) for key in self._FEATURE_KEYS}
        sample['bit_rate'] = as_float_tensor(self.bit_rate[idx])
        sample['duration'] = as_float_tensor(self.duration[idx])
        sample['genre'] = as_float_tensor(self.genre_one_hot[idx])
        sample['target'] = as_float_tensor(self.targets[idx])
        return sample


In [None]:
# Create dataset
dataset = MusicDataset(features_scaled, genres_encoded, bit_rate_scaled, duration_scaled, success_encoded)

# DataLoader with batch size.
# num_workers=0 loads batches in the main process: MusicDataset is defined inside the
# notebook, and worker processes started with the "spawn" method (the default on
# macOS and Windows) cannot import a class that only exists in the notebook's
# __main__ module. For this small in-memory example, workers bring no benefit anyway.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0)


In [None]:
for b in dataloader:
    print(b)
    break

In [None]:
# `b` is the batch yielded by the DataLoader: a dict of tensors, which has no
# `.shape` of its own — show the shape of every entry instead.
{name: tensor.shape for name, tensor in b.items()}

In [None]:
# The two inner lists have different lengths, so NumPy cannot build a float32
# matrix from them (it raises ValueError). An object array holding one list per
# row is the valid representation of ragged data.
row = np.array([[0], [2 ,3 ]], dtype=object)

In [None]:
rows = np.array([row, row])

In [None]:
rows

In [None]:
torch.tensor(transformed_data[:, 1])

In [None]:
first = torch.tensor(transformed_data[:, 1].astype(float))

In [None]:
second = torch.tensor(np.stack(transformed_data[:, 2]))

In [None]:
scalar_tensor_expanded = first.view(-1, 1, 1)  # Make each scalar a 1x1 matrix
first = scalar_tensor_expanded.expand(-1, *second.shape[1:])

In [None]:
second.shape

In [None]:
result = torch.cat((first, second), dim=1)

In [None]:
result.shape

In [None]:
result[0]

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Sample DataLoader for Music Dataset (Assumes you already have the data loaded)
class MusicDataset(Dataset):
    """Dataset yielding one song per index as a dict of tensors.

    Args:
        features: mapping from feature name to an array indexed by sample.
        targets: per-sample target rows (returned as float32).
        genres: per-sample genre indices (returned as int64).
        bit_rate: per-sample scalar (returned as a float32 tensor of shape (1,)).
        duration: per-sample scalar (returned as a float32 tensor of shape (1,)).
    """

    def __init__(self, features, targets, genres, bit_rate, duration):
        self.features, self.targets, self.genres = features, targets, genres
        self.bit_rate, self.duration = bit_rate, duration

    def __len__(self):
        """Number of samples, taken from the mel-spectrogram array."""
        return len(self.features['mel_spectrogram'])

    def __getitem__(self, idx):
        """Assemble sample `idx`: time-series features, scalars, genre index, target."""
        time_series_keys = (
            'mel_spectrogram',
            'mfccs',
            'chroma',
            'spectral_contrast',
            'zcr',
            'spectral_centroid',
            'spectral_bandwidth',
            'rms_energy',
            'tonnetz',
        )
        sample = {
            key: torch.tensor(self.features[key][idx], dtype=torch.float32)
            for key in time_series_keys
        }

        # Scalars get a leading axis so that a collated batch has shape (batch, 1).
        for key, column in (('bit_rate', self.bit_rate), ('duration', self.duration)):
            sample[key] = torch.tensor(column[idx], dtype=torch.float32).unsqueeze(0)

        # Genre stays an integer index; the target is returned as float32.
        sample['genre'] = torch.tensor(self.genres[idx], dtype=torch.long)
        sample['target'] = torch.tensor(self.targets[idx], dtype=torch.float32)
        return sample

# Define the model for predicting song success
class MusicSuccessPredictor(nn.Module):
    """Classify a song into three success classes (hit, flop, can't say).

    The mel spectrogram goes through a small CNN, while bit rate, duration and a
    learned genre embedding go through a linear layer; both 16-dimensional
    summaries are concatenated and classified by two fully connected layers.

    Args:
        genre_size: number of distinct genre indices the embedding can look up.
        genre_embedding_dim: size of each genre embedding vector.
    """

    def __init__(self, genre_size=10, genre_embedding_dim=4):
        super().__init__()

        # Embedding for genres (index -> vector)
        self.genre_embedding = nn.Embedding(genre_size, genre_embedding_dim)

        # Convolutional layers for mel_spectrogram.
        # The final global average pool reduces each of the 16 channels to a single
        # value. Without it the flattened CNN output has 16 * (H/2) * (W/2) entries
        # and cannot be fed to `fc1`, which expects 16 CNN features. The pool has no
        # parameters, so the state_dict layout is unchanged.
        self.conv_mel = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),
            nn.AdaptiveAvgPool2d(1),
        )

        # Linear layer for scalar features: bit_rate + duration + genre embedding
        self.scalar_fc = nn.Linear(2 + genre_embedding_dim, 16)

        # Fully connected layers for output
        self.fc1 = nn.Linear(16 + 16, 64)  # 16 CNN features + 16 scalar features
        self.fc2 = nn.Linear(64, 3)  # 3 categories for output (hit, flop, can't say)
        self.softmax = nn.Softmax(dim=1)  # Normalizes over the class dimension

    def forward(self, mel_spectrogram, mfccs, chroma, spectral_contrast, zcr, spectral_centroid,
                spectral_bandwidth, rms_energy, tonnetz, bit_rate, duration, genre,
                return_logits=False):
        """Compute class scores for a batch.

        Only `mel_spectrogram`, `bit_rate`, `duration` and `genre` are used; the
        other feature arguments are accepted so a whole batch can be passed through,
        but they are currently ignored.

        Args:
            mel_spectrogram: float tensor without a channel axis, e.g.
                (batch, n_mels, frames); the channel axis is added here.
            bit_rate, duration: float tensors of shape (batch, 1).
            genre: integer tensor of shape (batch,) holding genre indices.
            return_logits: when True, return the raw `fc2` outputs instead of
                probabilities. Use this with `nn.CrossEntropyLoss`, which applies
                log-softmax itself and expects unnormalized scores.

        Returns:
            Tensor of shape (batch, 3): class probabilities by default, or logits
            when `return_logits` is True.
        """
        # Embedding for genre -> (batch, genre_embedding_dim)
        genre_embedding = self.genre_embedding(genre)

        # CNN branch: add the channel axis, convolve, pool to (batch, 16, 1, 1), flatten
        mel_features = self.conv_mel(mel_spectrogram.unsqueeze(1))
        mel_features = torch.flatten(mel_features, start_dim=1)

        # Scalar branch: bit rate, duration and genre embedding -> (batch, 16)
        scalar_features = torch.cat([bit_rate, duration, genre_embedding], dim=1)
        scalar_features = self.scalar_fc(scalar_features)

        # Fuse both branches and classify
        combined_features = torch.cat([mel_features, scalar_features], dim=1)
        x = torch.relu(self.fc1(combined_features))
        logits = self.fc2(x)

        if return_logits:
            return logits
        return self.softmax(logits)

# Data preparation
# Synthetic stand-in data: random features, scalars, genre indices and targets.
NUM_SAMPLES = 1000  # lower this for a quick smoke test
NUM_FRAMES = 937    # length of the time axis of every time-series feature

# A seeded generator makes the example reproducible across kernel restarts.
rng = np.random.default_rng(7)

# Rows (frequency bins / coefficients) per time-series feature
feature_rows = {
    'mel_spectrogram': 128,
    'mfccs': 13,
    'chroma': 12,
    'spectral_contrast': 7,
    'zcr': 1,
    'spectral_centroid': 1,
    'spectral_bandwidth': 1,
    'rms_energy': 1,
    'tonnetz': 6,
}

# float32 halves the memory of the default float64 (about 0.6 GB instead of 1.3 GB
# for 1000 samples); the dataset converts every item to float32 anyway.
features = {
    name: rng.random((NUM_SAMPLES, rows, NUM_FRAMES), dtype=np.float32)  # (num_samples, rows, frames)
    for name, rows in feature_rows.items()
}
bit_rate = rng.random(NUM_SAMPLES)  # Scalar feature
duration = rng.random(NUM_SAMPLES)  # Scalar feature
genres = rng.integers(0, 10, NUM_SAMPLES)  # Integer indices for genres (10 genres)
targets = rng.integers(0, 3, NUM_SAMPLES)  # Example target labels: 0=hit, 1=flop, 2=can't say

# One-hot encoding the target
targets_onehot = np.eye(3)[targets]

# Create Dataset and DataLoader
dataset = MusicDataset(features, targets_onehot, genres, bit_rate, duration)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# # Model initialization
# model = MusicSuccessPredictor()

# # Loss function and optimizer
# loss_fn = nn.CrossEntropyLoss()  # Cross-entropy loss for multi-class classification
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Training Loop
# num_epochs = 10
# for epoch in range(num_epochs):
#     for batch in dataloader:
#         mel_spectrogram = batch['mel_spectrogram']
#         mfccs = batch['mfccs']
#         chroma = batch['chroma']
#         spectral_contrast = batch['spectral_contrast']
#         zcr = batch['zcr']
#         spectral_centroid = batch['spectral_centroid']
#         spectral_bandwidth = batch['spectral_bandwidth']
#         rms_energy = batch['rms_energy']
#         tonnetz = batch['tonnetz']
#         bit_rate = batch['bit_rate']
#         duration = batch['duration']
#         genre = batch['genre']
#         target = batch['target']  # One-hot encoded target

#         # Zero gradients
#         optimizer.zero_grad()

#         # Forward pass
#         outputs = model(mel_spectrogram, mfccs, chroma, spectral_contrast, zcr,
#                         spectral_centroid, spectral_bandwidth, rms_energy, tonnetz,
#                         bit_rate, duration, genre)

#         # Loss calculation
#         loss = loss_fn(outputs, torch.max(target, 1)[1])  # Cross-entropy loss with one-hot target

#         # Backward pass and optimization
#         loss.backward()
#         optimizer.step()

#     print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# # Save the model after training
# torch.save(model.state_dict(), 'music_success_predictor.pth')


In [None]:
for d in dataloader:
    break

In [None]:
d["mel_spectrogram"].shape

In [None]:
result = utils.convert_dataset_into_tensor_dict(transformed_data)

In [None]:
# `torch.flo` does not exist (AttributeError); the intended dtype is float32.
torch.tensor(np.stack(transformed_data[:, 1]), dtype=torch.float32)

In [None]:
result['mel_spectrogram'].shape

In [None]:
result["success"].shape

In [None]:
targets = np.random.randint(0, 3, 1000)  # Example target labels: 0=hit, 1=flop, 2=can't say


In [None]:
targets.shape

In [None]:
targets_onehot = np.eye(3)[targets]

In [None]:
torch.tensor(targets_onehot).shape

In [None]:
a = data_loading.DataModule(2)

In [None]:
a.load_and_split_data()

In [None]:
t = a.train_dataloader()

In [None]:
for i in t:
    break

In [None]:
i['mel_spectrogram'].shape

In [None]:
from src.components import model_trainer

In [None]:
import torch

# Create a 2x3 tensor
input_tensor = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])

# Apply softmax along dim=0 (rows)
softmax_dim0 = torch.softmax(input_tensor, dim=0)
print("Softmax along dim=0:\n", softmax_dim0)

# Apply softmax along dim=1 (columns)
softmax_dim1 = torch.softmax(input_tensor, dim=1)
print("Softmax along dim=1:\n", softmax_dim1)


In [None]:
data_loader_obj = data_loading.DataModule(batch_size=2)
# data_ingestion.DataIngestion().initiate_data_ingestion()
train_loader = data_loader_obj.train_dataloader()
val_loader = data_loader_obj.val_dataloader()

In [None]:
for vb in val_loader:
    break

In [None]:
vb["bit_rate"]
# print(batch['bit_rate'].float(),   # Scalar (batch_size, 1)
#                 batch['duration'].float(),   # Scalar (batch_size, 1)
#                 batch['genre'].float())

In [None]:
vb["duration"]

In [None]:
vb["genre"]

In [None]:
# Project-relative path (same "../data" convention as the other cells) instead of an
# absolute path that only exists on one machine.
# NOTE: allow_pickle=True unpickles objects from the file — only load trusted files.
t_dataset = np.load("../data/transformed_data/transformed_dataset.npy", allow_pickle=True)


In [None]:
t_dataset[:, 0].shape

In [None]:
t_dataset[:, 2].shape

In [None]:
t_dash = utils.convert_dataset_into_tensor_dict(t_dataset)

In [None]:
t_dash["genre"].shape

In [None]:
t_dash["duration"].shape

In [None]:
t_dash

In [None]:
torch.tensor(t_dash["bit_rate"], dtype=torch.float32).unsqueeze(0)

In [None]:
t_dash["bit_rate"]

In [None]:
# Train the project's Lightning model (src.components.model_trainer) on the loaders
# provided by data_loading.DataModule.
# NOTE: `nn` is not imported in this cell; it relies on an earlier cell having run
# `import torch.nn as nn`.
import lightning as L
# Seed first, before the data module and the model are created.
L.seed_everything(7, workers=True)
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
from src.components import model_trainer

# Hyperparameters
batch_size = 2
criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
dropout_prob = 0.3
# Callbacks. EarlyStopping and ModelCheckpoint both watch the metric named
# 'val_loss_mean' — presumably logged by model_trainer.MusicSuccessPredictor; verify.
lr_logger = LearningRateMonitor()
# NOTE(review): patience (10) equals the number of epochs (10) — confirm that early
# stopping is able to trigger at all with these settings.
early_stopping = EarlyStopping('val_loss_mean', mode='min', patience=10)
# Checkpoints go to ../artifacts/MODELS: the 3 best by 'val_loss_mean' plus the last one.
model_checkpoint = ModelCheckpoint(dirpath="../artifacts/MODELS",save_last=True, save_top_k=3, monitor="val_loss_mean")
epochs = 10
# Data: the train/validation loaders are taken from the module and handed to the trainer directly.
data_loader_obj = data_loading.DataModule(batch_size=batch_size)
# data_ingestion.DataIngestion().initiate_data_ingestion()
train_loader = data_loader_obj.train_dataloader()
val_loader = data_loader_obj.val_dataloader()

# Model: receives the loss function, learning rate and dropout probability defined above.
lightning_model = model_trainer.MusicSuccessPredictor(loss_fn=criterion, learning_rate=learning_rate, dropout_prob=dropout_prob)

trainer = L.Trainer(max_epochs=epochs, callbacks=[lr_logger, early_stopping, model_checkpoint])

# Run training with validation on `val_loader`.
trainer.fit(lightning_model, train_loader, val_loader)

In [None]:
t_dash["mfccs"].shape

In [None]:
t_dash["mfccs"][0].shape

In [None]:
t_dash["mfccs"][0].unsqueeze(0).shape

In [None]:
data_ingestion.DataIngestion().initiate_data_ingestion()

In [None]:
a = ["ara", 1 , np.array([[1, 2], [2, 3]])]
b = [1 , np.array([[1, 2], [2, 3]])]

In [None]:
rows = [a, b]

In [None]:
d =np.stack(rows)

In [None]:
d.shape

In [None]:
import os

# Build one row per usable .npz file in the raw-data folder: four metadata-derived
# fields followed by the reshaped time-series features.
RAW_DATA_DIR = "../data/raw_data"
DATASET_COLUMNS = [
    "genre",
    "bit_rate",
    "duration",
    "success",
    "mel_spectrogram",
    "mfccs",
    "chroma",
    "spectral_contrast",
    "zcr",
    "spectral_centroid",
    "spectral_bandwidth",
    "rms_energy",
    "tonnetz",
    ]
TIME_SERIES_COLUMNS = DATASET_COLUMNS[4:]

records = []
# logging.info("Data Ingestion Started.")
for filename in sorted(os.listdir(RAW_DATA_DIR)):  # sorted -> same row order on every run
    # Skip anything that is not an .npz archive; previously such files still
    # appended an empty row to the dataset.
    if not filename.endswith('.npz'):
        continue
    file_path = os.path.join(RAW_DATA_DIR, filename)
    # NOTE: allow_pickle=True unpickles objects from the file — only load trusted data.
    # The context manager closes the archive once all arrays have been read.
    with np.load(file_path, allow_pickle=True) as data:
        metadata = data["metadata"][0]  # read once instead of on every field access
        genre = utils.preprocess_genres(metadata["genres"])
        if not genre:
            continue  # files without a usable genre are dropped
        record = {
            "genre": genre,
            "bit_rate": metadata["bit_rate"],
            "duration": metadata["duration"],
            "success": utils.categorize_listens(metadata["listens"]),
        }
        reshaped = utils.reshape_all_time_series_data(data)
        record.update({name: reshaped[name] for name in TIME_SERIES_COLUMNS})
    records.append(record)
print(len(records))

# Rows mix strings, scalars and 2-D arrays, which np.stack cannot combine; a DataFrame
# built from the records keeps every cell as-is, and `dataset` is its object-array view.
dataset_df = pd.DataFrame(records, columns=DATASET_COLUMNS)
dataset = dataset_df.to_numpy()
print(dataset.shape)

In [8]:
dm = data_loading.DataModule(4)

INFO: [rank: 0] Seed set to 7


In [9]:
train_loader = dm.train_dataloader()

In [None]:
for i in train_loader:
    break

In [11]:
i

{'mel_spectrogram': tensor([[[[-2.0870e-01, -1.6125e-01, -1.3827e-01,  ..., -1.2600e-01,
            -1.2688e-01, -1.3345e-01],
           [-2.0928e-01, -1.6126e-01, -1.3803e-01,  ..., -1.2592e-01,
            -1.2682e-01, -1.3044e-01],
           [-1.9611e-01, -1.5209e-01, -1.3213e-01,  ..., -8.8608e-02,
            -9.5087e-02, -9.4204e-02],
           ...,
           [-2.1004e-01, -1.6152e-01, -1.3824e-01,  ..., -1.2609e-01,
            -1.2685e-01, -1.4150e-01],
           [-2.1018e-01, -1.6164e-01, -1.3834e-01,  ..., -1.2611e-01,
            -1.2695e-01, -1.4163e-01],
           [-2.1018e-01, -1.6164e-01, -1.3834e-01,  ..., -1.2611e-01,
            -1.2696e-01, -1.4163e-01]]],
 
 
         [[[-4.1728e-01, -3.0843e-01, -2.2506e-01,  ..., -1.9419e-01,
            -1.9818e-01, -1.7043e-01],
           [-3.8290e-01, -2.9721e-01, -2.2075e-01,  ...,  2.6881e+00,
             2.6645e+00,  2.7233e+00],
           [-2.9914e-01, -2.8645e-01, -2.2247e-01,  ...,  1.0621e+01,
             1.04

In [None]:
for j in train_loader:
    break

In [13]:
j

{'mel_spectrogram': tensor([[[[ 5.4300e-01, -1.3056e-02, -3.4545e-01,  ...,  4.5275e+00,
             4.8355e+00,  4.0502e+00],
           [ 8.1361e-01,  1.5389e+00,  2.4137e+00,  ...,  9.5538e+00,
             9.0293e+00,  9.5469e+00],
           [ 6.9433e-03,  4.5246e-01,  1.0315e+00,  ...,  1.3538e+00,
             2.2285e+00,  2.3764e+00],
           ...,
           [-4.9072e-01, -4.7271e-01, -3.9973e-01,  ..., -2.1744e-01,
            -2.3119e-01, -2.1707e-01],
           [-4.9214e-01, -4.7399e-01, -4.0036e-01,  ..., -2.1744e-01,
            -2.3119e-01, -2.1707e-01],
           [-4.9314e-01, -4.7473e-01, -4.0091e-01,  ..., -2.1744e-01,
            -2.3119e-01, -2.1707e-01]]],
 
 
         [[[-7.2179e-02, -1.7516e-01, -1.4741e-01,  ..., -1.0940e-01,
            -1.0917e-01, -1.0906e-01],
           [-1.0820e-01, -1.7101e-01, -1.4117e-01,  ..., -1.0962e-01,
            -1.0975e-01, -1.0990e-01],
           [-1.4134e-01, -2.0539e-01, -1.4204e-01,  ..., -1.0986e-01,
            -1.09