In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import model_utils as mu
from gensim.models import Word2Vec
import numpy as np
import torch
import torch.nn as nn
from tqdm.autonotebook import tqdm
import nn_model as nnm
from sklearn.preprocessing import LabelEncoder

NUM_SEQUENCES_PER_BATCH = 1024

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [2]:
# Load the pre-trained model
embedder = nnm.Embedder('all-MiniLM-L6-v2', device=device)

In [3]:
#load sentences
similar_song_lyrics= pd.read_csv("data/kat_data_similar_songs.csv") 
similar_song_lyrics.head()

Unnamed: 0,title,artist,song,similar_lyrics,similar_track,similar_artist
0,Cuéntale,Ivy Queen,Cuéntale by Ivy Queen,4 ContributorsDile LyricsQuien te va a querer ...,Dile,Ivy Queen
1,Cuéntale,Ivy Queen,Cuéntale by Ivy Queen,3 ContributorsCansada LyricsWelcome to the dra...,Cansada,Ivy Queen
2,Cuéntale,Ivy Queen,Cuéntale by Ivy Queen,11 ContributorsTranslationsEnglishDeutschElla ...,Ella Me Levantó,Daddy Yankee
3,Cuéntale,Ivy Queen,Cuéntale by Ivy Queen,"12 ContributorsDale Don Dale Lyrics[Letra de ""...",Dale Don Dale,Don Omar
4,Cuéntale,Ivy Queen,Cuéntale by Ivy Queen,"8 ContributorsSexy Robótica Lyrics[Letra de ""S...",Sexy Robotica,Don Omar


In [4]:
lyrics, song_title = similar_song_lyrics["similar_lyrics"].tolist(), similar_song_lyrics["song"].tolist()
processed_lyrics = []
for lyric in lyrics:
    processed_lyrics.append(nnm.preprocess_sentence(lyric))

In [5]:
#Create word embeddings 
embeddings = embedder(processed_lyrics, batch_size=NUM_SEQUENCES_PER_BATCH)
print(embeddings.shape)


torch.Size([7047, 384])


In [6]:
label_encoder = LabelEncoder()

# Fit and transform the labels to integers
y_encoded = label_encoder.fit_transform(song_title) # Label Encoder does transform values into unique

# Convert to a PyTorch tensor
y_tensor = torch.tensor(y_encoded, dtype=torch.int64).to(device)
print(y_tensor[:5])

tensor([141, 141, 141, 141, 141], device='cuda:0')


In [7]:
model, test_dataloader = nnm.full_pipeline(x=embeddings, y=y_tensor, vocab_size = len(set(y_encoded)), batch_size=NUM_SEQUENCES_PER_BATCH, embedding_size=embeddings.shape[1], hidden_units=1024
                                           , epochs=1000, device=device, early_stop_threshold=1e-3)
print(embeddings.shape)  # Should be [num_samples, embedding_dim]

  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch: 0, Loss: 6.4095

Epoch: 1, Loss: 6.3141

Epoch: 2, Loss: 6.1275

Epoch: 3, Loss: 5.9222

Epoch: 4, Loss: 5.7659

Epoch: 5, Loss: 5.6639

Epoch: 6, Loss: 5.5988

Epoch: 7, Loss: 5.5355

Epoch: 8, Loss: 5.4688

Epoch: 9, Loss: 5.3989

Epoch: 10, Loss: 5.3165

Epoch: 11, Loss: 5.2383

Epoch: 12, Loss: 5.1509

Epoch: 13, Loss: 5.0598

Epoch: 14, Loss: 4.9794

Epoch: 15, Loss: 4.8949

Epoch: 16, Loss: 4.8183

Epoch: 17, Loss: 4.7402

Epoch: 18, Loss: 4.6596

Epoch: 19, Loss: 4.5897

Epoch: 20, Loss: 4.5210

Epoch: 21, Loss: 4.4450

Epoch: 22, Loss: 4.3697

Epoch: 23, Loss: 4.2982

Epoch: 24, Loss: 4.2263

Epoch: 25, Loss: 4.1641

Epoch: 26, Loss: 4.0913

Epoch: 27, Loss: 4.0244

Epoch: 28, Loss: 3.9505

Epoch: 29, Loss: 3.8908

Epoch: 30, Loss: 3.8143

Epoch: 31, Loss: 3.7475

Epoch: 32, Loss: 3.6835

Epoch: 33, Loss: 3.6107

Epoch: 34, Loss: 3.5531

Epoch: 35, Loss: 3.4932

Epoch: 36, Loss: 3.4243

Epoch: 37, Loss: 3.3547

Epoch: 38, Loss: 3.2976

Epoch: 39, Loss: 3.2365

Epoch: 40,

In [8]:
# Evaluate the model on the test set
accuracy = nnm.evaluate_model(model, test_dataloader)
# Save the trained model
torch.save(model.state_dict(), "model/trained_ffnn_model.pth")
print("Model saved as 'trained_ffnn_model.pth'")
# Load the model for future use
print(accuracy)

Accuracy on test set: 19.01% (268/1410)
Model saved as 'trained_ffnn_model.pth'
0.1900709219858156


In [9]:
import lyricsgenius
song_title = "Yonaguni"
artist_name = "Bad Bunny"

genius = lyricsgenius.Genius("Z_wiD32yFUiTd_bFET7Xo4UkbdCvZZJm7ViomZFISpf4wpr_4nvblXiuMaemGED8")
queried_song = genius.search_song(song_title, artist_name)

if queried_song:
    lyrics = queried_song.lyrics
    processed_lyrics = nnm.preprocess_sentence(lyric)
    encoded_lyrics = embedder(processed_lyrics, batch_size=NUM_SEQUENCES_PER_BATCH)
    encoded_lyrics.unsqueeze_(0) # Add a batch dimension

    model.eval()

    with torch.no_grad(): # Speeds up inference and reduces memory usage by not having to calcualte gradients
        logits = model(encoded_lyrics) # Forward pass on the model
        probability = nn.functional.softmax(logits, dim=1) # Normalize z scores to probability
        predicted_idx = torch.multinomial(probability, num_samples=10).cpu()[0].squeeze(0) # Get the top 10 predictions
        predicted_songs = label_encoder.inverse_transform(predicted_idx)
        print(probability.cpu()[0][predicted_idx])
        print(predicted_songs)

else:
    print(f"Song '{song_title}' by '{artist_name}' not found.")


Searching for "Yonaguni" by Bad Bunny...


Done.
tensor([1.6920e-01, 1.9079e-01, 1.9062e-01, 2.3054e-01, 2.1122e-01, 2.9269e-06,
        1.3557e-03, 4.4522e-04, 1.3136e-04, 7.1207e-04])
['Toxic by Britney Spears' 'Loba by Shakira'
 "Don't Stop The Music by Rihanna" "Just Dance by Lady Gaga,Colby O'Donis"
 'Die Young by Kesha' 'MONACO by Bad Bunny' 'SOS by Rihanna'
 'Cannibal by Kesha'
 "Ain't No Rest For The Wicked - Original Version by Cage The Elephant"
 'Womanizer by Britney Spears']


In [10]:
#Labels that are not similar to: FireBurning by Sean Kingston
# 252 -  'Honey' by Kehlani
# 11 - 505 by Arctic Monkeys
# 13 - 8TEEN by Khalid
# 21 - American Requiem by Beyonce

#Labels somewhat similar to : Salio el Sol by Don Omar
# 336 - Low  by Flo Rida
# 431 -Promiscuous
# 294 -Just Dance

In [None]:
potential_indices = [252, 11, 13, 21, 336, 431, 294]
# Get the corresponding labels
print(probability.shape)
for index in potential_indices:
    value = probability[0, index]
    label = label_encoder.inverse_transform([index])[0]
    print(f"Probability for '{label}' is {value} ")