In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import model_utils as mu
from gensim.models import Word2Vec
import numpy as np
import torch
import torch.nn as nn
from tqdm.autonotebook import tqdm
import nn_model as nnm
from sklearn.preprocessing import LabelEncoder

NUM_SEQUENCES_PER_BATCH = 1024

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [2]:
# Load the pre-trained model
embedder = nnm.Embedder('all-MiniLM-L6-v2', device=device)

In [3]:
#load sentences
similar_song_lyrics= pd.read_csv("data/kat_data_similar_songs.csv") 
similar_song_lyrics.head()

Unnamed: 0,title,artist,song,similar_lyrics,similar_track,similar_artist
0,Cuéntale,Ivy Queen,Cuéntale by Ivy Queen,4 ContributorsDile LyricsQuien te va a querer ...,Dile,Ivy Queen
1,Cuéntale,Ivy Queen,Cuéntale by Ivy Queen,3 ContributorsCansada LyricsWelcome to the dra...,Cansada,Ivy Queen
2,Cuéntale,Ivy Queen,Cuéntale by Ivy Queen,11 ContributorsTranslationsEnglishDeutschElla ...,Ella Me Levantó,Daddy Yankee
3,Cuéntale,Ivy Queen,Cuéntale by Ivy Queen,"12 ContributorsDale Don Dale Lyrics[Letra de ""...",Dale Don Dale,Don Omar
4,Cuéntale,Ivy Queen,Cuéntale by Ivy Queen,"8 ContributorsSexy Robótica Lyrics[Letra de ""S...",Sexy Robotica,Don Omar


In [4]:
lyrics, song_title = similar_song_lyrics["similar_lyrics"].tolist(), similar_song_lyrics["song"].tolist()
processed_lyrics = []
for lyric in lyrics:
    processed_lyrics.append(nnm.preprocess_sentence(lyric))

In [5]:
#Create word embeddings 
embeddings = embedder(processed_lyrics, batch_size=NUM_SEQUENCES_PER_BATCH)
print(embeddings.shape)


torch.Size([7047, 384])


In [6]:
label_encoder = LabelEncoder()

# Fit and transform the labels to integers
y_encoded = label_encoder.fit_transform(song_title) # Label Encoder does transform values into unique

# Convert to a PyTorch tensor
y_tensor = torch.tensor(y_encoded, dtype=torch.int64).to(device)
print(y_tensor[:5])

tensor([141, 141, 141, 141, 141], device='cuda:0')


In [8]:
model, test_dataloader = nnm.full_pipeline(x=embeddings, y=y_tensor, vocab_size = len(set(y_encoded)), batch_size=NUM_SEQUENCES_PER_BATCH, embedding_size=embeddings.shape[1], hidden_units=1024
                                           , epochs=500, device=device, early_stop_threshold=1e-3)
print(embeddings.shape)  # Should be [num_samples, embedding_dim]

  0%|          | 0/500 [00:00<?, ?it/s]

Epoch: 0, Loss: 6.4090

Epoch: 1, Loss: 6.3045

Epoch: 2, Loss: 6.1182

Epoch: 3, Loss: 5.9195

Epoch: 4, Loss: 5.7656

Epoch: 5, Loss: 5.6616

Epoch: 6, Loss: 5.6026

Epoch: 7, Loss: 5.5397

Epoch: 8, Loss: 5.4746

Epoch: 9, Loss: 5.3996

Epoch: 10, Loss: 5.3236

Epoch: 11, Loss: 5.2427

Epoch: 12, Loss: 5.1557

Epoch: 13, Loss: 5.0686

Epoch: 14, Loss: 4.9825

Epoch: 15, Loss: 4.9010

Epoch: 16, Loss: 4.8207

Epoch: 17, Loss: 4.7502

Epoch: 18, Loss: 4.6678

Epoch: 19, Loss: 4.5843

Epoch: 20, Loss: 4.5319

Epoch: 21, Loss: 4.4542

Epoch: 22, Loss: 4.3732

Epoch: 23, Loss: 4.3047

Epoch: 24, Loss: 4.2341

Epoch: 25, Loss: 4.1689

Epoch: 26, Loss: 4.0961

Epoch: 27, Loss: 4.0261

Epoch: 28, Loss: 3.9542

Epoch: 29, Loss: 3.8891

Epoch: 30, Loss: 3.8148

Epoch: 31, Loss: 3.7566

Epoch: 32, Loss: 3.6819

Epoch: 33, Loss: 3.6136

Epoch: 34, Loss: 3.5500

Epoch: 35, Loss: 3.4819

Epoch: 36, Loss: 3.4208

Epoch: 37, Loss: 3.3554

Epoch: 38, Loss: 3.2937

Epoch: 39, Loss: 3.2352

Epoch: 40,

In [9]:
# Evaluate the model on the test set
accuracy = nnm.evaluate_model(model, test_dataloader)
# Save the trained model
torch.save(model.state_dict(), "model/trained_ffnn_model.pth")
print("Model saved as 'trained_ffnn_model.pth'")
# Load the model for future use
print(accuracy)

Accuracy on test set: 19.15% (270/1410)
Model saved as 'trained_ffnn_model.pth'
0.19148936170212766


In [27]:
import lyricsgenius
song_title = "Fire Burning"
artist_name = "Sean Kingston"

genius = lyricsgenius.Genius("Z_wiD32yFUiTd_bFET7Xo4UkbdCvZZJm7ViomZFISpf4wpr_4nvblXiuMaemGED8")
queried_song = genius.search_song(song_title, artist_name)

if queried_song:
    lyrics = queried_song.lyrics
    processed_lyrics = nnm.preprocess_sentence(lyric)
    encoded_lyrics = embedder(processed_lyrics, batch_size=NUM_SEQUENCES_PER_BATCH)
    encoded_lyrics.unsqueeze_(0) # Add a batch dimension

    model.eval()

    with torch.no_grad(): # Speeds up inference and reduces memory usage by not having to calcualte gradients
        logits = model(encoded_lyrics) # Forward pass on the model
        probability = nn.functional.softmax(logits, dim=1) # Normalize z scores to probability
        predicted_idx = torch.multinomial(probability, num_samples=8).cpu()[0].squeeze(0) # Get the top 10 predictions
        predicted_songs = label_encoder.inverse_transform(predicted_idx)
        print(probability.cpu()[0][predicted_idx])
        print(predicted_songs)

else:
    print(f"Song '{song_title}' by '{artist_name}' not found.")


Searching for "Fire Burning" by Sean Kingston...
Done.
tensor([2.3962e-01, 1.9794e-01, 1.8243e-01, 2.2839e-01, 1.4290e-01, 3.5385e-05,
        1.1110e-03, 2.0815e-04])
["Don't Stop The Music by Rihanna" "Just Dance by Lady Gaga,Colby O'Donis"
 'Loba by Shakira' 'Toxic by Britney Spears' 'Die Young by Kesha'
 'HONEY (ARE U COMING?) by Måneskin' 'SOS by Rihanna'
 '¿Y Eso? by Rauw Alejandro']


In [None]:
#Labels that are not similar to: FireBurning by Sean Kingston
# 252 -  'Honey' by Kehlani
# 11 - 505 by Arctic Monkeys
# 13 - 8TEEN by Khalid
# 21 - American Requiem by Beyonce

#Labels somewhat similar to : FireBurning by Sean Kingston
# 336 - Low  by Flo Rida
# 431 -Promiscuous
# 294 -Just Dance

# 210 - Fire Burning by Sean Kingston

In [12]:
potential_indices = [252, 11, 13, 21, 336, 431, 294]
# Get the corresponding labels
print(probability.shape)
for index in potential_indices:
    value = probability[0, index]
    label = label_encoder.inverse_transform([index])[0]
    print(f"Probability for '{label}' is {value} ")

torch.Size([1, 616])
Probability for 'Honey by Kehlani' is 3.207186182407895e-06 
Probability for '505 by Arctic Monkeys' is 7.78443946130615e-14 
Probability for '8TEEN by Khalid' is 1.8380416633156926e-10 
Probability for 'AMERIICAN REQUIEM by Beyoncé' is 4.422945949045243e-06 
Probability for 'Low by SZA' is 3.6613246265915222e-06 
Probability for 'Promiscuous by Nelly Furtado,Timbaland' is 0.00025504574296064675 
Probability for 'Just Dance by Lady Gaga,Colby O'Donis' is 0.19794219732284546 


In [24]:
label_mapping = {index: label for index, label in enumerate(label_encoder.classes_)}
print(label_mapping[210])

Fire Burning by Sean Kingston
