In [1]:

import pandas as pd
import lr_model as lr
from sklearn.preprocessing import StandardScaler,LabelEncoder
from scipy.sparse import hstack
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import time

# you'll play with these 
# bers to see how they affect the model's performance
NUM_FEATURES = 5000
#NUM_ITERATIONS = 1000
#NUM_TRAINING_EXAMPLES = 1000

NUM_ITERATIONS = 500
NUM_TRAINING_EXAMPLES = 200

# the learning rate will stay the same for this HW
LEARNING_RATE = 0.1

In [None]:
#load the data, tuples
similar_song_lyrics= pd.read_csv("data/similar_song_lyrics_2.csv") 

print(len( similar_song_lyrics))
lyrics, song_title = similar_song_lyrics["lyrics"].tolist(), similar_song_lyrics["similar_track"].tolist()


504


In [None]:
LEMMATIZER = WordNetLemmatizer()
STOP_WORDS = set(stopwords.words('english'))

#Function to preprocess the lyrics
def preprocess_sentence(lyrics:str,lemmatizer: WordNetLemmatizer = LEMMATIZER, 
                        stop_words: set = STOP_WORDS) -> str: 

     # Apply case-folding on your text.
    lyrics = lyrics.lower()

    # Remove any punctuations within your sentence.
    lyrics = lyrics.translate(str.maketrans('', '', string.punctuation))
    lyrics =lyrics.split('Lyrics', 1)[1].strip() if 'Lyrics' in lyrics else lyrics.strip()

    tokens = word_tokenize(lyrics)
    print("tokens" , tokens[:5])
  
    # Remove stop words and lemmatize your sentence if they are provided
    if stop_words is not None:
        tokens = [word for word in tokens if word not in stop_words]
    if lemmatizer is not None:
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    print("tokens post" , tokens[:5])

    preprocessed = ' '.join(tokens)
    return preprocessed




In [4]:
processed_lyrics = []
for lyric in lyrics:
    processed_lyrics.append(preprocess_sentence(lyric))

tokens ['28', 'contributorsbubble', 'music', 'lyricsintro', 'but']
tokens post ['28', 'contributorsbubble', 'music', 'lyricsintro', 'still']
tokens ['23', 'contributorsget', 'down', 'lyricsproducer', 'chad']
tokens post ['23', 'contributorsget', 'lyricsproducer', 'chad', 'hamilton']
tokens ['2', 'contributorsthe', 'king', 'lyricskiller', 'jim']
tokens post ['2', 'contributorsthe', 'king', 'lyricskiller', 'jim']
tokens ['38', 'contributorsmy', 'block', 'freestyle', 'lyricsverse']
tokens post ['38', 'contributorsmy', 'block', 'freestyle', 'lyricsverse']
tokens ['2', 'contributorsthis', 'is', 'jim', 'jones']
tokens post ['2', 'contributorsthis', 'jim', 'jones', 'lyricsintro']
tokens ['1', 'contributorwhat', 'did', 'i', 'do']
tokens post ['1', 'contributorwhat', 'lyricssample', 'make', 'life']
tokens ['13', 'contributorsdamn', 'lyricsunh', 'mr', 'duhduhduhduhduhduhduhduhduhduhdamn']
tokens post ['13', 'contributorsdamn', 'lyricsunh', 'mr', 'duhduhduhduhduhduhduhduhduhduhdamn']
tokens ['12'

In [5]:
print("Processed lyrics count: ", len(processed_lyrics))

Processed lyrics count:  504


In [6]:
song_title[:20]

['Killa Cam',
 'Killa Cam',
 'Killa Cam',
 'Killa Cam',
 'Killa Cam',
 'Killa Cam',
 'Forgive Me Father',
 'Forgive Me Father',
 'Forgive Me Father',
 'Forgive Me Father',
 'Forgive Me Father',
 'Forgive Me Father',
 'Forgive Me Father',
 'Forgive Me Father',
 'Forgive Me Father',
 'Forgive Me Father',
 'Down and Out',
 'Down and Out',
 'Down and Out',
 'Down and Out']

In [None]:
# Vectorize the data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=NUM_FEATURES)
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_lyrics)

#add in numeric features?
#lyric_tfidf = tfidf_vectorizer.fit_transform(lyrics)
#X = hstack([lyric_tfidf, numerical_features_sparse])

tfidf_matrix = tfidf_matrix.toarray()


In [None]:
# Scale the features using StandardScaler - Maybe not needed
scaler = StandardScaler()
X_scaled = scaler.fit_transform(tfidf_matrix)


In [10]:
X_train, X_test, y_train, y_test = [], [], [], []


In [11]:
#Vectorize the data

"""TF-IDF Vectorizer

We'll use sklearn's `TfidfVectorizer` class to handle our preprocessed data"""


for i in range(0, len(lyrics), 10):  
    X_train_chunk = X_scaled[i:i+8]  # Select 8 samples for training
    X_test_chunk = X_scaled[i+8:i+10]  # Select 2 sample for testing
    y_train_chunk = song_title[i:i+8]  # Corresponding labels for training
    y_test_chunk = song_title[i+8:i+10]  # Corresponding label for testing
    
    # Append chunks to total lists
    X_train.append(X_train_chunk)
    X_test.append(X_test_chunk)
    y_train.append(y_train_chunk)
    y_test.append(y_test_chunk)

X_train = np.vstack(X_train)
X_test = np.vstack(X_test)
y_train = np.hstack(y_train)
y_test = np.hstack(y_test)


In [12]:
print("Training set size:", X_train[:5])
print("labels for training set:", y_train[:5])
print("Testing set size:", X_test[:5])
print("labels for training set:", y_test[:5])


Training set size: [[-0.20022738 -0.11894023 -0.14829075 ... -0.0445878  -0.0445878
  -0.0445878 ]
 [-0.20022738 -0.11894023 -0.14829075 ... -0.0445878  -0.0445878
  -0.0445878 ]
 [-0.20022738 -0.11894023 -0.14829075 ... -0.0445878  -0.0445878
  -0.0445878 ]
 [-0.20022738 -0.11894023 -0.14829075 ... -0.0445878  -0.0445878
  -0.0445878 ]
 [-0.20022738 -0.11894023 -0.14829075 ... -0.0445878  -0.0445878
  -0.0445878 ]]
labels for training set: ['Killa Cam' 'Killa Cam' 'Killa Cam' 'Killa Cam' 'Killa Cam']
Testing set size: [[-0.20022738 -0.11894023 -0.14829075 ... -0.0445878  -0.0445878
  -0.0445878 ]
 [-0.20022738 -0.11894023 -0.14829075 ... -0.0445878  -0.0445878
  -0.0445878 ]
 [-0.20022738 -0.11894023 -0.14829075 ... -0.0445878  -0.0445878
  -0.0445878 ]
 [-0.20022738 -0.11894023 -0.14829075 ... -0.0445878  -0.0445878
  -0.0445878 ]
 [-0.20022738 -0.11894023 -0.14829075 ... -0.0445878  -0.0445878
  -0.0445878 ]]
labels for training set: ['Forgive Me Father' 'Forgive Me Father' 'Down an

In [14]:
print(tfidf_matrix.shape)
print(np.count_nonzero(tfidf_matrix[0])/len(tfidf_matrix[0])*100)

(504, 5000)
4.12


In [15]:
print(tfidf_matrix[:3][:10])
print(y_train[:3])

print((np.count_nonzero(tfidf_matrix[0])/len(tfidf_matrix[0]))*100)
print((np.count_nonzero(tfidf_matrix[1])/len(tfidf_matrix[1]))*100)
print((np.count_nonzero(tfidf_matrix[2])/len(tfidf_matrix[1]))*100)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['Killa Cam' 'Killa Cam' 'Killa Cam']
4.12
3.52
2.82


In [16]:
regression_model = lr.LogisticRegression(LEARNING_RATE,NUM_ITERATIONS)
print(f"x shape: {X_train.shape}")
start_time = time.time()
regression_model.train(X_train, y_train, True)
end_time = time.time()

print(f"time: {end_time - start_time}")

x shape: (404, 5000)
Training for 500 iterations
losses: 0.06823907603703501
losses: 0.0016152467779731584
losses: 0.0014204538187491014
losses: 0.0013450102343618476
losses: 0.0013036277128349431
time: 11.252521991729736


In [17]:
print(X_test[0])
output = regression_model.predict(X_test[0])
print(output)


[-0.20022738 -0.11894023 -0.14829075 ... -0.0445878  -0.0445878
 -0.0445878 ]
Spend the Night


In [None]:
y_pred =[]

for test_plot in X_test:
  y_pred.append(regression_model.predict(test_plot))

print(f"Predicted Song Titles: {y_pred[:10]}")
print(f"Actual Song Titles: {y_test[:10]}")


a_score = accuracy_score(y_test, y_pred)
p_score = precision_score(y_test, y_pred, average='weighted')
r_score = recall_score(y_test, y_pred, average='weighted')
f_score = f1_score(y_test, y_pred, average='weighted')
#ocab_size = classifier.get_vocab_size()
print(f"accuracy score = {a_score}")
print(f"precision score = {p_score}")
print(f"recall score = {r_score}")
print(f"f1 score = {f_score}")

Predicted Song Titles: ['Spend the Night', 'Never Change', 'We Gonna Make It', 'Family Ties', 'All Falls Down', 'Rockin and Rollin', 'Fly Out', 'Hey AZ', 'Not Give a Fuck', 'Down and Out']
Actual Song Titles: ['Forgive Me Father' 'Forgive Me Father' 'Down and Out' 'Down and Out'
 'Fly In' 'Fly In' 'Lollipop Remix' 'Lollipop Remix' 'Family Ties'
 'Rockin and Rollin']
accuracy score = 0.12
precision score = 0.13974358974358972
recall score = 0.12
f1 score = 0.11666666666666664


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
