In [1]:
import pandas as pd
import lr_model as lr
from sklearn.preprocessing import StandardScaler,LabelEncoder
from scipy.sparse import hstack
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import time

# you'll play with these 
# bers to see how they affect the model's performance
NUM_FEATURES = 5000
#NUM_ITERATIONS = 1000
#NUM_TRAINING_EXAMPLES = 1000

NUM_ITERATIONS = 500
NUM_TRAINING_EXAMPLES = 200

# the learning rate will stay the same for this HW
LEARNING_RATE = 0.1

In [2]:
#load the data, tuples
similar_song_lyrics= pd.read_csv("data/lyrics_results.csv") 
print(len(similar_song_lyrics))
lyrics, song_title = similar_song_lyrics["similar_lyrics"].tolist(), similar_song_lyrics["similar_song"].tolist()


7120


In [3]:
LEMMATIZER = WordNetLemmatizer()
STOP_WORDS = set(stopwords.words('english'))

#Function to preprocess the lyrics
def preprocess_sentence(lyrics:str,lemmatizer: WordNetLemmatizer = LEMMATIZER, 
                        stop_words: set = STOP_WORDS) -> str: 

     # Apply case-folding on your text.
    lyrics = lyrics.lower()

    # Remove any punctuations within your sentence.
    lyrics = lyrics.translate(str.maketrans('', '', string.punctuation))
    lyrics =lyrics.split('Lyrics', 1)[1].strip() if 'Lyrics' in lyrics else lyrics.strip()

    tokens = word_tokenize(lyrics)
    print("tokens" , tokens[:5])
  
    # Remove stop words and lemmatize your sentence if they are provided
    if stop_words is not None:
        tokens = [word for word in tokens if word not in stop_words]
    if lemmatizer is not None:
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    print("tokens post" , tokens[:5])

    preprocessed = ' '.join(tokens)
    return preprocessed




In [4]:
processed_lyrics = []
for lyric in lyrics:
    processed_lyrics.append(preprocess_sentence(lyric))

tokens ['77', 'contributorssleep', 'now', 'in', 'the']
tokens post ['77', 'contributorssleep', 'fire', 'lyricsverse', '1']
tokens ['71', 'contributorsguerrilla', 'radio', 'lyricsverse', '1']
tokens post ['71', 'contributorsguerrilla', 'radio', 'lyricsverse', '1']
tokens ['39', 'contributorscochise', 'lyricsverse', '1', 'well']
tokens post ['39', 'contributorscochise', 'lyricsverse', '1', 'well']
tokens ['50', 'contributorsshow', 'me', 'how', 'to']
tokens post ['50', 'contributorsshow', 'live', 'lyricsverse', '1']
tokens ['48', 'contributorsall', 'my', 'life', 'lyricsrefrain']
tokens post ['48', 'contributorsall', 'life', 'lyricsrefrain', 'life']
tokens ['55', 'contributorsepic', 'lyricsverse', '1', 'can']
tokens post ['55', 'contributorsepic', 'lyricsverse', '1', 'feel']
tokens ['56', 'contributorsmy', 'hero', 'lyricsverse', '1']
tokens post ['56', 'contributorsmy', 'hero', 'lyricsverse', '1']
tokens ['69', 'contributorsbullet', 'with', 'butterfly', 'wings']
tokens post ['69', 'contrib

In [5]:
print("Processed lyrics count: ", len(processed_lyrics))

Processed lyrics count:  7120


In [6]:
song_title[:20]

['Sleep Now in the Fire by Rage Against the Machine',
 'Guerrilla Radio by Rage Against the Machine',
 'Cochise by Audioslave',
 'Show Me How To Live by Audioslave',
 'All My Life by Foo Fighters',
 'Epic by Faith No More',
 'My Hero by Foo Fighters',
 'Bullet With Butterfly Wings by The Smashing Pumpkins',
 'I Hate Everything About You by Three Days Grace',
 'The Outsider by A Perfect Circle',
 'Testify by Rage Against the Machine',
 'Sleep Now in the Fire by Rage Against the Machine',
 'Break Stuff by Limp Bizkit',
 'Nookie by Limp Bizkit',
 'Show Me How To Live by Audioslave',
 'Cochise by Audioslave',
 'All My Life by Foo Fighters',
 'Last Resort by Papa Roach',
 'The Pretender by Foo Fighters',
 'Butterfly by Crazy Town']

In [7]:
# Vectorize the data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=NUM_FEATURES)
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_lyrics)

#add in numeric features?
#lyric_tfidf = tfidf_vectorizer.fit_transform(lyrics)
#X = hstack([lyric_tfidf, numerical_features_sparse])

tfidf_matrix = tfidf_matrix.toarray()


In [8]:
# Scale the features using StandardScaler - Maybe not needed
scaler = StandardScaler()
X_scaled = scaler.fit_transform(tfidf_matrix)


In [9]:
X_train, X_test, y_train, y_test = [], [], [], []


In [10]:
#Vectorize the data

"""TF-IDF Vectorizer

We'll use sklearn's `TfidfVectorizer` class to handle our preprocessed data"""


for i in range(0, len(lyrics), 10):  
    X_train_chunk = X_scaled[i:i+8]  # Select 8 samples for training
    X_test_chunk = X_scaled[i+8:i+10]  # Select 2 sample for testing
    y_train_chunk = song_title[i:i+8]  # Corresponding labels for training
    y_test_chunk = song_title[i+8:i+10]  # Corresponding label for testing
    
    # Append chunks to total lists
    X_train.append(X_train_chunk)
    X_test.append(X_test_chunk)
    y_train.append(y_train_chunk)
    y_test.append(y_test_chunk)

X_train = np.vstack(X_train)
X_test = np.vstack(X_test)
y_train = np.hstack(y_train)
y_test = np.hstack(y_test)


In [11]:
print("Training set size:", X_train[:5])
print("labels for training set:", y_train[:5])
print("Testing set size:", X_test[:5])
print("labels for training set:", y_test[:5])


Training set size: [[-0.16888379 -0.02186561 -0.14159751 ... -0.03750293 -0.03750293
  -0.03750293]
 [-0.16888379 -0.02186561 -0.14159751 ... -0.03750293 -0.03750293
  -0.03750293]
 [-0.16888379 -0.02186561 -0.14159751 ... -0.03750293 -0.03750293
  -0.03750293]
 [-0.16888379 -0.02186561 -0.14159751 ... -0.03750293 -0.03750293
  -0.03750293]
 [-0.16888379 -0.02186561 -0.14159751 ... -0.03750293 -0.03750293
  -0.03750293]]
labels for training set: ['Sleep Now in the Fire by Rage Against the Machine'
 'Guerrilla Radio by Rage Against the Machine' 'Cochise by Audioslave'
 'Show Me How To Live by Audioslave' 'All My Life by Foo Fighters']
Testing set size: [[-0.16888379 -0.02186561 -0.14159751 ... -0.03750293 -0.03750293
  -0.03750293]
 [-0.16888379 -0.02186561 -0.14159751 ... -0.03750293 -0.03750293
  -0.03750293]
 [-0.16888379 -0.02186561 -0.14159751 ... -0.03750293 -0.03750293
  -0.03750293]
 [-0.16888379 -0.02186561 -0.14159751 ... -0.03750293 -0.03750293
  -0.03750293]
 [ 1.68602806 -0

In [12]:
print(tfidf_matrix.shape)
print(np.count_nonzero(tfidf_matrix[0])/len(tfidf_matrix[0])*100)

(7120, 5000)
1.18


In [13]:
print(tfidf_matrix[:3][:10])
print(y_train[:3])

print((np.count_nonzero(tfidf_matrix[0])/len(tfidf_matrix[0]))*100)
print((np.count_nonzero(tfidf_matrix[1])/len(tfidf_matrix[1]))*100)
print((np.count_nonzero(tfidf_matrix[2])/len(tfidf_matrix[1]))*100)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['Sleep Now in the Fire by Rage Against the Machine'
 'Guerrilla Radio by Rage Against the Machine' 'Cochise by Audioslave']
1.18
1.7399999999999998
1.02


In [14]:
regression_model = lr.LogisticRegression(LEARNING_RATE,NUM_ITERATIONS)
print(f"x shape: {X_train.shape}")

x shape: (5696, 5000)


In [15]:
#start_time = time.time()
#regression_model.train(X_train, y_train, True)
#end_time = time.time()
#print(f"time: {end_time - start_time}")
regression_model.train(X_train, y_train, True, load_weights_file="model/logistic_regression_weights_3492x5001.pkl")

Training for 500 iterations
class mappings:  {'$wing by Fever 333': 0, "'Deed I Do by Chris Smither": 1, "'til You by Alanis Morissette": 2, '(Ghost) Riders in the Sky by Johnny Cash': 3, '(God Must Have Spent) A Little More Time On You - Remix by *NSYNC': 4, '(I Got The) Same Old Blues by Lynyrd Skynyrd': 5, '(I Hate You) Big Daddy by John C. Reilly': 6, "(It's Good) To Be Free by Oasis": 7, '(No One) Not Even the Rain by The Charlatans': 8, "(She's Got) The Fever by The Pointer Sisters": 9, '(She) Got Me Bad by Hall & Oates': 10, '(Song for My) Sugar Spun Sister by The Stone Roses': 11, "(There's) Always Something There to Remind Me by Naked Eyes": 12, "(There's) No Gettin' Over Me by Ronnie Milsap": 13, '(Well) Dusted by Giant Sand': 14, '(What This World Needs Is) A Few More Rednecks by Charlie Daniels Band': 15, '(You Were) Going Somewhere by David Wilcox': 16, "(You'll Be) Satisfied by the subdudes": 17, '...I Love by Low': 18, '...Slowdance on the Inside by Taking Back Sunday': 

In [16]:
print(X_test[0])
output = regression_model.predict(X_test[0])
print(output)


[-0.16888379 -0.02186561 -0.14159751 ... -0.03750293 -0.03750293
 -0.03750293]
In Love and I Hate It by A1


In [17]:
print(regression_model.weights)

[[-8.71417737e-05 -3.40138197e-05 -1.67748746e-04 ...  3.59897404e-05
   3.59897404e-05 -1.59340405e-03]
 [-1.13522777e-04 -3.54944904e-05 -1.88136384e-04 ...  3.17316896e-05
   3.17316896e-05 -1.46962864e-03]
 [-6.93356275e-05 -3.00785135e-05 -1.56603473e-04 ...  4.02077490e-05
   4.02077490e-05 -1.70137589e-03]
 ...
 [-1.20090829e-04 -4.04597441e-05 -1.81195955e-04 ...  3.22503943e-05
   3.22503943e-05 -1.48308673e-03]
 [ 2.65153759e-06 -2.17669886e-05 -9.11248909e-05 ...  5.55377185e-05
   5.55377185e-05 -2.11223972e-03]
 [-6.81389974e-06 -2.24636347e-05  2.23245765e-03 ...  5.41186692e-05
   5.41186692e-05 -2.07612208e-03]]


In [18]:
y_pred =[]

for test_plot in X_test:
  y_pred.append(regression_model.predict(test_plot))

print(f"Predicted Song Titles: {y_pred[:10]}")
print(f"Actual Song Titles: {y_test[:10]}")


a_score = accuracy_score(y_test, y_pred)
p_score = precision_score(y_test, y_pred, average='weighted')
r_score = recall_score(y_test, y_pred, average='weighted')
f_score = f1_score(y_test, y_pred, average='weighted')
#ocab_size = classifier.get_vocab_size()
print(f"accuracy score = {a_score}")
print(f"precision score = {p_score}")
print(f"recall score = {r_score}")
print(f"f1 score = {f_score}")

Predicted Song Titles: ['In Love and I Hate It by A1', 'The Outsider by A Perfect Circle', 'The Pretender by Foo Fighters', 'The Man by Sponge', 'Born as Ghosts by Rage Against the Machine', 'Maria by Rage Against the Machine', 'Cochise by Audioslave', 'Show Me How To Live by Audioslave', 'Gasoline by Audioslave', 'Cochise by Audioslave']
Actual Song Titles: ['I Hate Everything About You by Three Days Grace'
 'The Outsider by A Perfect Circle' 'The Pretender by Foo Fighters'
 'Butterfly by Crazy Town' 'Born as Ghosts by Rage Against the Machine'
 'Maria by Rage Against the Machine' 'Cochise by Audioslave'
 'Show Me How To Live by Audioslave' 'Gasoline by Audioslave'
 'Cochise by Audioslave']
accuracy score = 0.5077247191011236
precision score = 0.47930711610486887
recall score = 0.5077247191011236
f1 score = 0.4890204209024433


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
print(y_test[:5])
print(y_pred[:5])

['I Hate Everything About You by Three Days Grace'
 'The Outsider by A Perfect Circle' 'The Pretender by Foo Fighters'
 'Butterfly by Crazy Town' 'Born as Ghosts by Rage Against the Machine']
['In Love and I Hate It by A1', 'The Outsider by A Perfect Circle', 'The Pretender by Foo Fighters', 'The Man by Sponge', 'Born as Ghosts by Rage Against the Machine']


In [25]:
regression_model.save_model_weights()

Weights saved successfully to model/logistic_regression_weights_3492x5001.pkl!
