<a href="https://colab.research.google.com/github/hannahbhchou/song-recommendation/blob/main/Lyrics_Topic_Recommeder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Preprocessing Lyrics Text

In [None]:
import numpy as np
import pandas as pd
import time
import csv
import random
import re
import os

In [None]:
import warnings

In [None]:
# Silence DeprecationWarning messages for the rest of the notebook.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Raw gist URL of the lyrics CSV, pinned to a specific revision hash.
lyrics_url = "https://gist.githubusercontent.com/hannahbhchou/6c1608f211a85d4d013d95a51c32a3f8/raw/833566eb910c086081b3b06951d2dc789b095c5d/song_lyrics_final.csv"

In [None]:
# Load the lyrics table from `lyrics_url` (defined in the previous cell) and
# label its data columns "URL" and "Lyrics".
lyrics_df = pd.read_csv(lyrics_url, names=["URL", "Lyrics"])

In [None]:
# Preview the first rows to confirm the CSV loaded as expected.
lyrics_df.head()

Unnamed: 0,URL,Lyrics
0,https://genius.com/The-beatles-let-it-be-lyrics,"When I find myself in times of trouble, Mother..."
1,https://genius.com/The-beatles-come-together-l...,Shoot me\nShoot me\nShoot me\nShoot me\nHere c...
2,https://genius.com/The-beatles-yesterday-lyrics,Yesterday\nAll my troubles seemed so far away\...
3,https://genius.com/The-beatles-something-lyrics,Something in the way she moves\nAttracts me li...
4,https://genius.com/The-beatles-here-comes-the-...,"Here comes the sun, doo da doo doo\nHere comes..."


In [None]:
# Replace every character outside printable ASCII (code points 32-126) with a space.
# This blanks non-Latin characters and control characters such as newlines and tabs.
lyrics_df['Lyrics'] = lyrics_df['Lyrics'].apply(
    lambda text: ''.join(ch if 32 <= ord(ch) <= 126 else ' ' for ch in text)
)

In [None]:
# Replace newline characters with spaces.
# NOTE(review): the non-ASCII cleanup cell above already maps every character
# below code point 32 (which includes '\n') to a space, so this looks like a
# no-op when the cells are run in order -- confirm before removing.
lyrics_df["Lyrics"] = lyrics_df["Lyrics"].str.replace('\n',' ')

In [None]:
# Lower-case every lyric so later token matching is case-insensitive.
lyrics_df['Lyrics'] = lyrics_df['Lyrics'].map(str.lower)

In [None]:
# Remove bracketed or parenthesised annotations such as "[chorus]" or "(x2)".
# The non-greedy pattern deletes everything from an opening "(" or "[" up to
# the nearest closing ")" or "]", including the delimiters themselves.
lyrics_df['Lyrics'] = lyrics_df['Lyrics'].map(lambda x: re.sub(r'[\(\[].*?[\)\]]', '', x))

In [None]:
# Remove the punctuation characters , ! + ? : " ( ) from the lyrics.
# Other punctuation (apostrophes, hyphens, periods, ...) is left in place.
lyrics_df['Lyrics'] = lyrics_df['Lyrics'].map(lambda x: re.sub('[,!+?:"()]', '', x))

In [None]:
import nltk
# Fetch the NLTK stop-word corpus (the recorded output shows it is skipped when already up to date).
nltk.download("stopwords")
from nltk.corpus import stopwords
# Seed NumPy's global random number generator.
np.random.seed(2018)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Start from NLTK's English stop-word list; lyric-specific fillers are added below.
stop_words = stopwords.words("english")

In [None]:
# Interjections and vocal fillers common in lyrics that should also be treated as stop words.
added_words = ["oh","yeah","i'm","ooh","woo","ya","hoo", "hmm","ooooooohhh", "na", "lalala", "ah", "hey", "la", "uh", ]

In [None]:
# Add the lyric-specific filler words to the stop-word list.
# Words already present are skipped so that re-running this cell does not
# keep growing `stop_words` with duplicates.
stop_words.extend([word for word in added_words if word not in stop_words])

In [None]:
def remove_stopwords(raw_text):
    """Return `raw_text` with every stop word removed.

    The text is split on whitespace, tokens found in the notebook-level
    `stop_words` collection are dropped, and the remaining tokens are joined
    back together with single spaces.

    Args:
        raw_text: The string to filter.

    Returns:
        The filtered text as a single space-separated string (empty when no
        tokens survive).
    """
    # Snapshot the stop words into a set so each membership test is a hash
    # lookup rather than a linear scan of the list for every token.
    blocked = set(stop_words)
    return " ".join(token for token in raw_text.split() if token not in blocked)

In [None]:
# Strip stop words from every song's lyrics.
lyrics_df['Lyrics'] = lyrics_df['Lyrics'].apply(remove_stopwords)

### Lemmatizing & Stemming Text

In [None]:
import gensim
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [None]:
import nltk
# WordNet data is needed by WordNetLemmatizer, which is used in the cells below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Sanity check of the lemmatizer: "went" treated as a verb (pos='v') should print "go".
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


In [None]:
# Sanity check of the stemmer on a handful of inflected words.
# NOTE: `stemmer` is not just a demo object -- lemmatize_stemming() below
# reads this same notebook-level variable, so this cell must run first.
stemmer = SnowballStemmer('english')
original_words = ['candies', 'babies', 'dies', 'shoot' ,'died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating','sensational']
singles = [stemmer.stem(plural) for plural in original_words]
# Show each original word next to its stem.
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,candies,candi
1,babies,babi
2,dies,die
3,shoot,shoot
4,died,die
5,agreed,agre
6,owned,own
7,humbled,humbl
8,sized,size
9,meeting,meet


In [None]:
def lemmatize_stemming(text):
    """Lemmatize `text` as a verb, then stem the lemma.

    Uses a WordNetLemmatizer for the first step and the notebook-level
    `stemmer` for the second, and returns the stemmer's output.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Turn raw lyric text into a list of normalized tokens.

    The text is tokenized with gensim's `simple_preprocess`; tokens that are
    gensim stop words or have three characters or fewer are discarded, and
    each remaining token is passed through `lemmatize_stemming`.

    Returns:
        A list of processed tokens in their original order.
    """
    return [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if tok not in gensim.parsing.preprocessing.STOPWORDS and len(tok) > 3
    ]

In [None]:
# Use the song https://genius.com/Lady-gaga-and-blackpink-sour-candy-lyrics as a test case.
# Row position 342, column position 1 of the frame.
# NOTE(review): column 1 is presumably "Lyrics" given the names passed to read_csv -- confirm.
sample = lyrics_df.iloc[342,1]

In [None]:
# Show the cleaned sample lyrics as space-separated words, followed by the
# same text after the tokenize / lemmatize / stem pipeline.
words = sample.split(' ')
print('original document: ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(sample))

original document: 
['sou-sour', 'candy', 'sour', 'candy', 'sweet', 'get', 'little', 'angry', 'sour', 'candy', 'super', 'psycho', 'make', 'crazy', 'turn', 'lights', 'low', 'sour', 'candy', 'ask', 'nice', "i'll", 'extra', 'mean', 'wanna', 'fix', "let's", 'break', 'uh-huh', 'uh-huh', 'hard', 'outside', 'give', 'time', 'could', 'make', 'time', 'love', 'hard', 'outside', 'see', 'inside', 'inside', 'inside', 'might', 'messed', 'know', "what's", 'want', 'real', 'taste', 'least', 'fake', 'come', 'come', 'unwrap', 'come', 'come', 'unwrap', "i'll", 'show', "what's", 'close', 'eyes', 'peek', 'undressing', 'unwrap', 'sour', 'candy', 'come', 'come', 'unwrap', 'come', 'come', 'unwrap', 'come', 'sour', 'candy', 'hard', 'outside', 'give', 'time', 'could', 'make', 'time', 'love', 'hard', 'outside', 'see', 'inside', 'inside', 'inside', 'sour', 'candy', 'sweet', "'til", 'get', 'little', 'angry', 'sour', 'candy', 'super', 'psycho', 'make', 'crazy', 'turn', 'lights', 'low', 'sour', 'candy', 'take', 'bite'

In [None]:
# Run preprocess() on every song; each entry becomes that song's list of processed tokens.
processed_lyrics = lyrics_df["Lyrics"].map(preprocess)

In [None]:
# Preview the token lists of the first ten songs.
processed_lyrics[:10]

0    [time, troubl, mother, mari, come, speak, word...
1    [shoot, shoot, shoot, shoot, come, flat, come,...
2    [yesterday, troubl, away, look, stay, believ, ...
3    [move, attract, like, lover, woo, want, leav, ...
4    [come, come, right, littl, darl, long, cold, l...
5    [jude, song, better, rememb, heart, start, bet...
6    [read, news, today, lucki, grade, news, laugh,...
7    [blackbird, sing, dead, night, break, wing, le...
8    [like, pig, cri, sit, cornflak, wait, come, co...
9    [look, lone, peopl, look, lone, peopl, eleanor...
Name: Lyrics, dtype: object

In [None]:
from collections import Counter
from itertools import *

# Count how often each processed token occurs across all songs,
# then display the 20 most frequent tokens.
text_word_frequency = Counter(
    token for song_tokens in processed_lyrics for token in song_tokens
)
text_word_frequency.most_common(20)

[('love', 3098),
 ('know', 2766),
 ('like', 2363),
 ('babi', 1631),
 ('come', 1575),
 ('want', 1287),
 ('time', 1259),
 ('feel', 1205),
 ('caus', 1151),
 ('girl', 997),
 ('wanna', 953),
 ('tell', 941),
 ('night', 854),
 ('gonna', 851),
 ('look', 845),
 ('think', 832),
 ('need', 814),
 ('right', 782),
 ('go', 751),
 ('away', 719)]

In [None]:
# Display the 20 least frequent tokens (the tail of the frequency-sorted list).
text_word_frequency.most_common()[-20:]

[('gimmick', 1),
 ('crumb', 1),
 ('loxvill', 1),
 ('jada', 1),
 ('jackass', 1),
 ('knoxvill', 1),
 ('thousandair', 1),
 ('outrun', 1),
 ('distribut', 1),
 ('fluid', 1),
 ('beefin', 1),
 ('relatin', 1),
 ('jason', 1),
 ('scrappin', 1),
 ('olympus', 1),
 ('psychiatrist', 1),
 ('uninspir', 1),
 ('handout', 1),
 ('welfar', 1),
 ('backstabbin', 1)]

### LDA Model Training ###

In [None]:
# Build the gensim Dictionary (token <-> integer id mapping) from the tokenized songs.
dictionary = gensim.corpora.Dictionary(processed_lyrics)

In [None]:
# Set the thresholds for which tokens are kept for training.
dictionary.filter_extremes(no_below=5, no_above=0.3)

# no_below is an absolute number of documents (songs): tokens appearing in fewer than 5 songs are dropped.
# no_above is a fraction of documents: tokens appearing in more than 30% of the songs are dropped.

In [None]:
# Bag-of-words transformation: each song becomes a list of (token_id, count) pairs.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_lyrics]

In [None]:
# Inspect the bag-of-words vector of the same test song
# (https://genius.com/Lady-gaga-and-blackpink-sour-candy-lyrics).
bow_doc_342 = bow_corpus[342]

for token_id, count in bow_doc_342:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], count))

Word 5 ("light") appears 2 time.
Word 33 ("hard") appears 4 time.
Word 75 ("littl") appears 2 time.
Word 140 ("turn") appears 2 time.
Word 142 ("break") appears 1 time.
Word 144 ("eye") appears 1 time.
Word 202 ("mean") appears 1 time.
Word 242 ("wanna") appears 1 time.
Word 246 ("sweet") appears 2 time.
Word 259 ("real") appears 1 time.
Word 315 ("angri") appears 2 time.
Word 379 ("close") appears 1 time.
Word 471 ("insid") appears 6 time.
Word 496 ("bite") appears 8 time.
Word 584 ("candi") appears 13 time.
Word 618 ("tast") appears 1 time.
Word 696 ("mess") appears 1 time.
Word 718 ("outsid") appears 4 time.
Word 896 ("fake") appears 1 time.
Word 986 ("crazi") appears 2 time.
Word 1027 ("nice") appears 1 time.
Word 1296 ("super") appears 2 time.
Word 1374 ("psycho") appears 2 time.
Word 1384 ("undress") appears 1 time.
Word 1761 ("extra") appears 1 time.


In [None]:
# Train the LDA model with 10 topics.
# random_state fixes the model's random initialisation so that re-running this
# cell on its own starts from the same state (same seed value as np.random.seed above).
# NOTE(review): with workers > 1 the order of worker updates may still vary between runs.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=2018,
)

In [None]:
# Print the most prominent tokens for each topic (-1 asks for all topics).
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.034*"away" + 0.024*"gonna" + 0.017*"care" + 0.017*"littl" + 0.015*"heart" + 0.012*"wanna" + 0.012*"turn" + 0.012*"hear" + 0.010*"thing" + 0.009*"live"
Topic: 1 
Words: 0.020*"away" + 0.015*"sail" + 0.013*"bitch" + 0.013*"nigga" + 0.010*"girl" + 0.010*"song" + 0.009*"need" + 0.009*"gonna" + 0.009*"fuck" + 0.008*"beat"
Topic: 2 
Words: 0.030*"girl" + 0.018*"nigga" + 0.011*"say" + 0.009*"gonna" + 0.008*"gotta" + 0.008*"fuck" + 0.007*"light" + 0.007*"bitch" + 0.007*"high" + 0.007*"rain"
Topic: 3 
Words: 0.026*"gimm" + 0.016*"fuck" + 0.013*"live" + 0.013*"somebodi" + 0.012*"thing" + 0.012*"roll" + 0.011*"happi" + 0.010*"birthday" + 0.009*"burn" + 0.008*"real"
Topic: 4 
Words: 0.017*"need" + 0.014*"girl" + 0.013*"leav" + 0.011*"thing" + 0.011*"life" + 0.011*"live" + 0.010*"gonna" + 0.010*"take" + 0.009*"light" + 0.009*"say"
Topic: 5 
Words: 0.024*"need" + 0.012*"long" + 0.011*"gonna" + 0.010*"face" + 0.010*"thing" + 0.009*"mind" + 0.009*"nigga" + 0.007*"song" + 0.007*"play

In [None]:
# Test the model on the same song (https://genius.com/Lady-gaga-and-blackpink-sour-candy-lyrics):
# list its topics from highest to lowest score.
# The loop variable is deliberately not called `index`: that notebook-level
# name holds the similarity index used by song_recommender(), and re-running
# this cell would otherwise overwrite it with an integer topic id.
for topic_id, score in sorted(lda_model[bow_corpus[342]], key=lambda tup: tup[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(topic_id, 10)))


Score: 0.5326701998710632	 
Topic: 0.022*"away" + 0.015*"gonna" + 0.011*"work" + 0.010*"gimm" + 0.008*"nigga" + 0.008*"leav" + 0.008*"real" + 0.008*"sorri" + 0.007*"say" + 0.007*"life"

Score: 0.45482686161994934	 
Topic: 0.035*"stand" + 0.030*"roll" + 0.021*"rock" + 0.015*"everybodi" + 0.015*"hear" + 0.013*"lose" + 0.012*"cold" + 0.011*"care" + 0.011*"reveng" + 0.010*"bodi"


### LDA Visualization ###

In [None]:
!pip install pyldavis



In [None]:
import pickle
import pyLDAvis

In [None]:
import pyLDAvis.gensim_models as gensimvis

In [None]:
# Let pyLDAvis render its visualization inline in notebook output cells.
pyLDAvis.enable_notebook()

In [None]:
# Prepare the visualization data from the trained model, the BoW corpus and the dictionary.
LDAvis_prepared = gensimvis.prepare(lda_model, bow_corpus, dictionary)

In [None]:
# Display the prepared topic visualization.
LDAvis_prepared

### Recommender ###

In [None]:
# Apply the trained model to the whole corpus; doc_lda[i] is song i's list of (topic_id, score) pairs.
doc_lda = lda_model[bow_corpus]

In [None]:
from gensim import similarities

In [None]:
# Build a similarity index over every song's topic vector.
# num_features is passed explicitly (one feature per LDA topic); without it
# gensim first scans the whole transformed corpus just to find the largest
# topic id, which means an extra inference pass over every song.
index = similarities.MatrixSimilarity(doc_lda, num_features=lda_model.num_topics)

In [None]:
def song_recommender(url):
    """Print a summary of the song at `url` and display its 20 most similar songs.

    The song is looked up by exact match against the first column of
    `lyrics_df`. For every matching row the function prints the song's ten
    most frequent dictionary tokens, its highest-scoring LDA topic, and then
    displays up to 20 ``[url, similarity]`` pairs sorted by descending
    similarity. If no row matches, an apology message is printed instead.

    Relies on the notebook-level objects `lyrics_df`, `doc_lda`, `index`,
    `bow_corpus`, `dictionary` and `lda_model`, and on IPython's `display`.

    Args:
        url: URL of the song to base the recommendations on.

    Returns:
        None. All results are printed or displayed.
    """
    # Read the URL column once instead of doing a scalar lookup per row.
    urls = lyrics_df.iloc[:, 0].tolist()
    matching_rows = [row for row, song_url in enumerate(urls) if song_url == url]

    if not matching_rows:
        print('Sorry, but it looks like "{}" is not available.'.format(url))
        return

    for row in matching_rows:
        lda_vectors = doc_lda[row]
        sims = index[lda_vectors]

        # Exclude the queried song by its row position. Dropping "the first
        # result" is not reliable: other songs can tie with it on similarity
        # and sort ahead of it, in which case the song would recommend itself.
        recommendation_scores = [
            [urls[other_row], similarity]
            for other_row, similarity in enumerate(sims)
            if other_row != row
        ]
        recommendation = sorted(recommendation_scores, key=lambda x: x[1], reverse=True)

        print("Your song's most prominent tokens are:")
        top_tokens = sorted(bow_corpus[row], key=lambda x: x[1], reverse=True)[:10]
        for token_id, count in top_tokens:
            print("Word {} (\"{}\") appears {} time(s).".format(token_id,
                                                                 dictionary[token_id],
                                                                 count))
        print('-----')
        print("Your song's most prominant topic is:")
        # Pick the (topic_id, score) pair with the highest score and print that topic.
        print(lda_model.print_topic(max(lda_vectors, key=lambda item: item[1])[0]))
        print('-----')
        print('Here are your recommendations for "{}":'.format(url))
        display(recommendation[:20])
           

In [None]:
# Example: recommendations based on "Yesterday" by The Beatles.
song_recommender('https://genius.com/The-beatles-yesterday-lyrics')

Your song's most prominent tokens are:
Word 60 ("yesterday") appears 9 time(s).
Word 44 ("away") appears 3 time(s).
Word 45 ("believ") appears 3 time(s).
Word 46 ("easi") appears 2 time(s).
Word 47 ("game") appears 2 time(s).
Word 50 ("hide") appears 2 time(s).
Word 51 ("long") appears 2 time(s).
Word 52 ("need") appears 2 time(s).
Word 53 ("place") appears 2 time(s).
Word 54 ("play") appears 2 time(s).
-----
Your song's most prominant topic is:
0.027*"wanna" + 0.025*"gonna" + 0.018*"long" + 0.017*"danc" + 0.016*"girl" + 0.013*"wait" + 0.012*"hold" + 0.011*"life" + 0.011*"home" + 0.010*"away"
-----
Here are your recommendations for "https://genius.com/The-beatles-yesterday-lyrics":


[['https://genius.com/Elvis-presley-love-me-tender-lyrics', 1.0],
 ['https://genius.com/Elton-john-rocket-man-i-think-its-going-to-be-a-long-long-time-lyrics',
  1.0],
 ['https://genius.com/Eagles-new-kid-in-town-lyrics', 1.0],
 ['https://genius.com/Whitney-houston-i-have-nothing-lyrics', 1.0],
 ['https://genius.com/Whitney-houston-i-wanna-dance-with-somebody-who-loves-me-lyrics',
  1.0],
 ['https://genius.com/Whitney-houston-its-not-right-but-its-okay-lyrics', 1.0],
 ['https://genius.com/Whitney-houston-run-to-you-lyrics', 1.0],
 ['https://genius.com/Drake-hold-on-were-going-home-lyrics', 1.0],
 ['https://genius.com/Garth-brooks-friends-in-low-places-live-version-lyrics',
  1.0],
 ['https://genius.com/Garth-brooks-friends-in-low-places-lyrics', 1.0],
 ['https://genius.com/Ed-sheeran-and-justin-bieber-i-dont-care-lyrics', 1.0],
 ['https://genius.com/Billy-joel-vienna-lyrics', 1.0],
 ['https://genius.com/Phil-collins-another-day-in-paradise-lyrics', 1.0],
 ['https://genius.com/Abba-danc

In [None]:
# Example: recommendations based on "Sour Candy", the song used as the test case in earlier cells.
song_recommender('https://genius.com/Lady-gaga-and-blackpink-sour-candy-lyrics')

Your song's most prominent tokens are:
Word 584 ("candi") appears 13 time(s).
Word 496 ("bite") appears 8 time(s).
Word 471 ("insid") appears 6 time(s).
Word 33 ("hard") appears 4 time(s).
Word 718 ("outsid") appears 4 time(s).
Word 5 ("light") appears 2 time(s).
Word 75 ("littl") appears 2 time(s).
Word 140 ("turn") appears 2 time(s).
Word 246 ("sweet") appears 2 time(s).
Word 315 ("angri") appears 2 time(s).
-----
Your song's most prominant topic is:
0.022*"away" + 0.015*"gonna" + 0.011*"work" + 0.010*"gimm" + 0.008*"nigga" + 0.008*"leav" + 0.008*"real" + 0.008*"sorri" + 0.007*"say" + 0.007*"life"
-----
Here are your recommendations for "https://genius.com/Lady-gaga-and-blackpink-sour-candy-lyrics":


[['https://genius.com/The-rolling-stones-wild-horses-lyrics', 0.9145169],
 ['https://genius.com/Coldplay-everglow-lyrics', 0.91349584],
 ['https://genius.com/Green-day-holiday-lyrics', 0.88754],
 ['https://genius.com/Alabama-shakes-future-people-lyrics', 0.87743235],
 ['https://genius.com/Adele-hello-lyrics', 0.8600359],
 ['https://genius.com/Bruce-springsteen-the-river-lyrics', 0.8502728],
 ['https://genius.com/George-strait-jingle-bell-rock-lyrics', 0.82873297],
 ['https://genius.com/Aretha-franklin-walk-in-the-light-lyrics', 0.82347643],
 ['https://genius.com/James-taylor-shower-the-people-lyrics', 0.8118189],
 ['https://genius.com/Lady-gaga-and-ariana-grande-rain-on-me-lyrics',
  0.8086614],
 ['https://genius.com/Chicago-mass-choir-i-pray-well-be-ready-live-lyrics',
  0.7891511],
 ['https://genius.com/Eminem-river-lyrics', 0.7828656],
 ['https://genius.com/Kanye-west-ultralight-beam-lyrics', 0.7787315],
 ['https://genius.com/Gloria-estefan-hoy-lyrics', 0.76895267],
 ['https://geniu

In [None]:
# Example: recommendations based on "Lose Yourself" by Eminem.
song_recommender("https://genius.com/Eminem-lose-yourself-lyrics")

Your song's most prominent tokens are:
Word 84 ("better") appears 16 time(s).
Word 39 ("shoot") appears 9 time(s).
Word 147 ("moment") appears 8 time(s).
Word 100 ("blow") appears 7 time(s).
Word 2 ("chanc") appears 6 time(s).
Word 9 ("music") appears 6 time(s).
Word 289 ("miss") appears 6 time(s).
Word 605 ("lose") appears 6 time(s).
Word 1143 ("lifetim") appears 6 time(s).
Word 63 ("grow") appears 4 time(s).
-----
Your song's most prominant topic is:
0.022*"away" + 0.015*"gonna" + 0.011*"work" + 0.010*"gimm" + 0.008*"nigga" + 0.008*"leav" + 0.008*"real" + 0.008*"sorri" + 0.007*"say" + 0.007*"life"
-----
Here are your recommendations for "https://genius.com/Eminem-lose-yourself-lyrics":


[['https://genius.com/Eminem-stan-lyrics', 0.9898183],
 ['https://genius.com/Nicki-minaj-feeling-myself-lyrics', 0.9811677],
 ['https://genius.com/Bj-the-chicago-kid-smokin-and-ridin-lyrics', 0.9807108],
 ['https://genius.com/Elvis-presley-in-the-ghetto-lyrics', 0.9806458],
 ['https://genius.com/The-rolling-stones-gimme-shelter-lyrics', 0.9765784],
 ['https://genius.com/Gloria-estefan-hoy-lyrics', 0.97617775],
 ['https://genius.com/Red-hot-chili-peppers-cant-stop-lyrics', 0.9750625],
 ['https://genius.com/Mariah-carey-gtfo-lyrics', 0.9731284],
 ['https://genius.com/Dire-straits-brothers-in-arms-lyrics', 0.971901],
 ['https://genius.com/Elvis-presley-heartbreak-hotel-lyrics', 0.9717324],
 ['https://genius.com/Elton-john-goodbye-yellow-brick-road-lyrics', 0.9717324],
 ['https://genius.com/Elton-john-bennie-and-the-jets-lyrics', 0.9717324],
 ['https://genius.com/Elton-john-dont-let-the-sun-go-down-on-me-lyrics',
  0.9717324],
 ['https://genius.com/Rihanna-work-lyrics', 0.9717324],
 ['http