<a href="https://colab.research.google.com/github/hannahbhchou/song-recommendation/blob/main/Lyrics_Topic_Recommeder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Preprocessing Lyrics Text

In [1]:
import numpy as np
import pandas as pd
import time
import csv
import random
import re
import os

In [2]:
import warnings

In [3]:
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [4]:
lyrics_url = "https://gist.githubusercontent.com/hannahbhchou/6c1608f211a85d4d013d95a51c32a3f8/raw/833566eb910c086081b3b06951d2dc789b095c5d/song_lyrics_final.csv"

In [5]:
lyrics_df = pd.read_csv(lyrics_url, names= ["URL", "Lyrics"])

In [6]:
lyrics_df.head()

Unnamed: 0,URL,Lyrics
0,https://genius.com/The-beatles-let-it-be-lyrics,"When I find myself in times of trouble, Mother..."
1,https://genius.com/The-beatles-come-together-l...,Shoot me\nShoot me\nShoot me\nShoot me\nHere c...
2,https://genius.com/The-beatles-yesterday-lyrics,Yesterday\nAll my troubles seemed so far away\...
3,https://genius.com/The-beatles-something-lyrics,Something in the way she moves\nAttracts me li...
4,https://genius.com/The-beatles-here-comes-the-...,"Here comes the sun, doo da doo doo\nHere comes..."


In [7]:
# Replace every character outside printable ASCII (code points 32-126) with a space.
# This blanks out accented / non-Latin characters and also control characters such as "\n".
lyrics_df['Lyrics'] = lyrics_df["Lyrics"].apply(lambda text: re.sub(r'[^\x20-\x7e]', ' ', text))

In [8]:
# Replace newline characters with spaces. The printable-ASCII filter in the previous cell
# already maps "\n" (code point 10) to a space, so this is normally a no-op safety net.
lyrics_df["Lyrics"] = lyrics_df["Lyrics"].str.replace('\n',' ')

In [9]:
lyrics_df['Lyrics'] = lyrics_df['Lyrics'].map(lambda x: x.lower())

In [10]:
# Remove bracketed or parenthesised snippets such as [chorus] or (x2): the non-greedy
# pattern deletes everything from an opening "(" or "[" up to the nearest closing ")" or "]".
lyrics_df['Lyrics'] = lyrics_df['Lyrics'].map(lambda x: re.sub(r'[\(\[].*?[\)\]]', '', x))

In [11]:
# Strip a fixed set of punctuation characters: , ! + ? : " ( )
# Characters not in this set (e.g. periods, apostrophes, hyphens) are left in place.
lyrics_df['Lyrics'] = lyrics_df['Lyrics'].map(lambda x: re.sub('[,!+?:"()]', '', x))

In [12]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
np.random.seed(2018)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
stop_words = stopwords.words("english")

In [14]:
added_words = ["oh","yeah","i'm","ooh","woo","ya","hoo", "hmm","ooooooohhh", "na", "lalala", "ah", "hey", "la", "uh", ]

In [15]:
# Add the lyric-specific filler words to the stop-word list. Words already present are
# skipped so that re-running this cell does not keep appending duplicates.
for word in added_words:
    if word not in stop_words:
        stop_words.append(word)

In [16]:
def remove_stopwords(raw_text):
    """Return ``raw_text`` with every stop word removed.

    The text is split on whitespace, words found in the notebook-level ``stop_words``
    collection are dropped, and the remaining words are re-joined with single spaces.

    Args:
        raw_text: the string to filter.

    Returns:
        The filtered text as a single space-separated string.
    """
    # Build a set once per call: testing each word against the list would rescan the
    # whole list for every word of the lyrics.
    blocked = set(stop_words)
    return " ".join(word for word in raw_text.split() if word not in blocked)

In [17]:
# Drop stop words from every song's lyrics (the function is passed directly; no wrapper lambda needed).
lyrics_df['Lyrics'] = lyrics_df['Lyrics'].apply(remove_stopwords)

### Lemmatizing & Stemming Text

In [18]:
import gensim
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [19]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
#testing lemmatizer
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


In [21]:
#testing stemmer
stemmer = SnowballStemmer('english')
original_words = ['candies', 'babies', 'dies', 'shoot' ,'died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating','sensational']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,candies,candi
1,babies,babi
2,dies,die
3,shoot,shoot
4,died,die
5,agreed,agre
6,owned,own
7,humbled,humbl
8,sized,size
9,meeting,meet


In [22]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb, then stem the lemma with the notebook-level ``stemmer``."""
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize ``text`` and return the lemmatized, stemmed tokens worth keeping.

    Tokens come from ``gensim.utils.simple_preprocess``; a token is kept only when it is
    longer than three characters and is not in gensim's ``STOPWORDS``. Each kept token is
    passed through ``lemmatize_stemming``.

    Returns:
        A list of processed tokens, in their original order.
    """
    gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if len(tok) > 3 and tok not in gensim_stopwords
    ]

In [23]:
#using the song https://genius.com/Lady-gaga-and-blackpink-sour-candy-lyrics to test
sample = lyrics_df.iloc[342,1]

In [24]:
# Show the sample song split on single spaces next to its preprocess() output.
print('original document: ')
words = sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(sample))

original document: 
['sou-sour', 'candy', 'sour', 'candy', 'sweet', 'get', 'little', 'angry', 'sour', 'candy', 'super', 'psycho', 'make', 'crazy', 'turn', 'lights', 'low', 'sour', 'candy', 'ask', 'nice', "i'll", 'extra', 'mean', 'wanna', 'fix', "let's", 'break', 'uh-huh', 'uh-huh', 'hard', 'outside', 'give', 'time', 'could', 'make', 'time', 'love', 'hard', 'outside', 'see', 'inside', 'inside', 'inside', 'might', 'messed', 'know', "what's", 'want', 'real', 'taste', 'least', 'fake', 'come', 'come', 'unwrap', 'come', 'come', 'unwrap', "i'll", 'show', "what's", 'close', 'eyes', 'peek', 'undressing', 'unwrap', 'sour', 'candy', 'come', 'come', 'unwrap', 'come', 'come', 'unwrap', 'come', 'sour', 'candy', 'hard', 'outside', 'give', 'time', 'could', 'make', 'time', 'love', 'hard', 'outside', 'see', 'inside', 'inside', 'inside', 'sour', 'candy', 'sweet', "'til", 'get', 'little', 'angry', 'sour', 'candy', 'super', 'psycho', 'make', 'crazy', 'turn', 'lights', 'low', 'sour', 'candy', 'take', 'bite'

In [25]:
processed_lyrics = lyrics_df["Lyrics"].map(preprocess)

In [26]:
processed_lyrics[:10]

0    [time, troubl, mother, mari, come, speak, word...
1    [shoot, shoot, shoot, shoot, come, flat, come,...
2    [yesterday, troubl, away, look, stay, believ, ...
3    [move, attract, like, lover, woo, want, leav, ...
4    [come, come, right, littl, darl, long, cold, l...
5    [jude, song, better, rememb, heart, start, bet...
6    [read, news, today, lucki, grade, news, laugh,...
7    [blackbird, sing, dead, night, break, wing, le...
8    [like, pig, cri, sit, cornflak, wait, come, co...
9    [look, lone, peopl, look, lone, peopl, eleanor...
Name: Lyrics, dtype: object

In [27]:
#displaying the most common tokens in the dataset
from collections import Counter
from itertools import *

text_word_frequency = Counter(chain.from_iterable(processed_lyrics))
text_word_frequency.most_common()[:20]

[('love', 3098),
 ('know', 2766),
 ('like', 2363),
 ('babi', 1631),
 ('come', 1575),
 ('want', 1287),
 ('time', 1259),
 ('feel', 1205),
 ('caus', 1151),
 ('girl', 997),
 ('wanna', 953),
 ('tell', 941),
 ('night', 854),
 ('gonna', 851),
 ('look', 845),
 ('think', 832),
 ('need', 814),
 ('right', 782),
 ('go', 751),
 ('away', 719)]

In [28]:
#display the least frequent tokens in the dataset
text_word_frequency.most_common()[-20:]

[('gimmick', 1),
 ('crumb', 1),
 ('loxvill', 1),
 ('jada', 1),
 ('jackass', 1),
 ('knoxvill', 1),
 ('thousandair', 1),
 ('outrun', 1),
 ('distribut', 1),
 ('fluid', 1),
 ('beefin', 1),
 ('relatin', 1),
 ('jason', 1),
 ('scrappin', 1),
 ('olympus', 1),
 ('psychiatrist', 1),
 ('uninspir', 1),
 ('handout', 1),
 ('welfar', 1),
 ('backstabbin', 1)]

### LDA Model Training ###

In [29]:
dictionary = gensim.corpora.Dictionary(processed_lyrics)

In [30]:
# Restrict the vocabulary that is used for training.
dictionary.filter_extremes(no_below=5, no_above=0.3)

# no_below is an absolute count: keep tokens that appear in at least that many documents (songs).
# no_above is a fraction: keep tokens that appear in no more than that share of documents (songs).

In [31]:
#Bag-of-words transformation
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_lyrics]

In [32]:
#testing on the same song https://genius.com/Lady-gaga-and-blackpink-sour-candy-lyrics
bow_doc_342 = bow_corpus[342]

for i in range(len(bow_doc_342)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_342[i][0], 
                                                     dictionary[bow_doc_342[i][0]], 
                                                     bow_doc_342[i][1]))

Word 5 ("light") appears 2 time.
Word 33 ("hard") appears 4 time.
Word 75 ("littl") appears 2 time.
Word 140 ("turn") appears 2 time.
Word 142 ("break") appears 1 time.
Word 144 ("eye") appears 1 time.
Word 202 ("mean") appears 1 time.
Word 242 ("wanna") appears 1 time.
Word 246 ("sweet") appears 2 time.
Word 259 ("real") appears 1 time.
Word 315 ("angri") appears 2 time.
Word 379 ("close") appears 1 time.
Word 471 ("insid") appears 6 time.
Word 496 ("bite") appears 8 time.
Word 584 ("candi") appears 13 time.
Word 618 ("tast") appears 1 time.
Word 696 ("mess") appears 1 time.
Word 718 ("outsid") appears 4 time.
Word 896 ("fake") appears 1 time.
Word 986 ("crazi") appears 2 time.
Word 1027 ("nice") appears 1 time.
Word 1296 ("super") appears 2 time.
Word 1374 ("psycho") appears 2 time.
Word 1384 ("undress") appears 1 time.
Word 1761 ("extra") appears 1 time.


In [33]:
# Train the LDA model with 10 topics. random_state pins the model's random initialisation
# (same value as the NumPy seed set earlier) so a fresh Restart & Run All starts from the
# same state. NOTE(review): with workers=2 the result may still not be bit-for-bit
# identical between runs — confirm if exact reproducibility matters.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2, random_state=2018)

In [34]:
#print out the most prominent tokens for each topic
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.022*"away" + 0.015*"gonna" + 0.011*"work" + 0.010*"gimm" + 0.008*"nigga" + 0.008*"leav" + 0.008*"real" + 0.008*"sorri" + 0.007*"say" + 0.007*"life"
Topic: 1 
Words: 0.035*"need" + 0.033*"girl" + 0.016*"wanna" + 0.014*"bring" + 0.013*"good" + 0.010*"admit" + 0.010*"bodi" + 0.010*"gonna" + 0.009*"shake" + 0.008*"mind"
Topic: 2 
Words: 0.017*"heart" + 0.011*"need" + 0.010*"play" + 0.010*"bitch" + 0.009*"eye" + 0.009*"light" + 0.009*"hold" + 0.008*"littl" + 0.008*"hand" + 0.007*"care"
Topic: 3 
Words: 0.020*"turn" + 0.019*"babe" + 0.013*"thing" + 0.012*"dream" + 0.011*"high" + 0.010*"hear" + 0.009*"say" + 0.008*"woah" + 0.008*"wanna" + 0.008*"good"
Topic: 4 
Words: 0.018*"life" + 0.014*"woman" + 0.014*"girl" + 0.013*"say" + 0.011*"chang" + 0.010*"gotta" + 0.010*"money" + 0.009*"thing" + 0.009*"friend" + 0.009*"good"
Topic: 5 
Words: 0.036*"nigga" + 0.026*"fuck" + 0.020*"bitch" + 0.017*"shit" + 0.014*"need" + 0.014*"wanna" + 0.010*"thing" + 0.010*"nothin" + 0.009*"life" +

In [35]:
#testing it on the same song https://genius.com/Lady-gaga-and-blackpink-sour-candy-lyrics
for index, score in sorted(lda_model[bow_corpus[342]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.5326711535453796	 
Topic: 0.022*"away" + 0.015*"gonna" + 0.011*"work" + 0.010*"gimm" + 0.008*"nigga" + 0.008*"leav" + 0.008*"real" + 0.008*"sorri" + 0.007*"say" + 0.007*"life"

Score: 0.4548259377479553	 
Topic: 0.035*"stand" + 0.030*"roll" + 0.021*"rock" + 0.015*"everybodi" + 0.015*"hear" + 0.013*"lose" + 0.012*"cold" + 0.011*"care" + 0.011*"reveng" + 0.010*"bodi"


### LDA Visualization ###

In [36]:
!pip install pyldavis



In [37]:
import pickle
import pyLDAvis

  from collections import Iterable
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [38]:
import pyLDAvis.gensim_models as gensimvis

In [39]:
pyLDAvis.enable_notebook()

In [40]:
LDAvis_prepared = gensimvis.prepare(lda_model, bow_corpus, dictionary)

In [41]:
LDAvis_prepared

### Recommender ###

In [42]:
doc_lda = lda_model[bow_corpus]

In [43]:
from gensim import similarities

In [44]:
index = similarities.MatrixSimilarity(doc_lda)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if np.issubdtype(vec.dtype, np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if np.issubdtype(vec.dtype, np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if np.issubdtype(vec.dtype, np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if np.issubdtype(vec.dtype, np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if np.issubdtype(vec.dtype, np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if np.issubdtype(vec.dtype, np.int):
Deprecated in NumPy 1.20; for more

In [45]:
def song_recommender(url):
    """Print a lyric profile and the most similar songs for ``url``.

    Looks ``url`` up in the first column of ``lyrics_df`` (the first matching row is
    used), then prints the song's ten most frequent dictionary tokens and its
    highest-scoring LDA topic, and displays up to 20 ``[url, similarity]`` pairs ranked
    by similarity to the song's LDA vector. If the URL is not in the dataset an apology
    message is printed instead.

    Args:
        url: URL to look up; compared for equality against column 0 of ``lyrics_df``.

    Returns:
        None. All results are printed / displayed.
    """
    urls = lyrics_df.iloc[:, 0].tolist()
    if url not in urls:
        print('Sorry, but it looks like "{}" is not available.'.format(url))
        return
    song_idx = urls.index(url)

    lda_vectors = doc_lda[song_idx]
    sims = index[lda_vectors]
    # Exclude the query song by its row position. Dropping "whatever ranks first" is not
    # reliable: several songs can tie on similarity, in which case the query song is not
    # necessarily first and a genuine recommendation would be dropped instead.
    recommendation = sorted(
        ([urls[n], score] for n, score in enumerate(sims) if n != song_idx),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print("Your song's most prominent tokens are:")
    top_tokens = sorted(bow_corpus[song_idx], key=lambda pair: pair[1], reverse=True)[:10]
    for token_id, count in top_tokens:
        print("Word {} (\"{}\") appears {} time(s).".format(token_id, dictionary[token_id], count))
    print('-----')
    print("Your song's most prominant topic is:")
    print(lda_model.print_topic(max(lda_vectors, key=lambda item: item[1])[0]))
    print('-----')
    print('Here are your recommendations for "{}":'.format(url))
    display(recommendation[:20])
           

In [46]:
song_recommender('https://genius.com/The-beatles-yesterday-lyrics')

Your song's most prominent tokens are:
Word 60 ("yesterday") appears 9 time(s).
Word 44 ("away") appears 3 time(s).
Word 45 ("believ") appears 3 time(s).
Word 46 ("easi") appears 2 time(s).
Word 47 ("game") appears 2 time(s).
Word 50 ("hide") appears 2 time(s).
Word 51 ("long") appears 2 time(s).
Word 52 ("need") appears 2 time(s).
Word 53 ("place") appears 2 time(s).
Word 54 ("play") appears 2 time(s).
-----
Your song's most prominant topic is:
0.027*"wanna" + 0.025*"gonna" + 0.018*"long" + 0.017*"danc" + 0.016*"girl" + 0.013*"wait" + 0.012*"hold" + 0.011*"life" + 0.011*"home" + 0.010*"away"
-----
Here are your recommendations for "https://genius.com/The-beatles-yesterday-lyrics":


[['https://genius.com/Elvis-presley-love-me-tender-lyrics', 1.0],
 ['https://genius.com/Elton-john-rocket-man-i-think-its-going-to-be-a-long-long-time-lyrics',
  1.0],
 ['https://genius.com/Eagles-new-kid-in-town-lyrics', 1.0],
 ['https://genius.com/Whitney-houston-i-have-nothing-lyrics', 1.0],
 ['https://genius.com/Whitney-houston-i-wanna-dance-with-somebody-who-loves-me-lyrics',
  1.0],
 ['https://genius.com/Whitney-houston-its-not-right-but-its-okay-lyrics', 1.0],
 ['https://genius.com/Whitney-houston-run-to-you-lyrics', 1.0],
 ['https://genius.com/Drake-hold-on-were-going-home-lyrics', 1.0],
 ['https://genius.com/Garth-brooks-friends-in-low-places-live-version-lyrics',
  1.0],
 ['https://genius.com/Garth-brooks-friends-in-low-places-lyrics', 1.0],
 ['https://genius.com/Ed-sheeran-and-justin-bieber-i-dont-care-lyrics', 1.0],
 ['https://genius.com/Billy-joel-vienna-lyrics', 1.0],
 ['https://genius.com/Phil-collins-another-day-in-paradise-lyrics', 1.0],
 ['https://genius.com/Abba-danc

In [47]:
song_recommender('https://genius.com/Lady-gaga-and-blackpink-sour-candy-lyrics')

Your song's most prominent tokens are:
Word 584 ("candi") appears 13 time(s).
Word 496 ("bite") appears 8 time(s).
Word 471 ("insid") appears 6 time(s).
Word 33 ("hard") appears 4 time(s).
Word 718 ("outsid") appears 4 time(s).
Word 5 ("light") appears 2 time(s).
Word 75 ("littl") appears 2 time(s).
Word 140 ("turn") appears 2 time(s).
Word 246 ("sweet") appears 2 time(s).
Word 315 ("angri") appears 2 time(s).
-----
Your song's most prominant topic is:
0.022*"away" + 0.015*"gonna" + 0.011*"work" + 0.010*"gimm" + 0.008*"nigga" + 0.008*"leav" + 0.008*"real" + 0.008*"sorri" + 0.007*"say" + 0.007*"life"
-----
Here are your recommendations for "https://genius.com/Lady-gaga-and-blackpink-sour-candy-lyrics":


[['https://genius.com/The-rolling-stones-wild-horses-lyrics', 0.91451645],
 ['https://genius.com/Coldplay-everglow-lyrics', 0.913496],
 ['https://genius.com/Green-day-holiday-lyrics', 0.8875401],
 ['https://genius.com/Alabama-shakes-future-people-lyrics', 0.8774216],
 ['https://genius.com/Adele-hello-lyrics', 0.8600352],
 ['https://genius.com/Bruce-springsteen-the-river-lyrics', 0.8502741],
 ['https://genius.com/George-strait-jingle-bell-rock-lyrics', 0.8287339],
 ['https://genius.com/Aretha-franklin-walk-in-the-light-lyrics', 0.8234769],
 ['https://genius.com/James-taylor-shower-the-people-lyrics', 0.81181896],
 ['https://genius.com/Lady-gaga-and-ariana-grande-rain-on-me-lyrics',
  0.80866313],
 ['https://genius.com/Chicago-mass-choir-i-pray-well-be-ready-live-lyrics',
  0.7891496],
 ['https://genius.com/Eminem-river-lyrics', 0.78286666],
 ['https://genius.com/Kanye-west-ultralight-beam-lyrics', 0.77873224],
 ['https://genius.com/Gloria-estefan-hoy-lyrics', 0.7689538],
 ['https://geni

In [48]:
song_recommender("https://genius.com/Eminem-lose-yourself-lyrics")

Your song's most prominent tokens are:
Word 84 ("better") appears 16 time(s).
Word 39 ("shoot") appears 9 time(s).
Word 147 ("moment") appears 8 time(s).
Word 100 ("blow") appears 7 time(s).
Word 2 ("chanc") appears 6 time(s).
Word 9 ("music") appears 6 time(s).
Word 289 ("miss") appears 6 time(s).
Word 605 ("lose") appears 6 time(s).
Word 1143 ("lifetim") appears 6 time(s).
Word 63 ("grow") appears 4 time(s).
-----
Your song's most prominant topic is:
0.022*"away" + 0.015*"gonna" + 0.011*"work" + 0.010*"gimm" + 0.008*"nigga" + 0.008*"leav" + 0.008*"real" + 0.008*"sorri" + 0.007*"say" + 0.007*"life"
-----
Here are your recommendations for "https://genius.com/Eminem-lose-yourself-lyrics":


[['https://genius.com/Eminem-stan-lyrics', 0.98981816],
 ['https://genius.com/Nicki-minaj-feeling-myself-lyrics', 0.9811678],
 ['https://genius.com/Bj-the-chicago-kid-smokin-and-ridin-lyrics', 0.9807109],
 ['https://genius.com/Elvis-presley-in-the-ghetto-lyrics', 0.9806458],
 ['https://genius.com/The-rolling-stones-gimme-shelter-lyrics', 0.9765781],
 ['https://genius.com/Gloria-estefan-hoy-lyrics', 0.97617775],
 ['https://genius.com/Red-hot-chili-peppers-cant-stop-lyrics', 0.97506255],
 ['https://genius.com/Mariah-carey-gtfo-lyrics', 0.97312856],
 ['https://genius.com/Dire-straits-brothers-in-arms-lyrics', 0.97190046],
 ['https://genius.com/Elvis-presley-heartbreak-hotel-lyrics', 0.9717324],
 ['https://genius.com/Elton-john-goodbye-yellow-brick-road-lyrics', 0.9717324],
 ['https://genius.com/Elton-john-bennie-and-the-jets-lyrics', 0.9717324],
 ['https://genius.com/Elton-john-dont-let-the-sun-go-down-on-me-lyrics',
  0.9717324],
 ['https://genius.com/Rihanna-work-lyrics', 0.9717324],
 [

In [49]:
test = song_recommender("https://genius.com/Nicki-minaj-feeling-myself-lyrics")

Your song's most prominent tokens are:
Word 1012 ("feelin") appears 41 time(s).
Word 687 ("bitch") appears 5 time(s).
Word 835 ("nigga") appears 5 time(s).
Word 55 ("say") appears 4 time(s).
Word 163 ("girl") appears 4 time(s).
Word 197 ("work") appears 4 time(s).
Word 1458 ("ridin") appears 4 time(s).
Word 23 ("world") appears 3 time(s).
Word 260 ("stop") appears 3 time(s).
Word 1483 ("texa") appears 3 time(s).
-----
Your song's most prominant topic is:
0.022*"away" + 0.015*"gonna" + 0.011*"work" + 0.010*"gimm" + 0.008*"nigga" + 0.008*"leav" + 0.008*"real" + 0.008*"sorri" + 0.007*"say" + 0.007*"life"
-----
Here are your recommendations for "https://genius.com/Nicki-minaj-feeling-myself-lyrics":


[['https://genius.com/Bj-the-chicago-kid-smokin-and-ridin-lyrics', 0.99891806],
 ['https://genius.com/Eminem-stan-lyrics', 0.99407154],
 ['https://genius.com/Red-hot-chili-peppers-cant-stop-lyrics', 0.99361575],
 ['https://genius.com/Gloria-estefan-hoy-lyrics', 0.9922431],
 ['https://genius.com/Kanye-west-ultralight-beam-lyrics', 0.990904],
 ['https://genius.com/Elvis-presley-heartbreak-hotel-lyrics', 0.99083],
 ['https://genius.com/Elton-john-goodbye-yellow-brick-road-lyrics', 0.99083],
 ['https://genius.com/Elton-john-bennie-and-the-jets-lyrics', 0.99083],
 ['https://genius.com/Elton-john-dont-let-the-sun-go-down-on-me-lyrics',
  0.99083],
 ['https://genius.com/Rihanna-work-lyrics', 0.99083],
 ['https://genius.com/Rihanna-kiss-it-better-lyrics', 0.99083],
 ['https://genius.com/Rihanna-sex-with-me-lyrics', 0.99083],
 ['https://genius.com/Rihanna-diamonds-lyrics', 0.99083],
 ['https://genius.com/Taylor-swift-the-1-lyrics', 0.99083],
 ['https://genius.com/Justin-bieber-sorry-lyrics', 0.

### Web Connection

In [50]:
!pip install anvil-uplink

Collecting anvil-uplink
[?25l  Downloading https://files.pythonhosted.org/packages/9a/65/776713490bfd5145ddb87834355bf7936bd233b273098e37dc12f1ac253c/anvil_uplink-0.3.36-py2.py3-none-any.whl (61kB)
[K     |█████▍                          | 10kB 16.5MB/s eta 0:00:01[K     |██████████▊                     | 20kB 19.3MB/s eta 0:00:01[K     |████████████████                | 30kB 11.6MB/s eta 0:00:01[K     |█████████████████████▍          | 40kB 9.5MB/s eta 0:00:01[K     |██████████████████████████▊     | 51kB 8.0MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 4.6MB/s 
[?25hCollecting ws4py
[?25l  Downloading https://files.pythonhosted.org/packages/53/20/4019a739b2eefe9282d3822ef6a225250af964b117356971bd55e274193c/ws4py-0.5.1.tar.gz (51kB)
[K     |██████▍                         | 10kB 21.9MB/s eta 0:00:01[K     |████████████▊                   | 20kB 27.5MB/s eta 0:00:01[K     |███████████████████▏            | 30kB 16.3MB/s eta 0:00:01[K     |███████

In [51]:
import anvil.server

In [52]:
# Read the Anvil uplink key from the environment (or prompt for it) instead of committing
# it to the notebook. The key that used to be hard-coded here was published together with
# the notebook and should be rotated in the Anvil app settings.
import os
from getpass import getpass

anvil.server.connect(os.environ.get("ANVIL_UPLINK_KEY") or getpass("Anvil uplink key: "))

Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default environment (dev)" as SERVER


In [53]:
def song_recommendation(url):
    """Build a one-string summary (top tokens + recommendations) for ``url``.

    Looks ``url`` up in the first column of ``lyrics_df`` (the first matching row is
    used). The summary lists the song's ten most frequent dictionary tokens followed by
    up to 20 ``[url, similarity]`` pairs ranked by similarity to the song's LDA vector.

    Args:
        url: URL to look up; compared for equality against column 0 of ``lyrics_df``.

    Returns:
        The summary string, or an apology string when the URL is not in the dataset.
    """
    urls = lyrics_df.iloc[:, 0].tolist()
    if url not in urls:
        return 'Sorry, but it looks like "{}" is not available.'.format(url)
    song_idx = urls.index(url)

    sims = index[doc_lda[song_idx]]
    # Exclude the query song by row position; with tied scores it is not guaranteed to
    # be the top-ranked entry, so slicing off position 0 can drop a real recommendation.
    ranked = sorted(
        ([urls[n], score] for n, score in enumerate(sims) if n != song_idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    recommendations = ranked[:20]

    top_tokens = sorted(bow_corpus[song_idx], key=lambda pair: pair[1], reverse=True)[:10]
    # Join all token descriptions; assigning inside a loop kept only the last one and
    # left the name undefined for a song with an empty bag-of-words.
    tokens = " ".join(
        "Word {} (\"{}\") appears {} time(s).".format(token_id, dictionary[token_id], count)
        for token_id, count in top_tokens
    )

    return "The most prominent tokens of your song are {}, recommendations for your song are: {}".format(tokens, recommendations)

    

In [54]:
def song_recommendation_token(url):
    """Describe the ten most frequent dictionary tokens of the song at ``url``.

    Looks ``url`` up in the first column of ``lyrics_df`` (the first matching row is
    used) and reads the song's bag-of-words from ``bow_corpus``.

    Args:
        url: URL to look up; compared for equality against column 0 of ``lyrics_df``.

    Returns:
        A sentence listing the top tokens, or an apology string when the URL is not in
        the dataset.
    """
    urls = lyrics_df.iloc[:, 0].tolist()
    if url not in urls:
        return 'Sorry, but it looks like "{}" is not available.'.format(url)
    song_idx = urls.index(url)

    top_tokens = sorted(bow_corpus[song_idx], key=lambda pair: pair[1], reverse=True)[:10]
    # Join all token descriptions; assigning inside a loop kept only the last one and
    # left the name undefined for a song with an empty bag-of-words.
    tokens = " ".join(
        "Word {} (\"{}\") appears {} time(s).".format(token_id, dictionary[token_id], count)
        for token_id, count in top_tokens
    )

    return "The most prominent tokens of your song are {}".format(tokens)

In [55]:
def song_recommendation_rec1(url):
    """Return the best-ranked recommendation for the song at ``url``.

    Looks ``url`` up in the first column of ``lyrics_df`` (the first matching row is
    used) and ranks every other song by similarity to the song's LDA vector.

    Args:
        url: URL to look up; compared for equality against column 0 of ``lyrics_df``.

    Returns:
        ``[recommended_url, similarity]`` for the 1st-ranked song, or an apology string
        when the URL is not in the dataset.

    Raises:
        IndexError: if the dataset holds no other song to recommend.
    """
    rank = 1  # 1-based position in the ranking served by this function
    urls = lyrics_df.iloc[:, 0].tolist()
    if url not in urls:
        return 'Sorry, but it looks like "{}" is not available.'.format(url)
    song_idx = urls.index(url)

    sims = index[doc_lda[song_idx]]
    # Exclude the query song by row position; with tied scores it is not guaranteed to
    # rank first, so skipping position 0 could return the query song itself.
    # Scores are converted to plain floats so the result holds only built-in types
    # (NOTE(review): presumably needed for Anvil to serialise the value — confirm).
    ranked = sorted(
        ([urls[n], float(score)] for n, score in enumerate(sims) if n != song_idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[rank - 1]

In [56]:
def song_recommendation_rec2(url):
    """Return the second-ranked recommendation for the song at ``url``.

    Looks ``url`` up in the first column of ``lyrics_df`` (the first matching row is
    used) and ranks every other song by similarity to the song's LDA vector.

    Args:
        url: URL to look up; compared for equality against column 0 of ``lyrics_df``.

    Returns:
        ``[recommended_url, similarity]`` for the 2nd-ranked song, or an apology string
        when the URL is not in the dataset.

    Raises:
        IndexError: if the dataset holds fewer than two other songs.
    """
    rank = 2  # 1-based position in the ranking served by this function
    urls = lyrics_df.iloc[:, 0].tolist()
    if url not in urls:
        return 'Sorry, but it looks like "{}" is not available.'.format(url)
    song_idx = urls.index(url)

    sims = index[doc_lda[song_idx]]
    # Exclude the query song by row position; with tied scores it is not guaranteed to
    # rank first, so skipping position 0 could return the query song itself.
    # Scores are converted to plain floats so the result holds only built-in types
    # (NOTE(review): presumably needed for Anvil to serialise the value — confirm).
    ranked = sorted(
        ([urls[n], float(score)] for n, score in enumerate(sims) if n != song_idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[rank - 1]

In [57]:
def song_recommendation_rec3(url):
    """Return the third-ranked recommendation for the song at ``url``.

    Looks ``url`` up in the first column of ``lyrics_df`` (the first matching row is
    used) and ranks every other song by similarity to the song's LDA vector.

    Args:
        url: URL to look up; compared for equality against column 0 of ``lyrics_df``.

    Returns:
        ``[recommended_url, similarity]`` for the 3rd-ranked song, or an apology string
        when the URL is not in the dataset.

    Raises:
        IndexError: if the dataset holds fewer than three other songs.
    """
    rank = 3  # 1-based position in the ranking served by this function
    urls = lyrics_df.iloc[:, 0].tolist()
    if url not in urls:
        return 'Sorry, but it looks like "{}" is not available.'.format(url)
    song_idx = urls.index(url)

    sims = index[doc_lda[song_idx]]
    # Exclude the query song by row position; with tied scores it is not guaranteed to
    # rank first, so skipping position 0 could return the query song itself.
    # Scores are converted to plain floats so the result holds only built-in types
    # (NOTE(review): presumably needed for Anvil to serialise the value — confirm).
    ranked = sorted(
        ([urls[n], float(score)] for n, score in enumerate(sims) if n != song_idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[rank - 1]

In [58]:
def song_recommendation_rec4(url):
    """Return the fourth-ranked recommendation for the song at ``url``.

    Looks ``url`` up in the first column of ``lyrics_df`` (the first matching row is
    used) and ranks every other song by similarity to the song's LDA vector.

    Args:
        url: URL to look up; compared for equality against column 0 of ``lyrics_df``.

    Returns:
        ``[recommended_url, similarity]`` for the 4th-ranked song, or an apology string
        when the URL is not in the dataset.

    Raises:
        IndexError: if the dataset holds fewer than four other songs.
    """
    rank = 4  # 1-based position in the ranking served by this function
    urls = lyrics_df.iloc[:, 0].tolist()
    if url not in urls:
        return 'Sorry, but it looks like "{}" is not available.'.format(url)
    song_idx = urls.index(url)

    sims = index[doc_lda[song_idx]]
    # Exclude the query song by row position; with tied scores it is not guaranteed to
    # rank first, so skipping position 0 could return the query song itself.
    # Scores are converted to plain floats so the result holds only built-in types
    # (NOTE(review): presumably needed for Anvil to serialise the value — confirm).
    ranked = sorted(
        ([urls[n], float(score)] for n, score in enumerate(sims) if n != song_idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[rank - 1]

In [59]:
def song_recommendation_rec5(url):
    """Return the fifth-ranked recommendation for the song at ``url``.

    Looks ``url`` up in the first column of ``lyrics_df`` (the first matching row is
    used) and ranks every other song by similarity to the song's LDA vector.

    Args:
        url: URL to look up; compared for equality against column 0 of ``lyrics_df``.

    Returns:
        ``[recommended_url, similarity]`` for the 5th-ranked song, or an apology string
        when the URL is not in the dataset.

    Raises:
        IndexError: if the dataset holds fewer than five other songs.
    """
    rank = 5  # 1-based position in the ranking served by this function
    urls = lyrics_df.iloc[:, 0].tolist()
    if url not in urls:
        return 'Sorry, but it looks like "{}" is not available.'.format(url)
    song_idx = urls.index(url)

    sims = index[doc_lda[song_idx]]
    # Exclude the query song by row position; with tied scores it is not guaranteed to
    # rank first, so skipping position 0 could return the query song itself.
    # Scores are converted to plain floats so the result holds only built-in types
    # (NOTE(review): presumably needed for Anvil to serialise the value — confirm).
    ranked = sorted(
        ([urls[n], float(score)] for n, score in enumerate(sims) if n != song_idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[rank - 1]

In [60]:
@anvil.server.callable
def song_recommendations_token(url):
  """Anvil-callable wrapper: return song_recommendation_token(url) unchanged."""
  return song_recommendation_token(url)

In [61]:
@anvil.server.callable
def song_recommendations(url):
  """Anvil-callable wrapper: return song_recommendation(url) unchanged."""
  return song_recommendation(url)

In [62]:
@anvil.server.callable
def song_recommendations_rec1(url):
  """Anvil-callable wrapper: return song_recommendation_rec1(url) unchanged."""
  return song_recommendation_rec1(url)

In [63]:
@anvil.server.callable
def song_recommendations_rec2(url):
  """Anvil-callable wrapper: return song_recommendation_rec2(url) unchanged."""
  return song_recommendation_rec2(url)

In [64]:
@anvil.server.callable
def song_recommendations_rec3(url):
  """Anvil-callable wrapper: return song_recommendation_rec3(url) unchanged."""
  return song_recommendation_rec3(url)

In [65]:
@anvil.server.callable
def song_recommendations_rec4(url):
  """Anvil-callable wrapper: return song_recommendation_rec4(url) unchanged."""
  return song_recommendation_rec4(url)

In [66]:
@anvil.server.callable
def song_recommendations_rec5(url):
  """Anvil-callable wrapper: return song_recommendation_rec5(url) unchanged."""
  return song_recommendation_rec5(url)

In [None]:
anvil.server.wait_forever()

In [None]:
'''def recommendation_button_click(self, **event_args):
    """This method is called when the button is clicked"""
    # Call the google colab function and pass it the iris measurements
    song_recommendation = anvil.server.call('song_recommender', 
                                self.url.text)
                            
    # If a category is returned set our species
    if song_recommendation:
      self.recommendation_label.visible = True
      self.recommendation_label.text = song_recommendation'''