In [45]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rake_nltk import Rake

import pandas as pd

lyrics_df = pd.read_csv("Rap_Lyrics_From_Different_Eras.csv")

In [46]:
def clean_data_frame(lyrics_df):
    #Remove duplicate values and where songs = artists (errors from parsing)
    lyrics_df = lyrics_df.drop_duplicates(subset=['Songs', 'Artists'], keep='first')
    lyrics_df = lyrics_df.drop(lyrics_df[lyrics_df['Songs'] == lyrics_df['Artists']].index)
    
    #Reset index to songs
    lyrics_df.set_index('Songs', inplace = True)
    lyrics_df.drop(columns = ['Unnamed: 0'], inplace = True) 
    
    return lyrics_df

In [47]:
def clean_data_frame(lyrics_df):
    #Remove duplicate values and where songs = artists (errors from parsing)
    lyrics_df = lyrics_df.drop_duplicates(subset=['Songs', 'Artists'], keep='first')
    lyrics_df = lyrics_df.drop(lyrics_df[lyrics_df['Songs'] == lyrics_df['Artists']].index)
    
    #Reset index to songs
    lyrics_df.set_index('Songs', inplace = True)
    lyrics_df.drop(columns = ['Unnamed: 0'], inplace = True) 
    
    return lyrics_df

In [48]:
clean_data_frame(lyrics_df).head(10)

Unnamed: 0_level_0,Lyrics,Artists,Era
Songs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Empire State of Mind,"\n\n\nYeah\n\n\nYeah, I'm out that Brooklyn, n...",Jay-Z,2010-2020
Niggas in Paris,"\n\n\n“We're going to skate to one song, one s...",Jay-Z,2010-2020
Holy Grail,\n\n\nYou'd take the clothes off my back and I...,Jay-Z,2010-2020
BedRock,\n\n\nI-I-I-I can make your bed rock (Young Mo...,Young Money,2010-2020
Say Something,"\n\n\nThis shit was all I knew, you and me onl...",Timbaland,2010-2020
Over,\n\n\n\n\nYoung Money\n\n\nI know way too many...,Drake,2010-2020
Fancy,"\n\n\n\n\nGo, go 'head (Go, go, go)\nGo, go, g...",Drake,2010-2020
Headlines,\n\n\n\n\nI might be too strung out on complim...,Drake,2010-2020
Make Me Proud,\n\n\nI like a woman with a future and a past\...,Drake,2010-2020
Hotline Bling,"\n\n\nYou used to call me on my\nYou used to, ...",Drake,2010-2020


In [49]:
def convert_lyrics_to_keywords(lyrics_df):
     # assigning the key words to the new column
    lyrics_df['Key_Words'] = ""
      
    for index, row in lyrics_df.iterrows():
        lyric = row['Lyrics']
        
        # instantiating Rake, by default is uses english stopwords from NLTK
        # and discard all puntuation characters
        r = Rake()

        # extracting the words by passing the text
        r.extract_keywords_from_text(lyric)

        # getting the dictionary whith key words and their scores
        key_words_dict_scores = r.get_word_degrees()
    
        # assigning the key words to the new column
        row['Key_Words'] = ' '.join(list(key_words_dict_scores.keys()))
        lyrics_df.at[index, 'Key_Words'] = row['Key_Words']


    lyrics_df.drop(columns = ['Lyrics'], inplace = True)
    
    # merging together hip hop artist names to treat as unique values
    lyrics_df['Artists_Lower'] = lyrics_df['Artists'].map(lambda x: x.split(' '))
    for index, row in lyrics_df.iterrows():
        row['Artists_Lower'] = ''.join(row['Artists_Lower']).lower()
        lyrics_df.at[index, 'Artists_Lower'] = row['Artists_Lower']

    lyrics_df['Key_Words'] = lyrics_df['Artists_Lower'] + ' ' + lyrics_df['Key_Words']
    lyrics_df.drop(columns = ['Artists_Lower'], inplace = True)
    
    return lyrics_df

In [50]:
convert_lyrics_to_keywords(lyrics_df).head(15)
clean_data_frame(lyrics_df).head(10)

Unnamed: 0_level_0,Artists,Era,Key_Words
Songs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Empire State of Mind,Jay-Z,2010-2020,jay-z lighters spiked bob marley statue made a...
Niggas in Paris,Jay-Z,2010-2020,jay-z bathroom stall fish fillet tyson ‘ cause...
Holy Grail,Jay-Z,2010-2020,jay-z fame bleed tyson feel like shit photo sh...
BedRock,Young Money,2010-2020,youngmoney pussy conscience murderers g flints...
Say Something,Timbaland,2010-2020,timbaland long gone sit want talented mr shit ...
Over,Drake,2010-2020,drake fit die fame end getting swisher shit yo...
Fancy,Drake,2010-2020,drake m3 beamer champagne range salons decline...
Headlines,Drake,2010-2020,drake want every night fell shit ‘ cause sayin...
Make Me Proud,Drake,2010-2020,drake wondering probably went want protest pop...
Hotline Bling,Drake,2010-2020,drake cell phone late night reputation champag...


In [51]:
def get_cosine_sim_matrix(lyrics_df):
    # instantiating and generating the count matrix
    count = CountVectorizer()
    count_matrix = count.fit_transform(lyrics_df['Key_Words'])
    
    # generating the cosine similarity matrix
    cosine_sim = cosine_similarity(count_matrix, count_matrix) 
    
    return cosine_sim

In [52]:
cm = get_cosine_sim_matrix(lyrics_df)

In [53]:
def get_recommended_songs(lyrics_df, cosine_sim, song):  
    recommended_songs = []
    
    # creating a Series for the song titles so they are associated to an ordered numerical
    indices = pd.DataFrame(lyrics_df.index, lyrics_df['Artists'])
    indices = indices.reset_index()
    
    # gettin the index of the song that matches the title
    idx = indices[indices['Songs'] == song].index[0]
    
    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar songs
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    # populating the list with the titles of the best 10 matching songs
    for i in top_10_indexes:
        recommended_songs.append(list(lyrics_df['Artists'])[i] + ': ' + list(lyrics_df.index)[i] + " " + list(lyrics_df['Era'])[i])
    
    return recommended_songs

In [54]:
get_recommended_songs(lyrics_df, cm, 'Empire State of Mind')

KeyError: 'Songs'