### Module and Data Imports

In [30]:
import pickle as pkl
import re
import string

import pandas as pd

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

In [2]:
"""
Load artist names.
"""
with open('artist_names.pkl', 'rb') as f: 
    artists = pkl.load(f)

In [3]:
"""
Load artist API paths. 
"""
with open('all_artist_paths.pkl', 'rb') as f:
    all_artist_paths = pkl.load(f)

In [4]:
"""
Load song paths for every artist. 
"""
with open('all_song_paths.pkl', 'rb') as f: 
    all_song_paths = pkl.load(f)

In [5]:
"""
Load all song lyrics.
"""
with open('all_song_lyrics_v2.pkl', 'rb') as f: 
    all_song_lyrics = pkl.load(f)

### Data Cleaning

TO-DO: 
* Remove non-songs
* Cut down to only artists' verses if possible

In [6]:
"""
STEP 1: CONVERT FROM LYRICS TO STANDARD TEXT

Rebuild the lyrics dict with cleaned, confirmed songs: 
1) Confirm the artist has a song
2) Confirm the lyrics are to a song (not an interview transcript, etc)
3) Confirm it is at least 16 lines (1+ standard verses)
4) Replace bracketed text denoting verse/chorus separation

NOTES: 
* removed verse markers for initial MVP, but may later need them to separate artist verses
* potential interference from repeated/chorus heavy rappers, could consider dropping repeat choruses
* numeric and bracket remover doesn't seem to be replacing properly 
"""
cleaned_lyrics = {}
for artist in all_song_lyrics.keys():
    
    # confirm artist has a song
    if all_song_lyrics[artist] != {}:
        
        # add the artist and create a dict for their songs
        cleaned_lyrics[artist] = {}
        for song in all_song_lyrics[artist].keys():
            is_song = False
            is_sixteen = False
            # if there is a verse or chorus marker, add the song to the artist's dict
            if '[Verse' in all_song_lyrics[artist][song] or '[Chorus' in all_song_lyrics[artist][song]:
                is_song = True
            # if the song has at least 16 lines (standard verse)
            if all_song_lyrics[artist][song].count('\n') > 14:   
                is_sixteen = True
            
            if is_song and is_sixteen:
                # replace bracketed text with a space and store to cleaned list
                cleaned_lyrics[artist][song] = re.sub('\[.*?\]', ' ', all_song_lyrics[artist][song])
                # alphanumerics
                cleaned_lyrics[artist][song] = re.sub('\w*\d\w*', ' ', cleaned_lyrics[artist][song])
                # remove punctuation and capitalization
                cleaned_lyrics[artist][song] = re.sub('[%s]' % re.escape(string.punctuation), ' ', cleaned_lyrics[artist][song].lower())

            

In [23]:
"""
Cast to DF 
"""
# Collect all lyrics in DataFrame
song_dfs = []
for artist in cleaned_lyrics.keys():
    df = pd.DataFrame([[artist, item[0], item[1]] for item in cleaned_lyrics[artist].items()], columns=['artist', 'song', 'lyrics'])
    song_dfs.append(df)
all_lyrics_df = pd.concat(song_dfs)

all_lyrics_df

Unnamed: 0,artist,song,lyrics
0,03 Greedo,/songs/3546392,\nthree purple hearts\nthree purple hearts\nt...
1,03 Greedo,/songs/3801456,\ni ve been meditating with the money\nprayin...
2,03 Greedo,/songs/3545295,\nacetheface\n havin it havin it havin ...
3,03 Greedo,/songs/4485607,\nyou know me out here are you seeing \nkeep...
4,03 Greedo,/songs/3801450,\nif i go back\nturbo\n\n \ngo back to jail ...
...,...,...,...
5,Zion I,/songs/347240,yeah \n i got \n go \n\n \nain t not...
6,Zion I,/songs/48980,\nnow in a world full of pain hard strugglin...
7,Zion I,/songs/48968,\nzion i crew\namp live dj kg zion\nwe re l...
8,Zion I,/songs/48960,\nthis ain t for the weak hearted go on and ...


In [25]:
# Train-Test Split
X = all_lyrics_df.lyrics
y = all_lyrics_df.song


In [27]:
# Generate TF-IDF Matrix
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(X)


In [32]:
X_tfidf.shape

(18278, 114747)

In [21]:
x = '[Intro: Nickelus F]\nNiggas in the city hating me man they tripping\nWhen I\'m the only one that\'s really spitting\nMy nigga Lil Lee the only one that\'s really gripping\nWhy you dap me up acting like like we cool if you don\'t feel him nigga?\nUh, I\'m bringing back hip hop\nMe and my nigga Drake\nVA, Toronto, Hip-Hop\n\n[Verse 1: Nickelus F]\nAiight let me take my jacket off\nUnlace the Timbs, here again comes the jabberjaw\nThis time I\'m blabbing on\nThese rappers that feel a certain way but they give me dap and all\nMad cuz the freestyle iller than they whole catalog\nI got the gift like Santa Claus\nFlow got spring like a catapult\nMatter fact the flow Supreme like Diana Ross\nI\'m at the do\' like a deer with the antlers off\nNiggas acting like Scummy was they nigga\nWouldn\'t even give my man a 12 bar feature\nI know who you are cause he wanted me to eat ya\nI said nah lets keep it peaceful\nMy nigga was a true weed source\nHe gave it to us for a cheap cost, at prices we all could eat off\nA real street dog\nThe type of nigga that y\'all niggas be studying learning how to get street off\nWe have an impostor, alert the doctor\nBout to hit him up with the chopper like Blaka\nShock em like Blanka from Street Fighter, heat got a\nHurricane kick, murder everything quick\nLook, my new Ryu nine is stupendous\nWhy do guys choose my crew to pick wit\nMy crew lie you guys into ditches\nI said it and meant it the Scorpio\'s Mind has many dimensions\nI\'ll say it in intent, sentence, indent, on instant pimpin\nHope your listenin\' to my diction\nIf you are then you now should be itchin\'\nThat it\'s crack we placed in your stereo system\nYeah, I be at your burial trippin\'\nNickelus F from Richmond, VA\nThe Scorpio\'s Mind is not a game to play wit all day\n\n[Verse 2: Drake]\nI\'ve been hated by many, wanted by plenty\nDisliked by some, but confronted by none\nSince they don\'t show me sincerity, I load up, lock up\nTake shots at em I guess you could call it a parody\nAnd compared to D\nThey one-fourth from watermelon to a quarter felon, dude you a pear to me\nIf that\'s not how it is it\'s how it appear to be\nYou got blind heaters, in my sweats is a mind reader\nAnd when the psychic get to touching my palm\nSeein\' your physical, the things that you never say to me visible\nEspecially when one of your artists feeling threatened\nCause I\'m harnessing a weapon, won\'t you pardon my reflection\nMirror, mirror tell me why they wanna get and scrimmage\nAnd play around, to perfection I\'m the spitting image\nMy verbal camp is vivid, I told you I\'m spitting image\nIt seems we often want to start but never get to finish\nMy verbal campus is Villanova, and those of you feelin Hova\nAnd writing college rhymes, look the thrill is over\nLet me assist you like a specialist\nSo you can pull it back and try catch the metaphors and the rest of this\nIt\'s not a problem wit X, I guess I\'m a pessimist\nWhich means if shit goes bad I say, "I expected this"\nAnd me and Julien we never got the chance to communicate\nInstead of understanding its a tune of hate\nThe city\'s mine like Oklahoma\'s a Sooner state\nAnd we\'re gonna have to cross paths whether soon or late\nSo, why don\'t you walk up in the spot using less strut\nYou ain\'t Morris Chestnut, you lighter and less cut\nAnd lets be honest, by now you should be your own scholar\nYou still a protegé, that\'s the reason I don\'t holler\nYou got rappers being repetitive actors\nYou stay ahead of the game, I\'m ahead of the practice boy'
x.count('\n')

75