In [122]:
# Dependencies and Setup
import pandas as pd
import json
import requests
import re
import pymongo
from config import apiseeds_apikey, musixmatch_key

In [123]:
#Read csv downloaded from Kaggle
songs = pd.read_csv('top10s.csv', engine = 'python')
songs.head()

Unnamed: 0.1,Unnamed: 0,title,artist,top genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
0,1,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83
1,2,Love The Way You Lie,Eminem,detroit hip hop,2010,87,93,75,-5,52,64,263,24,23,82
2,3,TiK ToK,Kesha,dance pop,2010,120,84,76,-3,29,71,200,10,14,80
3,4,Bad Romance,Lady Gaga,dance pop,2010,119,92,70,-4,8,71,295,0,4,79
4,5,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,221,2,4,78


In [124]:
#clean songs dataframe
songs_df = songs[['title', 'artist', 'year', 'top genre']]
songs_df = songs_df.dropna(how = 'any')
songs_df.head()

Unnamed: 0,title,artist,year,top genre
0,"Hey, Soul Sister",Train,2010,neo mellow
1,Love The Way You Lie,Eminem,2010,detroit hip hop
2,TiK ToK,Kesha,2010,dance pop
3,Bad Romance,Lady Gaga,2010,dance pop
4,Just the Way You Are,Bruno Mars,2010,pop


In [125]:
#create a dataframe that shows the number of songs for each artist that hit Spotify's top ten in the years data was gathered
artists_df = pd.DataFrame(songs_df['artist'].value_counts())
artists_df = artists_df.reset_index()
artists_df = artists_df.rename(columns = {'index':'artist', 'artist':'Number of Top-10 Songs 2010-2019'})
artists_df.head()

Unnamed: 0,artist,Number of Top-10 Songs 2010-2019
0,Katy Perry,17
1,Justin Bieber,16
2,Maroon 5,15
3,Rihanna,15
4,Lady Gaga,14


In [126]:
#sort artists dataframe by year, showing artist popularity per year
yearly_artists = pd.DataFrame(songs_df.groupby(['year']).apply(lambda x: x['artist'].value_counts().reset_index()))
yearly_artists = yearly_artists.rename(columns = {'index':'artist', 'artist':'Number of Top-10 Songs'})
yearly_artists.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,artist,Number of Top-10 Songs
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010,0,Kesha,4
2010,1,Christina Aguilera,4
2010,2,The Black Eyed Peas,4
2010,3,Alicia Keys,3
2010,4,Lady Gaga,3


In [127]:
#create a dataframe that will hold lyrics for each song
lyrics_df = songs[['title', 'artist']].copy()
lyrics_df['lyrics'] = ''
lyrics_df.head()

Unnamed: 0,title,artist,lyrics
0,"Hey, Soul Sister",Train,
1,Love The Way You Lie,Eminem,
2,TiK ToK,Kesha,
3,Bad Romance,Lady Gaga,
4,Just the Way You Are,Bruno Mars,


In [128]:
#Dropped one line from dataframe that produces a JSON Decode Error when passing through apiseeds api
no_245 = lyrics_df.iloc[245]
lyrics_df = lyrics_df.drop([245])
no_245

title     Let It Go - From "Frozen / Single Version
artist                                  Demi Lovato
lyrics                                             
Name: 245, dtype: object

In [129]:
#send api requests to apiseeds to get lyrics, then place those lyrics in the lyrics dataframe
for index, row in lyrics_df.iterrows():
    base_url = 'https://orion.apiseeds.com/api/music/lyric/'
    apikey = apiseeds_apikey
    artist = row['artist']
    title = row['title']
    url = (f'{base_url}{artist}/{title}?apikey={apikey}')
    response = requests.get(url).json()
    try:
        lyrics_df.loc[index, 'lyrics'] = response['result']['track']['text']
        print(f"Processing Record {index} of {len(lyrics_df)} for: {title} by {artist}.")
    except (KeyError, IndexError, NameError):
        print('Song not found, skipping')
        next
    print('===========================')
print('All done!')

Processing Record 0 of 602 for: Hey, Soul Sister by Train.
Processing Record 1 of 602 for: Love The Way You Lie by Eminem.
Processing Record 2 of 602 for: TiK ToK by Kesha.
Processing Record 3 of 602 for: Bad Romance by Lady Gaga.
Processing Record 4 of 602 for: Just the Way You Are by Bruno Mars.
Song not found, skipping
Processing Record 6 of 602 for: Dynamite by Taio Cruz.
Processing Record 7 of 602 for: Secrets by OneRepublic.
Processing Record 8 of 602 for: Empire State of Mind (Part II) Broken Down by Alicia Keys.
Processing Record 9 of 602 for: Only Girl (In The World) by Rihanna.
Processing Record 10 of 602 for: Club Can't Handle Me (feat. David Guetta) by Flo Rida.
Processing Record 11 of 602 for: Marry You by Bruno Mars.
Processing Record 12 of 602 for: Cooler Than Me - Single Mix by Mike Posner.
Processing Record 13 of 602 for: Telephone by Lady Gaga.
Song not found, skipping
Processing Record 15 of 602 for: OMG (feat. will.i.am) by Usher.
Processing Record 16 of 602 for: Ee

Processing Record 94 of 602 for: Jar of Hearts by Christina Perri.
Song not found, skipping
Processing Record 96 of 602 for: Turning Page by Sleeping At Last.
Processing Record 97 of 602 for: Super Bass by Nicki Minaj.
Processing Record 98 of 602 for: Raise Your Glass by P!nk.
Processing Record 99 of 602 for: Invading My Mind by Jennifer Lopez.
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Son

Processing Record 228 of 602 for: Burn by Ellie Goulding.
Processing Record 229 of 602 for: She Looks So Perfect by 5 Seconds of Summer.
Processing Record 230 of 602 for: Fancy by Iggy Azalea.
Processing Record 231 of 602 for: Talk Dirty (feat. 2 Chainz) by Jason Derulo.
Processing Record 232 of 602 for: Gorilla by Bruno Mars.
Processing Record 233 of 602 for: human by Christina Perri.
Processing Record 234 of 602 for: Young Girls by Bruno Mars.
Processing Record 235 of 602 for: Wiggle (feat. Snoop Dogg) by Jason Derulo.
Processing Record 236 of 602 for: Love Runs Out by OneRepublic.
Processing Record 237 of 602 for: This Is How We Do by Katy Perry.
Processing Record 238 of 602 for: Mmm Yeah (feat. Pitbull) by Austin Mahone.
Processing Record 239 of 602 for: A Little Party Never Killed Nobody (All We Got) by Fergie.
Song not found, skipping
Song not found, skipping
Processing Record 242 of 602 for: Birthday by Katy Perry.
Song not found, skipping
Processing Record 244 of 602 for: Stay 

Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping


Processing Record 466 of 602 for: Mama by Jonas Blue.
Processing Record 467 of 602 for: Slide (feat. Frank Ocean & Migos) by Calvin Harris.
Processing Record 468 of 602 for: Swish Swish by Katy Perry.
Processing Record 469 of 602 for: Chained To The Rhythm by Katy Perry.
Processing Record 470 of 602 for: Cold (feat. Future) by Maroon 5.
Processing Record 471 of 602 for: Love by Lana Del Rey.
Song not found, skipping
Processing Record 473 of 602 for: All I Ask by Adele.
Song not found, skipping
Processing Record 475 of 602 for: The Cure by Lady Gaga.
Song not found, skipping
Processing Record 477 of 602 for: Bodak Yellow by Cardi B.
Processing Record 478 of 602 for: Rich Love (with Seeb) by OneRepublic.
Processing Record 479 of 602 for: Tired by Alan Walker.
Song not found, skipping
Processing Record 481 of 602 for: 24K Magic by Bruno Mars.
Processing Record 482 of 602 for: Strip That Down (feat. Quavo) by Liam Payne.
Processing Record 483 of 602 for: Cut To The Feeling by Carly Rae Jep

Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
Song not found, skipping
All done!


In [130]:
#take out all rows of lyrics_df that were not filled by apiseeds, append no_245
empty_df = lyrics_df.loc[lyrics_df['lyrics'] == '']
empty_df = empty_df.append(no_245)
empty_df.head()

Unnamed: 0,title,artist,lyrics
5,Baby,Justin Bieber,
14,Like A G6,Far East Movement,
30,3,Britney Spears,
31,My First Kiss - feat. Ke$ha,3OH!3,
32,Blah Blah Blah (feat. 3OH!3),Kesha,


In [131]:
number_245 = empty_df.loc[empty_df['title'] == 'Let It Go - From "Frozen / Single Version']
number_245

Unnamed: 0,title,artist,lyrics
245,"Let It Go - From ""Frozen / Single Version",Demi Lovato,


In [132]:
#use function to convert jsonp data to json format
def convert(jsonp):
    try:
        l_index = jsonp.index('(') + 1
        r_index = jsonp.rindex(')')
    except ValueError:
        print("Input is not in a jsonp format.")
        return
    res = jsonp[l_index:r_index]
    return res

In [133]:
#loop through empty_df to fill in lyrics with musixmatch api
for index, row in empty_df.iterrows():
    base_url = 'https://api.musixmatch.com/ws/1.1/matcher.lyrics.get?format=jsonp&callback=callback'
    apikey = musixmatch_key
    artist = row['artist']
    title = row['title']
    url = (f'{base_url}&q_track={title}&q_artist={artist}&apikey={apikey}')
    response = requests.get(url).text
    json_result = convert(response)
    result = json.loads(json_result)
    try:
        empty_df.loc[index, 'lyrics'] = result['message']['body']['lyrics']['lyrics_body']
        print(f"Processing Record {index} of {len(empty_df)} for: {title} by {artist}.")
    except (KeyError, IndexError, NameError, TypeError):
        print('Song not found, skipping')
        next
    print('===========================')
print('All done!')

Processing Record 5 of 368 for: Baby by Justin Bieber.
Processing Record 14 of 368 for: Like A G6 by Far East Movement.
Processing Record 30 of 368 for: 3 by Britney Spears.
Processing Record 31 of 368 for: My First Kiss - feat. Ke$ha by 3OH!3.
Processing Record 32 of 368 for: Blah Blah Blah (feat. 3OH!3) by Kesha.
Processing Record 40 of 368 for: Something's Got A Hold On Me - Burlesque Original Motion Picture Soundtrack by Christina Aguilera.
Processing Record 48 of 368 for: Castle Walls (feat. Christina Aguilera) by T.I..
Processing Record 57 of 368 for: Moves Like Jagger - Studio Recording From The Voice Performance by Maroon 5.
Processing Record 62 of 368 for: Born This Way by Lady Gaga.
Processing Record 67 of 368 for: We R Who We R by Kesha.
Processing Record 71 of 368 for: On The Floor by Jennifer Lopez.
Processing Record 72 of 368 for: What's My Name? by Rihanna.
Processing Record 77 of 368 for: E.T. by Katy Perry.
Processing Record 81 of 368 for: You And I by Lady Gaga.
Proce

Processing Record 167 of 368 for: Beneath Your Beautiful by Labrinth.
Processing Record 168 of 368 for: Let Me Love You (Until You Learn To Love Yourself) by Ne-Yo.
Processing Record 169 of 368 for: Thrift Shop (feat. Wanz) by Macklemore & Ryan Lewis.
Processing Record 170 of 368 for: If I Lose Myself - Alesso vs OneRepublic by OneRepublic.
Processing Record 171 of 368 for: The Way by Ariana Grande.
Processing Record 172 of 368 for: Suit & Tie by Justin Timberlake.
Song not found, skipping
Processing Record 174 of 368 for: I Love It (feat. Charli XCX) by Icona Pop.
Processing Record 175 of 368 for: Play Hard (feat. Ne-Yo & Akon) - New Edit by David Guetta.
Processing Record 176 of 368 for: Daylight by Maroon 5.
Processing Record 177 of 368 for: Love Somebody by Maroon 5.
Processing Record 178 of 368 for: A Little Party Never Killed Nobody (All We Got) by Fergie.
Processing Record 179 of 368 for: Move by Little Mix.
Processing Record 180 of 368 for: Walks Like Rihanna by The Wanted.
Pro

Processing Record 332 of 368 for: American Oxygen by Rihanna.
Processing Record 333 of 368 for: Bang Bang by Jessie J.
Processing Record 334 of 368 for: Reality - Radio Edit by Lost Frequencies.
Processing Record 335 of 368 for: Alive by Sia.
Processing Record 336 of 368 for: Sugar (feat. Francesco Yates) by Robin Schulz.
Processing Record 337 of 368 for: Been You by Justin Bieber.
Processing Record 338 of 368 for: Prayer in C - Robin Schulz Radio Edit by Lilly Wood and The Prick.
Processing Record 339 of 368 for: See You Again (feat. Charlie Puth) by Wiz Khalifa.
Processing Record 340 of 368 for: Heroes (we could be) by Alesso.
Processing Record 341 of 368 for: Feel The Light - From The "Home" Soundtrack by Jennifer Lopez.
Processing Record 342 of 368 for: Perfect by One Direction.
Processing Record 343 of 368 for: Ghosttown by Madonna.
Processing Record 344 of 368 for: Bang My Head (feat. Sia & Fetty Wap) by David Guetta.
Processing Record 345 of 368 for: Bloodstream by Ed Sheeran.
P

Processing Record 422 of 368 for: Wish That You Were Here - From “Miss Peregrine’s Home for Peculiar Children” Original Motion Picture by Florence + The Machine.
Processing Record 431 of 368 for: Start by John Legend.
Processing Record 435 of 368 for: One Call Away (feat. Tyga) - Remix by Charlie Puth.
Processing Record 438 of 368 for: Do You Wanna Come Over? by Britney Spears.
Processing Record 440 of 368 for: Picky - Remix by Joey Montana.
Processing Record 441 of 368 for: Behind Your Back by Nelly Furtado.
Processing Record 445 of 368 for: Starboy by The Weeknd.
Processing Record 463 of 368 for: There for You by Martin Garrix.
Processing Record 472 of 368 for: Reggaetón Lento (Remix) by CNCO.
Processing Record 474 of 368 for: First Time by Kygo.
Processing Record 476 of 368 for: How Far I'll Go - From "Moana" by Alessia Cara.
Processing Record 480 of 368 for: Came Here for Love by Sigala.
Processing Record 484 of 368 for: OK - Spotify Version by Robin Schulz.
Processing Record 486 o

Processing Record 577 of 368 for: South of the Border (feat. Camila Cabello & Cardi B) by Ed Sheeran.
Processing Record 578 of 368 for: Trampoline (with ZAYN) by SHAED.
Processing Record 579 of 368 for: Happier by Marshmello.
Processing Record 580 of 368 for: Truth Hurts by Lizzo.
Processing Record 581 of 368 for: Good as Hell (feat. Ariana Grande) - Remix by Lizzo.
Processing Record 582 of 368 for: Higher Love by Kygo.
Processing Record 583 of 368 for: Only Human by Jonas Brothers.
Processing Record 584 of 368 for: Beautiful People (feat. Khalid) by Ed Sheeran.
Processing Record 585 of 368 for: Sucker by Jonas Brothers.
Processing Record 586 of 368 for: Don't Call Me Up by Mabel.
Processing Record 587 of 368 for: I Don't Care (with Justin Bieber) by Ed Sheeran.
Processing Record 588 of 368 for: Talk (feat. Disclosure) by Khalid.
Processing Record 589 of 368 for: Giant (with Rag'n'Bone Man) by Calvin Harris.
Processing Record 590 of 368 for: Takeaway by The Chainsmokers.
Processing Rec

In [86]:
empty_df.head()

Unnamed: 0,title,artist,lyrics
5,Baby,Justin Bieber,"Oh, woah\nOh, woah\nOh, woah, ohh\n\nYou know ..."
14,Like A G6,Far East Movement,"Popping bottles in the ice, like a blizzard\nW..."
30,3,Britney Spears,"One, Two, Three\nNot only you and me\nGot one ..."
31,My First Kiss - feat. Ke$ha,3OH!3,My first kiss went a little like this\nAnd twi...
32,Blah Blah Blah (feat. 3OH!3),Kesha,"Badda-da-dah, badda-da-bah-bah\nComing out'cha..."


In [114]:
#drop empty rows from lyrics_df, concatenate two dataframes with lyrics
lyrics_df = lyrics_df[lyrics_df['lyrics'] != '']
all_lyrics = pd.concat([lyrics_df, empty_df])
all_lyrics.head(10)

Unnamed: 0,title,artist,lyrics
0,"Hey, Soul Sister",Train,"Hey, hey, hey\nYour lipstick stains\nOn the fr..."
1,Love The Way You Lie,Eminem,[Chorus: Rihanna]\nJust gonna stand there and ...
2,TiK ToK,Kesha,[Verse 1: Ke$ha & P Diddy]\nWake up in the mor...
3,Bad Romance,Lady Gaga,[Intro]\nOh-oh-oh-oh-oh-oh-oh-oh-oh-oh-oh-oh\n...
4,Just the Way You Are,Bruno Mars,"Oh, her eyes, her eyes make the stars look lik..."
6,Dynamite,Taio Cruz,"[Verse 1: Taio Cruz]\nI came to dance, dance, ..."
7,Secrets,OneRepublic,Secrets (secrets) are no fun\nSecrets (secrets...
8,Empire State of Mind (Part II) Broken Down,Alicia Keys,"Ooh, New York\r\nOoh, New York\r\n\r\nGrew up ..."
9,Only Girl (In The World),Rihanna,La la la la\nLa la la la\nLa la la la\nLa la l...
10,Club Can't Handle Me (feat. David Guetta),Flo Rida,You know I know how\nTo make em stop and stare...


In [88]:
#remove "\n" from lyrics
for index, row in all_lyrics.iterrows():
    lyric = row['lyrics']
    lyric = re.sub('\n', " ", lyric)
    all_lyrics.loc[index, 'lyrics'] = lyric
    
all_lyrics.head(10)

Unnamed: 0,title,artist,lyrics
0,"Hey, Soul Sister",Train,"Hey, hey, hey Your lipstick stains On the fron..."
1,Love The Way You Lie,Eminem,[Chorus: Rihanna] Just gonna stand there and w...
2,TiK ToK,Kesha,[Verse 1: Ke$ha & P Diddy] Wake up in the morn...
3,Bad Romance,Lady Gaga,[Intro] Oh-oh-oh-oh-oh-oh-oh-oh-oh-oh-oh-oh Ca...
4,Just the Way You Are,Bruno Mars,"Oh, her eyes, her eyes make the stars look lik..."
6,Dynamite,Taio Cruz,"[Verse 1: Taio Cruz] I came to dance, dance, d..."
7,Secrets,OneRepublic,Secrets (secrets) are no fun Secrets (secrets)...
8,Empire State of Mind (Part II) Broken Down,Alicia Keys,"Ooh, New York\r Ooh, New York\r \r Grew up in ..."
9,Only Girl (In The World),Rihanna,La la la la La la la la La la la la La la la l...
10,Club Can't Handle Me (feat. David Guetta),Flo Rida,You know I know how To make em stop and stare ...


In [98]:
#create new collections in mongo to hold our song/lyrics/artists dataframes
import pymongo
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

db = client.music_db

collection1 = db.songs
collection2 = db.lyrics
collection3 = db.artists
collection4 = db.words

In [99]:
#loop through songs dataframe to post each row to mongo
for index, row in songs_df.iterrows():
    title = row['title']
    artist = row['artist']
    year = row['year'] 
    genre = row['top genre']
    
    post = {'title': title, 'artist': artist, 'year': year, 'genre': genre}
    collection1.insert_one(post)

In [100]:
#validate that the songs data is in mongo 
songs = db.songs.find()

for song in songs:
    print(song)

{'_id': ObjectId('5e618b2ce97687264b63d8b7'), 'title': 'Hey, Soul Sister', 'artist': 'Train', 'year': 2010, 'genre': 'neo mellow'}
{'_id': ObjectId('5e618b2ce97687264b63d8b8'), 'title': 'Love The Way You Lie', 'artist': 'Eminem', 'year': 2010, 'genre': 'detroit hip hop'}
{'_id': ObjectId('5e618b2ce97687264b63d8b9'), 'title': 'TiK ToK', 'artist': 'Kesha', 'year': 2010, 'genre': 'dance pop'}
{'_id': ObjectId('5e618b2ce97687264b63d8ba'), 'title': 'Bad Romance', 'artist': 'Lady Gaga', 'year': 2010, 'genre': 'dance pop'}
{'_id': ObjectId('5e618b2ce97687264b63d8bb'), 'title': 'Just the Way You Are', 'artist': 'Bruno Mars', 'year': 2010, 'genre': 'pop'}
{'_id': ObjectId('5e618b2ce97687264b63d8bc'), 'title': 'Baby', 'artist': 'Justin Bieber', 'year': 2010, 'genre': 'canadian pop'}
{'_id': ObjectId('5e618b2ce97687264b63d8bd'), 'title': 'Dynamite', 'artist': 'Taio Cruz', 'year': 2010, 'genre': 'dance pop'}
{'_id': ObjectId('5e618b2ce97687264b63d8be'), 'title': 'Secrets', 'artist': 'OneRepublic',

{'_id': ObjectId('5e61b551e97687264b63df39'), 'title': "I'm the One (feat. Justin Bieber, Quavo, Chance the Rapper & Lil Wayne)", 'artist': 'DJ Khaled', 'year': 2017, 'genre': 'dance pop'}
{'_id': ObjectId('5e61b551e97687264b63df3a'), 'title': 'Praying', 'artist': 'Kesha', 'year': 2017, 'genre': 'dance pop'}
{'_id': ObjectId('5e61b551e97687264b63df3b'), 'title': 'Despacito - Remix', 'artist': 'Luis Fonsi', 'year': 2017, 'genre': 'latin'}
{'_id': ObjectId('5e61b551e97687264b63df3c'), 'title': 'The Greatest', 'artist': 'Sia', 'year': 2017, 'genre': 'australian dance'}
{'_id': ObjectId('5e61b551e97687264b63df3d'), 'title': 'There for You', 'artist': 'Martin Garrix', 'year': 2017, 'genre': 'big room'}
{'_id': ObjectId('5e61b551e97687264b63df3e'), 'title': 'Paris', 'artist': 'The Chainsmokers', 'year': 2017, 'genre': 'electropop'}
{'_id': ObjectId('5e61b551e97687264b63df3f'), 'title': 'Crying in the Club', 'artist': 'Camila Cabello', 'year': 2017, 'genre': 'dance pop'}
{'_id': ObjectId('5e6

In [101]:
#loop through lyrics dataframe to post each row in mongo
for index, row in all_lyrics.iterrows():
    title = row['title']
    artist = row['artist']
    lyrics = row['lyrics'] 
    
    post = {'title': title, 'artist': artist, 'lyrics': lyrics}
    collection2.insert_one(post)

In [102]:
#validate that the lyrics collection was populated
lyrics = db.lyrics.find()

for lyric in lyrics:
    print(lyric)

{'_id': ObjectId('5e618b8ee97687264b63db12'), 'title': 'Hey, Soul Sister', 'artist': 'Train', 'lyrics': "Hey, hey, hey\nYour lipstick stains\nOn the front lobe of\nMy left side brain\nI knew I wouldn't forget you\nAnd so I went and let you blow my mind\n\nYour sweet moonbeam\nThe smell of you in every single dream I dream\nI knew when we collided\nYou're the one I have decided\nWho's one of my kind.\n\nHey, soul sister\nAin't that Mr. Mister\nOn the radio, stereo\nThe way you move\nAin't fair you know\n\nHey, soul sister\nI don't wanna miss\nA single thing you do\nTonight\n\nHey, hey, hey\n\nJust in time\nI'm so glad\nYou have a one track\nMind like me.\n\nYou gave my life direction\nA game show love connection\nWe can't deny\n\nI'm so obsessed\nMy heart is bound to beat\nRight out my untrimmed chest\n\nI believe in you.\nLike a virgin\nYou're Madonna\nAnd I'm always gonna wanna\nBlow your mind\n\nHey, soul sister\nAin't that Mr. Mister\nOn the radio, stereo\nThe way you move\nAin't fa

{'_id': ObjectId('5e61b558e97687264b63e00e'), 'title': 'Good Life', 'artist': 'OneRepublic', 'lyrics': "Woke up in London yesterday Found myself in the city near Piccadilly Don't really know how I got here I got some pictures on my phone  New names and numbers that I don't know Address to places like Abbey Road Day turns to night, night turns to whatever we want We're young enough to say  Oh this has gotta be the good life This has gotta be the good life This could really be a good life, good life  Say oh, got this feeling that you can't fight Like this city is on fire tonight This could really be a good life A good, good life  To my friends in New York, I say hello My friends in L.A. they don't know Where I've been for the past few years or so Paris to China to Colorado  Sometimes there's airplanes I can't jump out Sometimes there's bullshit that don't work now We all got our stories but please tell me What there is to complain about  When you're happy like a fool Let it take you over

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [103]:
#Create a list of words from the lyrics we obtained
wordlist= []
wordfreq = []
for index, row in all_lyrics.iterrows():
    lyrics = str(row['lyrics'])
    words = lyrics.split()
    for word in words:
        wordlist.append(word)
wordlist

['Hey,',
 'hey,',
 'hey',
 'Your',
 'lipstick',
 'stains',
 'On',
 'the',
 'front',
 'lobe',
 'of',
 'My',
 'left',
 'side',
 'brain',
 'I',
 'knew',
 'I',
 "wouldn't",
 'forget',
 'you',
 'And',
 'so',
 'I',
 'went',
 'and',
 'let',
 'you',
 'blow',
 'my',
 'mind',
 'Your',
 'sweet',
 'moonbeam',
 'The',
 'smell',
 'of',
 'you',
 'in',
 'every',
 'single',
 'dream',
 'I',
 'dream',
 'I',
 'knew',
 'when',
 'we',
 'collided',
 "You're",
 'the',
 'one',
 'I',
 'have',
 'decided',
 "Who's",
 'one',
 'of',
 'my',
 'kind.',
 'Hey,',
 'soul',
 'sister',
 "Ain't",
 'that',
 'Mr.',
 'Mister',
 'On',
 'the',
 'radio,',
 'stereo',
 'The',
 'way',
 'you',
 'move',
 "Ain't",
 'fair',
 'you',
 'know',
 'Hey,',
 'soul',
 'sister',
 'I',
 "don't",
 'wanna',
 'miss',
 'A',
 'single',
 'thing',
 'you',
 'do',
 'Tonight',
 'Hey,',
 'hey,',
 'hey',
 'Just',
 'in',
 'time',
 "I'm",
 'so',
 'glad',
 'You',
 'have',
 'a',
 'one',
 'track',
 'Mind',
 'like',
 'me.',
 'You',
 'gave',
 'my',
 'life',
 'direct

In [104]:
wordcount = pd.DataFrame({'index': index, 'word':wordlist})
wordcount.head()

Unnamed: 0,index,word
0,597,"Hey,"
1,597,"hey,"
2,597,hey
3,597,Your
4,597,lipstick


In [105]:
#create a dataframe to analyse frequency of word usage in our lyrics
word_df = pd.DataFrame(wordcount['word'].value_counts()).reset_index()
word_df = word_df.rename(columns = {'index':'word', 'word':'Number of times word is used in lyrics db'})

word_df.head()

Unnamed: 0,word,Number of times word is used in lyrics db
0,I,6782
1,you,5565
2,the,5035
3,me,3052
4,to,2841


In [106]:
#loop through songs dataframe to post each row to mongo
for index, row in word_df.iterrows():
    word = row['word']
    frequency_of_use = row['Number of times word is used in lyrics db']
    
    post = {'Word': word, 'Frequency of Use': frequency_of_use}
    collection4.insert_one(post)

In [107]:
#validate that the words collection was populated
words = db.words.find()

for word in words:
    print(word)

{'_id': ObjectId('5e7049d3efb34f9d40d2b808'), 'Word': 'I', 'Frequency of Use': 6782}
{'_id': ObjectId('5e7049d4efb34f9d40d2b809'), 'Word': 'you', 'Frequency of Use': 5565}
{'_id': ObjectId('5e7049d4efb34f9d40d2b80a'), 'Word': 'the', 'Frequency of Use': 5035}
{'_id': ObjectId('5e7049d4efb34f9d40d2b80b'), 'Word': 'me', 'Frequency of Use': 3052}
{'_id': ObjectId('5e7049d4efb34f9d40d2b80c'), 'Word': 'to', 'Frequency of Use': 2841}
{'_id': ObjectId('5e7049d4efb34f9d40d2b80d'), 'Word': 'a', 'Frequency of Use': 2617}
{'_id': ObjectId('5e7049d4efb34f9d40d2b80e'), 'Word': 'it', 'Frequency of Use': 2546}
{'_id': ObjectId('5e7049d4efb34f9d40d2b80f'), 'Word': 'my', 'Frequency of Use': 2218}
{'_id': ObjectId('5e7049d4efb34f9d40d2b810'), 'Word': "I'm", 'Frequency of Use': 1994}
{'_id': ObjectId('5e7049d4efb34f9d40d2b811'), 'Word': 'your', 'Frequency of Use': 1937}
{'_id': ObjectId('5e7049d4efb34f9d40d2b812'), 'Word': 'in', 'Frequency of Use': 1749}
{'_id': ObjectId('5e7049d4efb34f9d40d2b813'), 'Word

{'_id': ObjectId('5e7049d5efb34f9d40d2bc14'), 'Word': 'hanging', 'Frequency of Use': 18}
{'_id': ObjectId('5e7049d5efb34f9d40d2bc15'), 'Word': "who's", 'Frequency of Use': 18}
{'_id': ObjectId('5e7049d5efb34f9d40d2bc16'), 'Word': 'Party', 'Frequency of Use': 18}
{'_id': ObjectId('5e7049d5efb34f9d40d2bc17'), 'Word': 'most', 'Frequency of Use': 18}
{'_id': ObjectId('5e7049d5efb34f9d40d2bc18'), 'Word': 'clouds', 'Frequency of Use': 18}
{'_id': ObjectId('5e7049d5efb34f9d40d2bc19'), 'Word': 'Judas,', 'Frequency of Use': 18}
{'_id': ObjectId('5e7049d5efb34f9d40d2bc1a'), 'Word': 'this?', 'Frequency of Use': 18}
{'_id': ObjectId('5e7049d5efb34f9d40d2bc1b'), 'Word': 'tie', 'Frequency of Use': 18}
{'_id': ObjectId('5e7049d5efb34f9d40d2bc1c'), 'Word': 'happen', 'Frequency of Use': 18}
{'_id': ObjectId('5e7049d5efb34f9d40d2bc1d'), 'Word': 'brand', 'Frequency of Use': 18}
{'_id': ObjectId('5e7049d5efb34f9d40d2bc1e'), 'Word': 'fell', 'Frequency of Use': 18}
{'_id': ObjectId('5e7049d5efb34f9d40d2bc1f

{'_id': ObjectId('5e7049d7efb34f9d40d2c3b8'), 'Word': 'learning', 'Frequency of Use': 5}
{'_id': ObjectId('5e7049d7efb34f9d40d2c3b9'), 'Word': 'key', 'Frequency of Use': 5}
{'_id': ObjectId('5e7049d7efb34f9d40d2c3ba'), 'Word': 'cigarette', 'Frequency of Use': 5}
{'_id': ObjectId('5e7049d7efb34f9d40d2c3bb'), 'Word': 'side)', 'Frequency of Use': 5}
{'_id': ObjectId('5e7049d7efb34f9d40d2c3bc'), 'Word': 'Alone', 'Frequency of Use': 5}
{'_id': ObjectId('5e7049d7efb34f9d40d2c3bd'), 'Word': 'para', 'Frequency of Use': 5}
{'_id': ObjectId('5e7049d7efb34f9d40d2c3be'), 'Word': 'lived', 'Frequency of Use': 5}
{'_id': ObjectId('5e7049d7efb34f9d40d2c3bf'), 'Word': 'whenever', 'Frequency of Use': 5}
{'_id': ObjectId('5e7049d7efb34f9d40d2c3c0'), 'Word': '(What', 'Frequency of Use': 5}
{'_id': ObjectId('5e7049d7efb34f9d40d2c3c1'), 'Word': 'eso', 'Frequency of Use': 5}
{'_id': ObjectId('5e7049d7efb34f9d40d2c3c2'), 'Word': 'million,', 'Frequency of Use': 5}
{'_id': ObjectId('5e7049d7efb34f9d40d2c3c3'), 

{'_id': ObjectId('5e7049d8efb34f9d40d2c971'), 'Word': 'wires', 'Frequency of Use': 3}
{'_id': ObjectId('5e7049d8efb34f9d40d2c972'), 'Word': 'Pitbull', 'Frequency of Use': 3}
{'_id': ObjectId('5e7049d8efb34f9d40d2c973'), 'Word': 'disappears', 'Frequency of Use': 3}
{'_id': ObjectId('5e7049d8efb34f9d40d2c974'), 'Word': 'knives', 'Frequency of Use': 3}
{'_id': ObjectId('5e7049d8efb34f9d40d2c975'), 'Word': 'mistake', 'Frequency of Use': 3}
{'_id': ObjectId('5e7049d8efb34f9d40d2c976'), 'Word': 'evening', 'Frequency of Use': 3}
{'_id': ObjectId('5e7049d8efb34f9d40d2c977'), 'Word': 'tumbling', 'Frequency of Use': 3}
{'_id': ObjectId('5e7049d8efb34f9d40d2c978'), 'Word': 'attitude', 'Frequency of Use': 3}
{'_id': ObjectId('5e7049d8efb34f9d40d2c979'), 'Word': '(Crash', 'Frequency of Use': 3}
{'_id': ObjectId('5e7049d8efb34f9d40d2c97a'), 'Word': 'tattooed', 'Frequency of Use': 3}
{'_id': ObjectId('5e7049d8efb34f9d40d2c97b'), 'Word': 'couch', 'Frequency of Use': 3}
{'_id': ObjectId('5e7049d8efb34f

{'_id': ObjectId('5e7049dbefb34f9d40d2d2b3'), 'Word': 'Ferg', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049dbefb34f9d40d2d2b4'), 'Word': 'mystery)', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049dbefb34f9d40d2d2b5'), 'Word': 'sixteen', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049dbefb34f9d40d2d2b6'), 'Word': 'doorbell', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049dbefb34f9d40d2d2b7'), 'Word': 'digging', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049dbefb34f9d40d2d2b8'), 'Word': 'brakes', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049dbefb34f9d40d2d2b9'), 'Word': 'noggin?', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049dbefb34f9d40d2d2ba'), 'Word': 'backstage,', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049dbefb34f9d40d2d2bb'), 'Word': 'bacon', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049dbefb34f9d40d2d2bc'), 'Word': 'twilight', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049dbefb34f9d40d2d2bd'), 'Word': 'moguls', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049dbefb34f9

{'_id': ObjectId('5e7049ddefb34f9d40d2dae5'), 'Word': 'sank', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049ddefb34f9d40d2dae6'), 'Word': 'trampoline', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049ddefb34f9d40d2dae7'), 'Word': "Nobody's", 'Frequency of Use': 1}
{'_id': ObjectId('5e7049ddefb34f9d40d2dae8'), 'Word': "with'cha", 'Frequency of Use': 1}
{'_id': ObjectId('5e7049ddefb34f9d40d2dae9'), 'Word': '(Aha)', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049ddefb34f9d40d2daea'), 'Word': '(dress', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049ddefb34f9d40d2daeb'), 'Word': 'tear,', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049ddefb34f9d40d2daec'), 'Word': "signin'", 'Frequency of Use': 1}
{'_id': ObjectId('5e7049ddefb34f9d40d2daed'), 'Word': 'hook', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049ddefb34f9d40d2daee'), 'Word': 'ringing,', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049ddefb34f9d40d2daef'), 'Word': 'snap,', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049ddefb34f9d40d2d

{'_id': ObjectId('5e7049deefb34f9d40d2e0eb'), 'Word': 'thrive', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049deefb34f9d40d2e0ec'), 'Word': 'dad,', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049deefb34f9d40d2e0ed'), 'Word': 'lil', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049deefb34f9d40d2e0ee'), 'Word': 'Zach', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049deefb34f9d40d2e0ef'), 'Word': 'tarnish', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049deefb34f9d40d2e0f0'), 'Word': 'Swag,', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049deefb34f9d40d2e0f1'), 'Word': 'overwhelm', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049deefb34f9d40d2e0f2'), 'Word': 'Ow', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049deefb34f9d40d2e0f3'), 'Word': '볼만해', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049deefb34f9d40d2e0f4'), 'Word': 'khakis', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049deefb34f9d40d2e0f5'), 'Word': 'Rub', 'Frequency of Use': 1}
{'_id': ObjectId('5e7049deefb34f9d40d2e0f6'), 'Word': '(F