In [1]:
import pandas as pd
import numpy as np
import string

from bs4 import BeautifulSoup
import requests

import time
import re

from pymongo import MongoClient

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

## Connecting to Mongo Client

In [2]:
# Open a client with pymongo's default connection settings, select the
# `genius` database, and list its collections as a sanity check.
client = MongoClient()
db = client['genius']
db.list_collection_names()

['song_urls', 'song_lyrics', 'artist_ids']

In [3]:
# Fetch every document from `song_lyrics`, keeping only the fields used
# below (`_id` is excluded explicitly), and load them into a DataFrame.
cursor = db.song_lyrics.find(
    filter={},
    projection={'_id': 0, 'song_id': 1, 'song_title': 1, 'song_url': 1, 'raw_lyrics': 1},
)
df = pd.DataFrame(list(cursor))
df.head()

Unnamed: 0,raw_lyrics,song_id,song_title,song_url
0,"\n\nOooh, love\nYou're coming down on me\n\nOo...",987434,Berlin,https://genius.com/The-essex-green-berlin-lyrics
1,\n\nI just wanna remain an anonimity\nI don't ...,1009076,Big Green Tree,https://genius.com/The-essex-green-big-green-t...
2,\n\nA moment taken slowly by the sea\nWind blo...,1551187,By the Sea,https://genius.com/The-essex-green-by-the-sea-...
3,"\n\nI lie\nHalfway to sleep, sleep so restless...",1745373,Carballo,https://genius.com/The-essex-green-carballo-ly...
4,"\n\nYou were down when I met you, but\nDeep in...",1073422,Chartiers,https://genius.com/The-essex-green-chartiers-l...


## Clean Song Lyrics

``` python
RegEx:
rem_inside_paren = '\([^)]*\)'
rem_inside_brack = '\[[^\]]*\]'
rem_inside_curly = '\{[^}]*\}'
```

In [4]:
# Cleaning helpers, applied in the order used by the pipeline below.
# NOTE: `enclosed_items` must see the raw text. `alphabet` replaces every
# bracket, parenthesis, brace and newline with a space, so running it first
# leaves `enclosed_items` with nothing to match.

def enclosed_items(x):
    """Replace [...], (...) and {...} spans, and newlines, with a space."""
    # Each negated class excludes its own closing delimiter, so a match ends
    # at the first closer instead of running on to a later one.
    return re.sub(r'\[[^\]]*\]|\([^)]*\)|\{[^}]*\}|\n', ' ', x)


def alphabet(x):
    """Replace runs of non-word characters and runs of digits with a space."""
    return re.sub(r'\W+|[0-9]+', ' ', x)


def punc_lower(x):
    """Lower-case the text and replace each ASCII punctuation mark with a space."""
    return re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())


def double_space(x):
    """Collapse each run of spaces into a single space."""
    return re.sub(' +', ' ', x)


df['raw_lyrics'] = (
    df['raw_lyrics']
    .map(enclosed_items)
    .map(alphabet)
    .map(punc_lower)
    .map(double_space)
)
# Rename by name rather than assigning a positional list to `df.columns`,
# which would mislabel columns if they arrive in a different order.
df = df.rename(columns={'raw_lyrics': 'lyrics'})
df.head()

Unnamed: 0,lyrics,song_id,song_title,song_url
0,oooh love you re coming down on me oooh love ...,987434,Berlin,https://genius.com/The-essex-green-berlin-lyrics
1,i just wanna remain an anonimity i don t want...,1009076,Big Green Tree,https://genius.com/The-essex-green-big-green-t...
2,a moment taken slowly by the sea wind blowing...,1551187,By the Sea,https://genius.com/The-essex-green-by-the-sea-...
3,i lie halfway to sleep sleep so restlessly fo...,1745373,Carballo,https://genius.com/The-essex-green-carballo-ly...
4,you were down when i met you but deep in your...,1073422,Chartiers,https://genius.com/The-essex-green-chartiers-l...


### Remove common English stop words

In [80]:
def description_lemmatizer(lyrics, lemmatizer=None):
    """Lemmatize every whitespace-separated token of a string.

    Parameters
    ----------
    lyrics : str
        Text to process; it is tokenized with ``str.split()``.
    lemmatizer : object, optional
        Any object exposing a ``lemmatize(token)`` method. When omitted, a
        new ``WordNetLemmatizer`` is created for the call.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    if lemmatizer is None:
        # Created lazily so callers may inject their own lemmatizer.
        lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in lyrics.split())

In [83]:
# Hold NLTK's English stop-word list as a set for membership tests.
stop = {*stopwords.words('english')}

In [86]:
# Keep only the words that are not in `stop`, then rebuild each lyric as a
# single space-separated string.
df['lyrics'] = df['lyrics'].map(
    lambda text: ' '.join(word for word in text.split() if word not in stop)
)
df.head()

Unnamed: 0,lyrics,song_id,song_title,song_url
0,oooh love coming oooh love got show oooh frien...,987434,Berlin,https://genius.com/The-essex-green-berlin-lyrics
1,wanna remain anonimity want anyone know face w...,1009076,Big Green Tree,https://genius.com/The-essex-green-big-green-t...
2,moment taken slowly sea wind blowing bells rin...,1551187,By the Sea,https://genius.com/The-essex-green-by-the-sea-...
3,lie halfway sleep sleep restlessly want place ...,1745373,Carballo,https://genius.com/The-essex-green-carballo-ly...
4,met deep eyes could see sign thousand drummers...,1073422,Chartiers,https://genius.com/The-essex-green-chartiers-l...


## Adding artists to lyrics dataset

In [7]:
# Load the song -> artist mapping from `song_urls`.
cursor = db.song_urls.find({}, {'_id': 0, 'artist_id': 1, 'id': 1})
# Rename `id` by name instead of assigning a positional list to `.columns`:
# a positional assignment silently swaps the two ids if the columns arrive
# in a different order.
artist_id_df = pd.DataFrame(list(cursor)).rename(columns={'id': 'song_id'})
# Fix the column order explicitly.
artist_id_df = artist_id_df[['artist_id', 'song_id']]
artist_id_df.head()

Unnamed: 0,artist_id,song_id
0,358098,987434
1,358098,1009076
2,358098,1551187
3,358098,1745373
4,358098,1073422


In [8]:
# Inner join: attach `artist_id` to each lyric row through the shared `song_id`.
df = pd.merge(df, artist_id_df, how='inner', on='song_id')
df.head()

Unnamed: 0,lyrics,song_id,song_title,song_url,artist_id
0,oooh love you re coming down on me oooh love ...,987434,Berlin,https://genius.com/The-essex-green-berlin-lyrics,358098
1,i just wanna remain an anonimity i don t want...,1009076,Big Green Tree,https://genius.com/The-essex-green-big-green-t...,358098
2,a moment taken slowly by the sea wind blowing...,1551187,By the Sea,https://genius.com/The-essex-green-by-the-sea-...,358098
3,i lie halfway to sleep sleep so restlessly fo...,1745373,Carballo,https://genius.com/The-essex-green-carballo-ly...,358098
4,you were down when i met you but deep in your...,1073422,Chartiers,https://genius.com/The-essex-green-chartiers-l...,358098


In [12]:
# Fetch the artist id -> artist name lookup from `artist_ids`
# (`_id` is excluded explicitly).
cursor = db.artist_ids.find(
    filter={},
    projection={'_id': 0, 'artist_name': 1, 'artist_id': 1},
)
artist_name_df = pd.DataFrame(list(cursor))
artist_name_df.head()

Unnamed: 0,artist_id,artist_name
0,358098,The Essex Green
1,63636,FKA twigs
2,1602422,The Margarets
3,353058,Los Planetas
4,21765,Hayley Kiyoko


In [13]:
# Inner join: attach `artist_name` to each row through the shared `artist_id`.
df = pd.merge(df, artist_name_df, how='inner', on='artist_id')
df.head()

Unnamed: 0,lyrics,song_id,song_title,song_url,artist_id,artist_name
0,oooh love you re coming down on me oooh love ...,987434,Berlin,https://genius.com/The-essex-green-berlin-lyrics,358098,The Essex Green
1,i just wanna remain an anonimity i don t want...,1009076,Big Green Tree,https://genius.com/The-essex-green-big-green-t...,358098,The Essex Green
2,a moment taken slowly by the sea wind blowing...,1551187,By the Sea,https://genius.com/The-essex-green-by-the-sea-...,358098,The Essex Green
3,i lie halfway to sleep sleep so restlessly fo...,1745373,Carballo,https://genius.com/The-essex-green-carballo-ly...,358098,The Essex Green
4,you were down when i met you but deep in your...,1073422,Chartiers,https://genius.com/The-essex-green-chartiers-l...,358098,The Essex Green


## Put it to a pickle!

In [None]:
# Serialize the final frame to a pickle file named 'cleaned_lyrics'
# (a relative path, so it lands in the current working directory).
df.to_pickle(path='cleaned_lyrics')