# Basic cleaning of lyrics obtained from Genius

In [1]:
import pandas as pd
import numpy as np
import pickle

from bs4 import BeautifulSoup
import requests

import time
import re
import string

from pymongo import MongoClient

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import gensim

from langdetect import detect_langs
import langdetect

## Importing data

In [2]:
df = pd.read_pickle('../Data/combined_genius_spotify_uncleaned')

## Clean Song Lyrics

``` python
RegEx:
rem_inside_paren = '\([^)]*\)'
rem_inside_brack = '\[[^)]*\]'
rem_inside_curly = '\{[^)]*\}'
```

In [3]:
enclosed_items = lambda x: re.sub(r'\[[^)]*\]|\([^)]*\)|\{[^)]*\}|\n', ' ', x)
alphabet = lambda x: re.sub('[\W]+|[0-9]+', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())
double_space = lambda x: re.sub(' +', ' ', x)

df['raw_lyrics'] = df.raw_lyrics.map(alphabet).map(punc_lower).map(enclosed_items).map(double_space)
df.columns = ['spotify_album_uri', 'spotify_artist_id', 'artist_name',
       'spotify_artist_uri', 'duration_ms', 'explicit', 'spotify_song_id',
       'song_title', 'song_spotify_page', 'track_number', 'spotify_song_uri',
       'song_title', 'Unaltered_artist_name', 'lyrics',
       'genius_song_id', 'geenius_song_url', 'genius_artist_id']

df.head()

Unnamed: 0,spotify_album_uri,spotify_artist_id,artist_name,spotify_artist_uri,duration_ms,explicit,spotify_song_id,song_title,song_spotify_page,track_number,spotify_song_uri,song_title.1,artist_name.1,lyrics,genius_song_id,geenius_song_url,genius_artist_id
0,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,138626,False,4okEZakOVppAtP4Dawd52x,marry me,https://open.spotify.com/track/4okEZakOVppAtP4...,1,spotify:track:4okEZakOVppAtP4Dawd52x,Marry Me,Suburban Kids With Biblical Names,any old chance i get i m gonna marry you marr...,861607,https://genius.com/Suburban-kids-with-biblical...,353411
1,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,187106,False,2LV6sB5zTsu0R5r5kWohlD,loop duplicate my heart,https://open.spotify.com/track/2LV6sB5zTsu0R5r...,2,spotify:track:2LV6sB5zTsu0R5r5kWohlD,Loop Duplicate My Heart,Suburban Kids With Biblical Names,and it s bigger than everything i have ever d...,980120,https://genius.com/Suburban-kids-with-biblical...,353411
2,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,176026,False,53uzYuKe433aXBjzBiuvqe,parakit,https://open.spotify.com/track/53uzYuKe433aXBj...,4,spotify:track:53uzYuKe433aXBjzBiuvqe,Parakit,Suburban Kids With Biblical Names,i m going back to the place i was born my fav...,1583231,https://genius.com/Suburban-kids-with-biblical...,353411
3,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,198013,False,7p0pJgizlHS5msrgDPU6li,trees and squirrels,https://open.spotify.com/track/7p0pJgizlHS5msr...,5,spotify:track:7p0pJgizlHS5msrgDPU6li,Trees And Squirrels,Suburban Kids With Biblical Names,the trees are wild and undisputably beautiful...,1340623,https://genius.com/Suburban-kids-with-biblical...,353411
4,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,178040,False,1drw02VGWNxVtZuF2Qjp8e,funeral face,https://open.spotify.com/track/1drw02VGWNxVtZu...,6,spotify:track:1drw02VGWNxVtZuF2Qjp8e,Funeral Face,Suburban Kids With Biblical Names,said i love you said i like you and i want yo...,1042112,https://genius.com/Suburban-kids-with-biblical...,353411


## Removing non-english songs

Detects languages. Returns `None` if no language detected.

In [4]:
print(detect_langs('this is a test'))
print(detect_langs('donde esta la biblioteca'))

[en:0.999996179089745]

In [5]:
def lyric_language(lyrics):
    try:
        out = detect_langs(lyrics)
    except:
        out = None
    return out

In [6]:
df['lyrics_language'] = df.lyrics.apply(lyric_language)
df = df[df.lyrics_language.notnull()]

Checks English is the predominant language of the lyrics

In [7]:
def is_english(lyrics_language):
    out = False
    for lang in lyrics_language:
        if 'en:0.9' in str(lang):
            out = True
        else:
            pass
    return out

In [8]:
df = df[df.lyrics_language.apply(is_english)]

### Remove common english stop words

I chose to use the snowball stemmer over lemmatization due to the fact that I'm assuming that I will not be able to interpret all topic models. Lemmatization increases readability but has the draw back of also increasing computational complexity.

In [9]:
def description_stemmer(lyrics):
    stem = SnowballStemmer(language='english')
    tokens = lyrics.split()
    stemmed_tokens = [stem.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

In [10]:
stop = set(stopwords.words('english'))
abv_stop = ['re','ve','ll','chorus','vers']

pickle_in = open('../Data/word_count_stop_words_lt11','rb')
new_stopwords = pickle.load(pickle_in)
pickle_in.close()

In [11]:
df['lyrics'] = df.lyrics.apply(lambda x: 
                [item for item in x.split() if item not in stop])
df['lyrics'] = df.lyrics.apply(lambda x: 
                [item for item in x if item not in abv_stop])
df['lyrics'] = df.lyrics.apply(lambda x: 
               [item for item in x if item not in new_stopwords])
df['lyrics'] = df.lyrics.str.join(' ')
df['lyrics'] = df.lyrics.apply(description_stemmer)
df.head(1)

Unnamed: 0,spotify_album_uri,spotify_artist_id,artist_name,spotify_artist_uri,duration_ms,explicit,spotify_song_id,song_title,song_spotify_page,track_number,spotify_song_uri,song_title.1,artist_name.1,lyrics,genius_song_id,geenius_song_url,genius_artist_id,lyrics_language
0,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,138626,False,4okEZakOVppAtP4Dawd52x,marry me,https://open.spotify.com/track/4okEZakOVppAtP4...,1,spotify:track:4okEZakOVppAtP4Dawd52x,Marry Me,Suburban Kids With Biblical Names,old chanc get gonna marri marri get act togeth...,861607,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.9999975562687033]
1,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,187106,False,2LV6sB5zTsu0R5r5kWohlD,loop duplicate my heart,https://open.spotify.com/track/2LV6sB5zTsu0R5r...,2,spotify:track:2LV6sB5zTsu0R5r5kWohlD,Loop Duplicate My Heart,Suburban Kids With Biblical Names,bigger everyth ever done x found reason stay h...,980120,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.999998823659918]
2,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,176026,False,53uzYuKe433aXBjzBiuvqe,parakit,https://open.spotify.com/track/53uzYuKe433aXBj...,4,spotify:track:53uzYuKe433aXBjzBiuvqe,Parakit,Suburban Kids With Biblical Names,go back place born favorit hood believ found c...,1583231,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.9999962698239809]
3,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,198013,False,7p0pJgizlHS5msrgDPU6li,trees and squirrels,https://open.spotify.com/track/7p0pJgizlHS5msr...,5,spotify:track:7p0pJgizlHS5msrgDPU6li,Trees And Squirrels,Suburban Kids With Biblical Names,tree wild undisput beauti today see squirrel b...,1340623,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.9999973351330183]
4,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,178040,False,1drw02VGWNxVtZuF2Qjp8e,funeral face,https://open.spotify.com/track/1drw02VGWNxVtZu...,6,spotify:track:1drw02VGWNxVtZuF2Qjp8e,Funeral Face,Suburban Kids With Biblical Names,said love said like want see sun goe wrote tol...,1042112,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.9999975014770117]


## Looking at most common words across all songs - adding to stopwords

Note that this section is used out of order of other sections as stopword removal was done interatively

In [12]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

In [13]:
clean_sents = list(sent_to_words(df.lyrics))
all_words = [item for sublist in clean_sents for item in sublist]

In [14]:
word_counts = pd.DataFrame(pd.Series(all_words).value_counts())
word_counts.head()

Unnamed: 0,0
know,26487
oh,24967
love,23175
chorus,20722
like,19849


In [15]:
word_count_stop_words_lt11 = word_counts[word_counts[0] <= 10].index.tolist()

In [16]:
# pickle_out = open('../Data/word_count_stop_words_lt11','wb')
# pickle.dump(word_count_stop_words_lt11, pickle_out)
# pickle_out.close()

## Put it to a pickle!

In [17]:
df.to_pickle('../Data/cleaned_lyrics')

In [18]:
df.head(1)

Unnamed: 0,spotify_album_uri,spotify_artist_id,artist_name,spotify_artist_uri,duration_ms,explicit,spotify_song_id,song_title,song_spotify_page,track_number,spotify_song_uri,song_title.1,artist_name.1,lyrics,genius_song_id,geenius_song_url,genius_artist_id,lyrics_language
0,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,138626,False,4okEZakOVppAtP4Dawd52x,marry me,https://open.spotify.com/track/4okEZakOVppAtP4...,1,spotify:track:4okEZakOVppAtP4Dawd52x,Marry Me,Suburban Kids With Biblical Names,old chanc get gonna marri marri get act togeth...,861607,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.9999975562687033]
1,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,187106,False,2LV6sB5zTsu0R5r5kWohlD,loop duplicate my heart,https://open.spotify.com/track/2LV6sB5zTsu0R5r...,2,spotify:track:2LV6sB5zTsu0R5r5kWohlD,Loop Duplicate My Heart,Suburban Kids With Biblical Names,bigger everyth ever done x found reason stay h...,980120,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.999998823659918]
2,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,176026,False,53uzYuKe433aXBjzBiuvqe,parakit,https://open.spotify.com/track/53uzYuKe433aXBj...,4,spotify:track:53uzYuKe433aXBjzBiuvqe,Parakit,Suburban Kids With Biblical Names,go back place born favorit hood believ found c...,1583231,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.9999962698239809]
3,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,198013,False,7p0pJgizlHS5msrgDPU6li,trees and squirrels,https://open.spotify.com/track/7p0pJgizlHS5msr...,5,spotify:track:7p0pJgizlHS5msrgDPU6li,Trees And Squirrels,Suburban Kids With Biblical Names,tree wild undisput beauti today see squirrel b...,1340623,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.9999973351330183]
4,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,178040,False,1drw02VGWNxVtZuF2Qjp8e,funeral face,https://open.spotify.com/track/1drw02VGWNxVtZu...,6,spotify:track:1drw02VGWNxVtZuF2Qjp8e,Funeral Face,Suburban Kids With Biblical Names,said love said like want see sun goe wrote tol...,1042112,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.9999975014770117]
