### Module and Data Imports

In [2]:
"""
Most of these imports are unused, but were used in experimentation during cleaning and prosodic feature engineering.
"""
import pickle as pkl
import re
import string

import pandas as pd

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

import syllables
import cmudict as cmu
from langdetect import detect, detect_langs

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

In [3]:
"""
Load artist names.
"""
with open('data/artist_names.pkl', 'rb') as f: 
    artists = pkl.load(f)

In [3]:
"""
Load artist API paths. 
"""
with open('data/all_artist_paths.pkl', 'rb') as f:
    all_artist_paths = pkl.load(f)

In [4]:
"""
Load song paths for every artist. 
"""
with open('data/all_song_paths.pkl', 'rb') as f: 
    all_song_paths = pkl.load(f)

In [5]:
"""
Load all song lyrics.
"""
with open('data/all_song_lyrics_v2.pkl', 'rb') as f: 
    all_song_lyrics = pkl.load(f)

### Data Cleaning & Feature Engineering
This notebook focuses on extracting vocabulary and prosodic information, rather than straightforward lemmatization and tokenization. 

Due to the significance of slang, interjections and repetition in lyrics as opposed to prose, we choose to retain what would generally be irrelevant or noisy features of the text for our analysis. 

In [6]:
"""
CONVERT FROM LYRICS TO STANDARD TEXT

Rebuild the lyrics dict with cleaned, confirmed songs: 
1) Confirm the artist has a song
2) Confirm the lyrics are to a song (not an interview transcript, etc)
3) Confirm it is at least 16 lines (1+ standard verses)
4) Replace bracketed text denoting verse/chorus separation
"""
cleaned_lyrics = {}
for artist in all_song_lyrics.keys():
    
    # confirm artist has a song
    if all_song_lyrics[artist] != {}:
        
        # add the artist and create a dict for their songs
        cleaned_lyrics[artist] = {}
        for song in all_song_lyrics[artist].keys():
            is_song = False
            is_sixteen = False
            # if there is a verse or chorus marker, add the song to the artist's dict
            if '[Verse' in all_song_lyrics[artist][song] or '[Chorus' in all_song_lyrics[artist][song]:
                is_song = True
            # if the song has at least 16 lines (standard verse)
            if all_song_lyrics[artist][song].count('\n') > 14:   
                is_sixteen = True
            
            if is_song and is_sixteen:
                # replace bracketed text with a space and store to cleaned list
                cleaned_lyrics[artist][song] = re.sub('\[.*?\]', ' ', all_song_lyrics[artist][song])
                # alphanumerics
                cleaned_lyrics[artist][song] = re.sub('\w*\d\w*', ' ', cleaned_lyrics[artist][song])
                # remove punctuation and capitalization
                cleaned_lyrics[artist][song] = re.sub('[%s]' % re.escape(string.punctuation), '', cleaned_lyrics[artist][song].lower())

            

In [87]:
"""
Cast to DF 
"""
# Collect all lyrics in DataFrame
song_dfs = []
for artist in cleaned_lyrics.keys():
    df = pd.DataFrame([[artist, item[0], item[1]] for item in cleaned_lyrics[artist].items()], columns=['artist', 'song', 'lyrics'])
    song_dfs.append(df)
all_lyrics_df = pd.concat(song_dfs)

# Remove ~500 non-english songs (they seriously obfuscate our later metrics)
all_lyrics_df['language'] = all_lyrics_df['lyrics'].apply(lambda x: detect_langs(x))
# Ensure English is detected and remove anything with possible Korean text, our largest source of foreign data
all_lyrics_df = all_lyrics_df[all_lyrics_df['language'].apply(lambda x: 'en' in ''.join(list(map(str, x))))]
all_lyrics_df = all_lyrics_df[all_lyrics_df['language'].apply(lambda x:'ko' not in ''.join(list(map(str, x))))]

# Remove manually identified foreign artists not caught by above 
drop_artists = ['Bang Yong-guk', 'Beenzino', 'Bigg D', 'Chingo Bling', 'Christopher Martin', 'Crucial Star', 'Davido', 'Hanhae', 'Iyanya', 'Jack Parow', 'Jay Park', "K'naan", 'K-OS', 'KOHH', 'Loon', 'Olamide', 'Phyno', 'Sarkodie', 'Sean Paul', 'Sik-K', 'Suga', 'Verbal Jint', 'Yama Buddha', 'Yhaunai Takiyal', 'Yo Yo Honey Singh', 'Yoon Mi-rae', 'Zico', 'E-Sens', 'The Quiett', 'Woo Won Jae', 'G-Dragon', 'Heize', 'Zeebra', 'Badshah', 'Sjava']
all_lyrics_df = all_lyrics_df[all_lyrics_df['artist'].apply(lambda x: x not in drop_artists)]



#### Taxis Information
Build out a DataFrame tracking total and average words, lines and unique words

In [141]:
taxis_df = all_lyrics_df.copy()

# Words, lines, word density
taxis_df['lines'] = taxis_df['lyrics'].apply(lambda x: x.count('\n') + 1)
taxis_df['words'] = taxis_df['lyrics'].apply(lambda x: len(x))
taxis_df['words_per_line'] = taxis_df['words'] / taxis_df['lines']

In [142]:
# Highest word density artists
taxis_df.groupby('artist').mean('words_per_line').sort_values('words_per_line', ascending=False).head(20)

Unnamed: 0_level_0,lines,words,words_per_line
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Kung Fu Vampire,23.0,1248.0,54.26087
Qwel,45.0,2307.333333,49.611522
Ray Cash,66.571429,2539.571429,48.340395
Serius Jones,407.5,19088.5,46.930763
Crime Boss,24.0,1124.0,46.833333
Polo G,70.0,3270.0,46.714286
D-Loc,64.0,2989.0,45.838152
Percee P,61.428571,2621.857143,45.564001
Troy Ave,35.0,1572.0,44.914286
Montana of 300,90.333333,3856.166667,44.669456


In [143]:
# unique words
# split into words, remove newlines, remove spaces and empty entries
split_lyrics = taxis_df['lyrics'].apply(lambda x: re.sub('\\n', ' ', x)).apply(lambda x: x.split(' ')).apply(lambda x: [y for y in x if (y!='' and y!=' ')])
# convert each list to a set to count unique words
taxis_df['unique_words'] = [len(set(lyrics)) for lyrics in split_lyrics]
# proportion of unique words (unique/total)
taxis_df['unique_word_rate'] = taxis_df['unique_words'] / taxis_df['words']


In [144]:
# Highest unique word rate artists
taxis_df.groupby('artist').mean().sort_values('unique_word_rate', ascending=False).head(20)

Unnamed: 0_level_0,lines,words,words_per_line,unique_words,unique_word_rate
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mr. Muthafuckin' eXquire,22.0,937.0,42.590909,133.0,0.141942
Kung Fu Vampire,23.0,1248.0,54.26087,158.0,0.126603
Lord Jamar,29.0,914.0,31.517241,115.0,0.125821
Earl Sweatshirt,36.909091,1428.681818,38.492315,171.5,0.124831
Terrace Martin,42.25,1342.25,31.520532,158.25,0.12421
Juicy J,37.5,1426.5,38.046586,177.0,0.12408
Troy Ave,35.0,1572.0,44.914286,195.0,0.124046
Takeoff,23.0,574.0,24.956522,71.0,0.123693
Termanology,62.0,1780.0,28.709677,220.0,0.123596
Crime Boss,24.0,1124.0,46.833333,138.0,0.122776


#### Prosody Information 
Build out a DataFrame tracking syllables (total and averages by line and word)

In [145]:
prosody_df = all_lyrics_df.copy()

In [146]:
# Note, this takes a couple minutes because there are over 40 million words to estimate
prosody_df['syllables'] = split_lyrics.apply(lambda x: sum([syllables.estimate(word) for word in x]))
# Average syllables per line
prosody_df['syllables_per_line'] = prosody_df['syllables'] / taxis_df['lines']
# Average syllables per word 
prosody_df['syllables_per_word'] = prosody_df['syllables'] / taxis_df['words']

In [147]:
"""
use Phyme to calculate rhyme statistics
**for future expansion
"""
# from Phyme import Phyme
# ph = Phyme()
# ph.get_..._rhymes(word, num_sylls=None)

'\nuse Phyme to calculate rhyme statistics\n'

In [149]:
prosody_df.groupby('artist').mean().sort_values('syllables_per_word', ascending=False).head(20)

Unnamed: 0_level_0,syllables,syllables_per_line,syllables_per_word
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bia,339.0,5.380952,0.307065
Kitty,481.0,5.722771,0.295019
Jipsta,481.0,5.722771,0.295019
Silla,432.333333,5.995531,0.294874
Milo,391.5,6.653482,0.285703
Afrika Bambaataa,706.0,6.059524,0.285356
Nicky da B,795.0,8.548387,0.285151
Roger Troutman,625.333333,7.002242,0.284763
Unk,390.0,5.416667,0.284719
Bahamadia,648.0,10.503152,0.28254


Pickle the processed DataFrames for quick access

In [150]:
with open('data/all_lyrics_df.pkl', 'wb') as f:
    pkl.dump(all_lyrics_df, f)

with open('data/taxis_df.pkl', 'wb') as f:
    pkl.dump(taxis_df, f)

with open('data/prosody_df.pkl', 'wb') as f:
    pkl.dump(prosody_df, f)