In [21]:
import pandas as pd
import numpy as np

import re
import string
import random
import operator

from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import torch
from torch.utils.data import Dataset, DataLoader

In [172]:
# Load the lyrics dataset from lyrics.csv and preview its first rows
music = pd.read_csv('lyrics.csv')
music.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [173]:
# List the distinct genre labels present in the dataset
music['genre'].unique()

array(['Pop', 'Hip-Hop', 'Not Available', 'Other', 'Rock', 'Metal',
       'Country', 'Jazz', 'Electronic', 'Folk', 'R&B', 'Indie'],
      dtype=object)

In [174]:
# Count the Hip-Hop songs in the dataset.
# The comparison produces a boolean Series; its vectorized .sum() counts the
# True entries without looping over the rows in Python. int() keeps the
# displayed result a plain Python integer.
int((music['genre'] == 'Hip-Hop').sum())

33965

In [175]:
# Build the Hip-Hop subset and apply the first round of cleaning
is_hip_hop = music.genre == "Hip-Hop"
hip_hop_df = music[is_hip_hop]

# Step 1: discard songs whose lyrics are missing (NaN)
hip_hop_df = hip_hop_df[hip_hop_df.lyrics.notna()]

# Step 2: show the release-year distribution, then drop the irregular years.
# Every distinct year is listed, ordered from most to least frequent.
num_yrs = len(hip_hop_df.year.unique())
year_counts = Counter(hip_hop_df.year.values)
print("Below is the distribution in terms of release years:")
print(year_counts.most_common(num_yrs))
# Only songs released in 1989 or later are kept
hip_hop_df = hip_hop_df[hip_hop_df.year >= 1989]

# Report the shape of the frame after the initial clean
print(hip_hop_df.shape)

Below is the distribution in terms of release years:
[(2006, 6062), (2007, 3808), (2012, 1778), (2008, 1646), (2010, 1537), (2016, 1508), (2011, 1474), (2014, 1324), (2009, 1294), (2013, 1229), (2015, 1161), (2005, 564), (2004, 435), (2002, 200), (1999, 127), (2003, 111), (1996, 95), (2001, 89), (2000, 82), (1995, 81), (1992, 59), (1994, 54), (1989, 48), (1998, 34), (1993, 13), (1997, 12), (1991, 11), (1990, 7), (112, 3), (1982, 2), (702, 1), (67, 1)]
(24843, 6)


In [176]:
# Remove the leftover "index" column and renumber the rows from 0,
# discarding the old row labels instead of keeping them as a column
hip_hop_df = hip_hop_df.drop(columns="index").reset_index(drop=True)
hip_hop_df.head(5)

Unnamed: 0,song,year,artist,genre,lyrics
0,i-got-that,2007,eazy-e,Hip-Hop,(horns)...\n(chorus)\nTimbo- When you hit me o...
1,8-ball-remix,2007,eazy-e,Hip-Hop,"Verse 1:\nI don't drink brass monkey, like to ..."
2,extra-special-thankz,2007,eazy-e,Hip-Hop,"19 muthaphukkin 93,\nand I'm back in this bitc..."
3,boyz-in-da-hood,2007,eazy-e,Hip-Hop,"Hey yo man, remember that shit Eazy did a whil..."
4,automoblie,2007,eazy-e,Hip-Hop,"Yo, Dre, man, I take this bitch out to the mov..."


In [177]:
# Count the newline characters in each song's lyrics
hip_hop_df["num_of_line_change"] = [text.count("\n") for text in hip_hop_df["lyrics"]]

In [178]:
# Summarise the distribution of line-change counts (computed once and reused)
line_change_stats = hip_hop_df.num_of_line_change.describe()
print("Overview of number of line changes:")
print(line_change_stats)

# Threshold for the filter: one standard deviation below the mean.
# The motivation is to take out very short tracks such as
# 1. Intro
# 2. Outro
# 3. Interlude
# 4. Skit
thres_ = line_change_stats["mean"] - line_change_stats["std"]
print(f"\nThreshold: {thres_}")

# Keep songs whose line-change count is at least the threshold (truncated to an int).
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning.
hip_hop_df = hip_hop_df[hip_hop_df.num_of_line_change >= int(thres_)].copy()
print(f"\nnew shape after filter: {hip_hop_df.shape}")

Overview of number of line changes:
count    24843.000000
mean        63.062513
std         31.558344
min          0.000000
25%         44.000000
50%         62.000000
75%         81.000000
max       1090.000000
Name: num_of_line_change, dtype: float64

Threshold: 31.504168699269503

new shape after filter: (21534, 6)


In [179]:
# Record, for every song, the length in characters of its longest line.
# Some lyrics contain single lines that are hundreds of characters long, so
# this column is used to enforce a consistent maximum line length.

hip_hop_df["max_line_length"] = hip_hop_df["lyrics"].apply(
    lambda text: max(map(len, text.split("\n")))
)

In [180]:
# Summarise the distribution of the per-song maximum line length
max_len_stats = hip_hop_df.max_line_length.describe()
print("Overview of number of max line length:")
print(max_len_stats)

# Filter bounds: one standard deviation either side of the mean
mean_len, std_len = max_len_stats["mean"], max_len_stats["std"]
lower_bound = mean_len - std_len
upper_bound = mean_len + std_len
print(f"\nlower bound: {lower_bound}")
print(f"upper bound: {upper_bound}")

# Keep songs whose maximum line length lies inside the bounds
# (both ends inclusive, bounds truncated to integers)
within_bounds = hip_hop_df.max_line_length.between(int(lower_bound), int(upper_bound))
hip_hop_df = hip_hop_df[within_bounds]
print(f"\nnew shape after filter: {hip_hop_df.shape}")

Overview of number of max line length:
count    21534.000000
mean        70.498885
std         44.375378
min         17.000000
25%         59.000000
50%         66.000000
75%         73.000000
max       1705.000000
Name: max_line_length, dtype: float64

lower bound: 26.12350747623585
upper bound: 114.87426349060726

new shape after filter: (21021, 7)


In [258]:
# Many lines in the lyrics are labels rather than sung/rapped text, e.g.
#   - Chorus:
#   - Verse 1:
#   - (Hook)
# The goal is to remove these labels. They are usually annotated with
# "[]" / "{}" or "()", so every line that starts or ends with one of those
# delimiters is collected here and tallied.

# Candidate lines, one list per delimiter family
bracket_list = []
parenthesis_list = []

for song_idx, lyrics in enumerate(hip_hop_df.lyrics.values):
    # Progress indicator: print the running song number every 2000 songs
    if song_idx % 2000 == 0:
        print(song_idx)
    for line in lyrics.split("\n"):
        # Square brackets and curly braces share one list
        if line.startswith(("[", "{")) or line.endswith(("]", "}")):
            bracket_list.append(line)
        # A line can also (or instead) qualify as a parenthesis line
        if line.startswith("(") or line.endswith(")"):
            parenthesis_list.append(line)

# Occurrence count of every distinct candidate line
bracket_dict = Counter(bracket_list)
parenthesis_dict = Counter(parenthesis_list)
print(f"# keys for bracket: {len(bracket_dict)}")
print(f"# keys for parenthesis: {len(parenthesis_dict)}")

0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
# keys for bracket: 14120
# keys for parenthesis: 47827


In [259]:
# Show the 20 most frequent bracket-style lines together with their counts
bracket_dict.most_common(20)

[('[Chorus]', 6267),
 ('[Hook]', 1504),
 ('[Verse 1]', 801),
 ('[Verse 2]', 794),
 ('[Chorus: x2]', 696),
 ('[Chorus:]', 660),
 ('[Intro]', 349),
 ('[Verse 3]', 318),
 ('[Bridge]', 247),
 ('[chorus]', 214),
 ('[Hook:]', 204),
 ('[Verse 2:]', 190),
 ('[Chorus x2]', 186),
 ('[Verse 1:]', 182),
 ('[Baby]', 170),
 ('[Outro]', 162),
 ('[Eminem]', 158),
 ('[Ghostface Killah]', 156),
 ('[Busta Rhymes]', 153),
 ('[Bizzy Bone]', 142)]

In [273]:
# Choose the bracket-style lines to strip from the lyrics. A line is selected when it
#   - occurs at least 5 times and is at most 20 characters long, or
#   - contains "verse" or "chorus" (case-insensitive), or
#   - ends with "}" or ")".
bracket_list_elim = [
    key
    for key, count in bracket_dict.items()
    if (count >= 5 and len(key) <= 20)
    or "verse" in key.lower()
    or "chorus" in key.lower()
    or key.endswith(("}", ")"))
]

len(bracket_list_elim)

5253

In [261]:
# Show the 20 most frequent parenthesis-style lines together with their counts
parenthesis_dict.most_common(20)

[('(Chorus)', 594),
 ('(chorus)', 207),
 ('(Hook)', 197),
 ('(Yeah)', 137),
 ('(CHORUS)', 112),
 ('(Bun)', 99),
 ('(Verse 2)', 94),
 ('(Verse)', 93),
 ('(Verse 1)', 85),
 ('(x2)', 84),
 ('(Who?)', 79),
 ('I did it for my dawgz (I did it for my dawgz)', 69),
 ('(Give it here)', 68),
 ("(He'll make a way)", 66),
 ("(C'mon)", 65),
 ('(Shake it, shake it)', 65),
 ("Don't walk away boy (Don't walk away)", 64),
 ('(repeat)', 63),
 ('(Come on)', 62),
 ('(Bridge)', 62)]

In [277]:
# Choose the parenthesis-style lines to strip from the lyrics. A line is selected when it
#   - occurs at least 5 times and is at most 15 characters long, or
#   - contains "verse" or "chorus" (case-insensitive).
parenthesis_list_elim = [
    key
    for key, count in parenthesis_dict.items()
    if (count >= 5 and len(key) <= 15)
    or "verse" in key.lower()
    or "chorus" in key.lower()
]

len(parenthesis_list_elim)

1319

In [279]:
# Strip the bracket / parenthesis label lines from the lyrics
def lyricFilter(lyric, elim_lines=None):
    """Return `lyric` with its label lines removed.

    The lyric is split on newline characters. A line is dropped when it is
    one of the lines marked for elimination, or when it contains a colon.
    The remaining lines are joined back with newlines in their original order.

    Parameters
    ----------
    lyric : str
        Full lyrics of one song, lines separated by newline characters.
    elim_lines : container of str, optional
        Lines to eliminate (exact match). Any object supporting ``in`` works;
        a set gives the fastest lookups. When omitted, the module-level
        ``bracket_list_elim`` and ``parenthesis_list_elim`` are combined.

    Returns
    -------
    str
        The lyrics without the eliminated lines.
    """
    if elim_lines is None:
        # Merge both elimination lists into a set so each membership test is a
        # hash lookup instead of a scan over thousands of list entries.
        elim_lines = set(bracket_list_elim).union(parenthesis_list_elim)
    kept_lines = [
        line
        for line in lyric.split("\n")
        if line not in elim_lines and ":" not in line
    ]
    return "\n".join(kept_lines)
            
# Build a separate frame whose lyrics column holds the filtered text;
# hip_hop_df itself is left unchanged
hip_hop_df_filtered = hip_hop_df.assign(lyrics=hip_hop_df["lyrics"].map(lyricFilter))

In [280]:
# Preview the first five rows of the filtered frame
hip_hop_df_filtered.head(5)

Unnamed: 0,song,year,artist,genre,lyrics,num_of_line_change,max_line_length
1,8-ball-remix,2007,eazy-e,Hip-Hop,"I don't drink brass monkey, like to be funky\n...",70,80
3,boyz-in-da-hood,2007,eazy-e,Hip-Hop,"Hey yo man, remember that shit Eazy did a whil...",97,54
5,i-d-rather-fuck-you,2007,eazy-e,Hip-Hop,"Aah, this is one of them songs\nYou can kick b...",44,51
6,boyz-in-the-hood-remix,2007,eazy-e,Hip-Hop,"Hey yo man, remember that shit Eazy did a whil...",97,54
7,fuck-dre,2007,eazy-e,Hip-Hop,This is a ghetto public announcement (weed smo...,48,69


In [285]:
# Print the filtered lyrics of the first song in the frame
print(hip_hop_df_filtered.lyrics.values[0])

I don't drink brass monkey, like to be funky
Nickname Eazy-E your 8 ball junkie
Bass drum kickin', to show my shit
Rap a hole in my dick, boy I don't quit
Crowd rocking motherfucker from around the way
I got a six shooter yo mean hombre
Rolling through the hood to find the boys kick dust and cuss crank up some noise
Police on my drawers, I have to pause
40 ounce in my lap and it's freezing my balls
Hook a right turn and let the boys go past
Then I say to myself, They can kiss my ass
Hip to get drunk got the 8 in my lips
Put in the old tape Marvin Gaye's greatest hits
Turn the shit up had the bass cold whomping
Cruising through the east side south of Compton
See a big ass and I say word
I took a look at the face, and the bitch was to the curb
Hoes on my tip for the title I'm holding
Eazy-E's fucked up and got the 8 ball rolling
Riding on Slauson down towards Crenshaw
Turned down south, to dish the law
Stopped at a light and had a fit,
Cos a mexican almost wrecked my shit.
Flipped his as

In [286]:
# Save the filtered frame to hip_hop_filtered.csv without writing the row index
hip_hop_df_filtered.to_csv("hip_hop_filtered.csv", index=False)