# Final Project

### Data Cleanup

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the zipped lyrics CSV, using its 'index' column as the row index.
lyrics_archive = 'data/lyrics.csv.zip'
data = pd.read_csv(lyrics_archive, index_col='index')

In [3]:
# Preview the first rows, then report the frame's (rows, columns).
first_rows = data.head(n=5)
display(first_rows)
data.shape

Unnamed: 0_level_0,song,year,artist,genre,lyrics
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


(362237, 5)

In [4]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis='index', how='any')
data.shape

(266556, 5)

In [5]:
# Collect the distinct values of the 'year' column so implausible ones stand out.
year = data.loc[:, 'year'].unique()
year

array([2009, 2007, 2013, 2010, 2012, 2006, 2016, 2011, 2015, 2008, 2014,
       1998, 2002, 1995, 2004, 1972, 2005, 1978, 1970, 1981, 1994, 1997,
       1993, 1982, 1983, 1986, 1992, 1977, 1989, 1979, 1996, 2001, 1990,
       1987, 2003, 1975, 1973, 1991, 1999, 1974, 2000, 1980, 1984, 1976,
        702, 1971, 1985, 1988,  112, 1968,   67], dtype=int64)

In [6]:
# Remove rows whose 'year' is one of the known-bad values.
bad_years = [67,2038,112,702]
has_bad_year = data['year'].isin(bad_years)

# Keep a copy of the offending rows for inspection before they are dropped.
wrong_year = data.loc[has_bad_year]
display(wrong_year)
print('Wrong Year shape',wrong_year.shape)

# Drop by index label, reusing the rows selected above.
data.drop(index=wrong_year.index, inplace=True)
data.shape

Unnamed: 0_level_0,song,year,artist,genre,lyrics
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
27657,star,702,clipse,Hip-Hop,You're my star\nIt's such a wonder how you shi...
69708,anywhere-remix,112,dru-hill,Hip-Hop,"Here we are all alone\nYou and me, privacy\nAn..."
147914,it-s-over-now-remix,112,g-dep,Hip-Hop,What is this?\nNumbers in your pocket\nI remem...
238541,come-see-me-remix,112,black-rob,Hip-Hop,"Baby, you can come see me 'cause I need you he..."
315540,let-s-lurk,67,giggs,Hip-Hop,Verse 1:\nStill pulling up on smoke\nSkeng in ...
335205,i-can-t-believe,112,faith-evans,Pop,[Chorus]\nI can't believe that love has gone a...


Wrong Year shape (6, 5)


(266550, 5)

In [7]:
# Show the genre labels, then remove rows labelled 'Not Available'.
print(data['genre'].unique())
genre_missing = data['genre'].eq('Not Available')
rows_without_genre = data.loc[genre_missing].index
data.drop(index=rows_without_genre, inplace=True)
data.shape

['Pop' 'Hip-Hop' 'Not Available' 'Rock' 'Metal' 'Other' 'Country' 'Jazz'
 'Electronic' 'Folk' 'R&B' 'Indie']


(242609, 5)

In [8]:
# Sanity check: print the sorted distinct values of each categorical/year column.
columns_to_check = (
    ('Genres: ', 'genre'),
    ('Artist: ', 'artist'),
    ('Year: ', 'year'),
)
for label, column in columns_to_check:
    print(label, np.sort(data[column].unique()))
print(data.shape)


Genres:  ['Country' 'Electronic' 'Folk' 'Hip-Hop' 'Indie' 'Jazz' 'Metal' 'Other'
 'Pop' 'R&B' 'Rock']
Artist:  ['009-sound-system' '047' '1-800-zombie' ... 'the-grates'
 'the-graveyard-boulevard' 'the-great-flood-catastrophe']
Year:  [1968 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982
 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996
 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010
 2011 2012 2013 2014 2015 2016]
(242609, 5)


In [9]:
# lower characters
data['lyrics'] = data['lyrics'].str.lower()
# Show the preview explicitly so the shape can be the cell's final (displayed)
# expression; a bare `data.shape` in the middle of a cell is evaluated but
# never shown.
display(data.head())
data.shape

Unnamed: 0_level_0,song,year,artist,genre,lyrics
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,ego-remix,2009,beyonce-knowles,Pop,"oh baby, how you doing?\nyou know i'm gonna cu..."
1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,honesty,2009,beyonce-knowles,Pop,if you search\nfor tenderness\nit isn't hard t...
3,you-are-my-rock,2009,beyonce-knowles,Pop,"oh oh oh i, oh oh oh i\n[verse 1:]\nif i wrote..."
4,black-culture,2009,beyonce-knowles,Pop,"party the people, the people the party it's po..."


In [10]:
# word cleanup
import nltk
nltk.download('stopwords')
# Alias the corpus reader so the name `stopwords` is only ever the word list,
# instead of first being the imported module and then being rebound to a list.
from nltk.corpus import stopwords as stopwords_corpus
stopwords = stopwords_corpus.words('english')
# Section markers that are treated as stopwords as well.
stopwords += ['verse','chorus']
from nltk.tokenize import RegexpTokenizer
print('there are ',len(stopwords),'stopwords')
# use regex to remove punctuation
# NOTE: the stopword list is only built here; this cell does not apply it.
# `regex=True` must be explicit: since pandas 2.0 `Series.str.replace` treats
# the pattern as a literal string by default, which would leave all punctuation
# in place. The raw string avoids the invalid '\w' / '\s' escape sequences.
data['lyrics'] = data['lyrics'].str.replace(r'[^\w\s]', '', regex=True)

[nltk_data] Downloading package stopwords to C:\Users\Jing
[nltk_data]     Yun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


there are  181 stopwords


In [11]:
# remove line breaks
# Step 1: trim whitespace at the start and end of string values in the frame.
edge_whitespace = {r'\s+$': '', r'^\s+': ''}
data = data.replace(edge_whitespace, regex=True)
# Step 2: turn each remaining newline into a single space.
data = data.replace(r'\n', ' ', regex=True)
data.head(30)

Unnamed: 0_level_0,song,year,artist,genre,lyrics
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,ego-remix,2009,beyonce-knowles,Pop,oh baby how you doing you know im gonna cut ri...
1,then-tell-me,2009,beyonce-knowles,Pop,playin everything so easy its like you seem so...
2,honesty,2009,beyonce-knowles,Pop,if you search for tenderness it isnt hard to f...
3,you-are-my-rock,2009,beyonce-knowles,Pop,oh oh oh i oh oh oh i verse 1 if i wrote a boo...
4,black-culture,2009,beyonce-knowles,Pop,party the people the people the party its popp...
5,all-i-could-do-was-cry,2009,beyonce-knowles,Pop,i heard church bells ringing i heard a choir s...
6,once-in-a-lifetime,2009,beyonce-knowles,Pop,this is just another day that i would spend wa...
7,waiting,2009,beyonce-knowles,Pop,waiting waiting waiting waiting waiting waitin...
8,slow-love,2009,beyonce-knowles,Pop,verse 1 i read all of the magazines while wait...
9,why-don-t-you-love-me,2009,beyonce-knowles,Pop,nnnow honey you better sit down and look aroun...


In [12]:
# Write the cleaned frame to CSV; the row index is not written out.
clean_csv_path = 'lyrics_clean.csv'
data.to_csv(clean_csv_path, index=False)