In [1]:
import pandas as pd
import requests
import bs4
import re
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /Users/sidaqin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sidaqin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/sidaqin/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

First we will scrape the following Wikipedia page.

In [2]:
wiki = "https://en.wikipedia.org/wiki/List_of_pop_musicians_who_died_of_drug_overdose"
page = requests.get(wiki)
soup = BeautifulSoup(page.content,'html.parser')
names = set()
tables = soup.findAll(name='table', class_='wikitable sortable')


for table in tables:
    body = table.findChild("tbody" , recursive=False)
    for row in body.findAll('tr'):
        cells = row.findAll('a')
        if(len(cells) == 0) :continue
        names.add(cells[0].text.strip())

Here is the list of the names.

In [3]:
names

{'Alan Wilson',
 'Allen Woody',
 'Amy Winehouse',
 'Andrew Wood',
 'Billy Mackenzie',
 'Billy Murcia',
 'Bobby Hatfield',
 'Bobby Sheehan',
 'Bon Scott',
 'Bradley Nowell',
 'Brent Mydland',
 'Brian Cole',
 'Carl Crack',
 'Chris Cornell',
 'Chris Kelly',
 'Danny Whitten',
 'Darby Crash',
 'Dave Brockie',
 'Dave Rubinstein',
 'Dave Schulthise',
 'David McComb',
 'David Ruffin',
 'Dee Dee Ramone',
 'Dickie Pride',
 'Dinah Washington',
 'Dwayne Goettel',
 'Erik Brødreskift',
 'Esther Phillips',
 'Frankie Lymon',
 'GG Allin',
 'Gary Holton',
 'Gary Moore',
 'Gary Thain',
 'Gerry Rafferty',
 'Gidget Gein',
 'Gram Parsons',
 'Gregory Herbert',
 'Hillel Slovak',
 'Hollywood Fats',
 'Howie Epstein',
 'Ike Turner',
 'Iosu Expósito',
 'James Honeyman-Scott',
 'Jani Lane',
 'Janis Joplin',
 'Jason Molina',
 'Jay Bennett',
 'Jay Reatard',
 'Jeremy Michael Ward',
 'Jesse Ed Davis',
 'Jim Morrison',
 'Jimi Hendrix',
 'Jimi Jamison',
 'Jimmy McCulloch',
 'John Baker Saunders',
 'John Belushi',
 'John

In [4]:
len(names)

112

Now we do the same thing for 2 more data sets online

In [5]:
page = requests.get("https://drugabuse.com/30-famous-musicians-who-have-battled-drug-addiction-and-alcoholism/")
soup = BeautifulSoup(page.content,'html.parser')
for entry in soup.findAll("h2"):
    try:
        names.add(entry.text.split(".",1)[1].strip())
    except:
        break
    

In [6]:
print(len(names))
namesPD = pd.DataFrame(data={"Band": list(names)})

138


In [7]:
wiki = "https://en.wikipedia.org/wiki/List_of_deaths_in_rock_and_roll"
page = requests.get(wiki)
soup = BeautifulSoup(page.content,'html.parser')
rock_names = set()
tables = soup.findAll(name='table', class_='wikitable')

for table in tables:
    body = table.findChild("tbody" , recursive=False)
    for row in body.findAll('tr'):
        cells = row.findAll('a')
        if(len(cells) == 0) :continue
        rock_names.add(cells[0].text.strip())

In [8]:
names.update(rock_names)

Now, we put the lyrics data into a CSV

In [9]:
songsTable1 = pd.read_csv("lyrics/lyrics1.csv")

In [10]:
songsTable1.head()

Unnamed: 0,Band,Lyrics,Song
0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday
1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die
2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside
3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot
4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds


In [11]:
pd1= songsTable1.merge(namesPD, how="inner", on="Band")
pd2= songsTable1[songsTable1['Band'].isin(names)]

In [12]:
print(len(pd1))
print(len(pd2))


#This is just to show that these are the the same frames, that the two methods in the above cells do that same thing
pd1.sort_values(by=["Band",'Lyrics','Song'],inplace=True)
#pd2.sort_values(by=["Band",'Lyrics','Song'],inplace=True)

#pd2.reset_index(drop=True,inplace=True)
pd1.reset_index(drop=True,inplace=True)

print(pd2.equals(pd1))

2673
16807
False


In [13]:
#pd2[pd2.duplicated(keep=False)]
pd1[pd1.duplicated(keep=False)]

#there are duplicates, need to drop later

Unnamed: 0,Band,Lyrics,Song
1989,Ray Charles,For all we know\r\nWe may never meet again\r\n...,For All We Know
1990,Ray Charles,For all we know\r\nWe may never meet again\r\n...,For All We Know


In [14]:
songsTable2 = pd.read_csv("lyrics/lyrics2.csv")
pd3= songsTable2.merge(namesPD, how="inner", on="Band")
#pd4= songsTable2[songsTable2['Band'].isin(names)]
pd3.sort_values(by=["Band",'Lyrics','Song'],inplace=True)
#pd4.sort_values(by=["Band",'Lyrics','Song'],inplace=True)

pd3 = pd3.reset_index(drop=True)
#pd4 = pd4.reset_index(drop=True)

#print(pd3.equals(pd4))
pd3[pd3.duplicated(keep='first')]
#pd4[pd4.duplicated(keep='first')]

Unnamed: 0,Band,Lyrics,Song


In [15]:
pd5 = pd.concat([pd1,pd3])

pd 5 is the data frame containing artists, one of their songs, and its lyrics. We will clean it next.

In [16]:
pd5

Unnamed: 0,Band,Lyrics,Song
0,Amy Winehouse,"All I can ever be to you,\r\nIs a darkness tha...",Tears Dry [Original Version]
1,Amy Winehouse,Build your dreams to the stars above\r\nBut wh...,Don't Go To Strangers [Multimedia Track]
2,Amy Winehouse,I can't wait to get away from you\r\nUnsurpris...,"Best Friends, Right?"
3,Amy Winehouse,I can't wait to get away from you\r\nUnsurpris...,"Best Friends, Right? [Leicester Summer Sundae ..."
4,Amy Winehouse,I would die before I divorce you\r\nI'd take a...,Between the Cheats
5,Amy Winehouse,I've been so many places in my life and time\r...,A Song for You
6,Amy Winehouse,Lent you outsiders and my new Badu\r\nWhile yo...,You Sent Me Flying [Demo] [Original Demo]
7,Amy Winehouse,"My heart is sad and lonely\nFor you I sigh, fo...",Body & Soul
8,Amy Winehouse,Nobody stands in between me and my man\r\nIt's...,Me & Mr Jones [Multimedia Track]
9,Amy Winehouse,Our day will come (our day will come)\r\nAnd w...,Our Day Will Come


Removing the duplicates songs like remixes, live versions .. etc

In [17]:
#Cleaning the remixes
def getRidOfBrackets(input):
    return re.sub("[\(\[].*?[\)\]]", "", input).strip()
    
    
pd5["Song"] = pd5["Song"].apply(getRidOfBrackets)
pd5 = pd5.drop_duplicates(subset='Song', keep='first')

In [18]:
stop = stopwords.words('english')

Removing stop words and turning the lyrics into lower case

In [19]:
pd5['Lyrics'] = pd5['Lyrics'].str.lower()
pd5['Lyrics'] = pd5['Lyrics'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Get rid of punctuations

In [20]:
pd5['Lyrics'] = pd5['Lyrics'].str.replace('[^\w\s]','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
pd5

Unnamed: 0,Band,Lyrics,Song
0,Amy Winehouse,ever you darkness knew regret get accustomed t...,Tears Dry
1,Amy Winehouse,build dreams stars need someone love go strang...,Don't Go To Strangers
2,Amy Winehouse,cant wait get away unsurprisingly hate communi...,"Best Friends, Right?"
4,Amy Winehouse,would die divorce id take thousand thumps love...,Between the Cheats
5,Amy Winehouse,ive many places life time ive sung lot songs i...,A Song for You
6,Amy Winehouse,lent outsiders new badu thinking clue tough so...,You Sent Me Flying
7,Amy Winehouse,heart sad lonely sigh dear seen im you body so...,Body & Soul
8,Amy Winehouse,nobody stands man mr jones me mr jones kind sh...,Me & Mr Jones
9,Amy Winehouse,day come our day come well everything well sha...,Our Day Will Come
10,Amy Winehouse,simple sweet guitar humbled bass beat kicks ev...,Halftime


We will put each song into the sentiment intensity analyzer, adding a column to the dataframe which is the score for each song, -1 is sad, 1 is happy.

In [22]:
si = SentimentIntensityAnalyzer()
scores = [];
for song in pd5["Lyrics"]:
    result = si.polarity_scores(song)
    score = result["compound"]
    scores.append(score)
pd5.insert(3, "score",scores, True)

We will average out each song's score for each artist. The result is a dataframe with artist name and its average score.

In [23]:
sentimentScores = pd5.groupby(["Band"]).mean()
sentimentScores

Unnamed: 0_level_0,score
Band,Unnamed: 1_level_1
Amy Winehouse,0.330705
Billy Mackenzie,0.9912
Bob Marley,0.341659
Bobby Brown,0.644654
Bon Scott,0.19065
Brian Wilson,0.554533
Britney Spears,0.442178
Chris Cornell,0.05296
David Bowie,0.154884
David Ruffin,0.573835


In [24]:
sentimentScores.plot.bar()
#Jimi Jamison with low low score

<matplotlib.axes._subplots.AxesSubplot at 0x1a27ce2a20>

Now we try to obtain the the word count for each artist

In [25]:
# concatenate all lyrics for each artist
def concat_(col):
    return "".join(col)
df = pd5.loc[:,['Band','Lyrics']].groupby(['Band']).aggregate(concat_)

In [26]:
word_counts = []
num_words = []
for lyrics in df['Lyrics']:
    word_count =  pd.Series(lyrics.split()).value_counts()
    word_counts.append(word_count)
for lyrics in df['Lyrics']:
    num_words.append(len(lyrics))

    
df.insert(1, "word count", word_counts, True)
df.insert(2, "number of words",num_words, True )
df

Unnamed: 0_level_0,Lyrics,word count,number of words
Band,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Amy Winehouse,ever you darkness knew regret get accustomed t...,im 102 love 100 know ...,39953
Billy Mackenzie,love me love me say let fly away love like win...,living 13 lives 12 paradise...,1342
Bob Marley,whooooooo mr brown mr brown clown rides town c...,love 420 yeah 410 go...,164502
Bobby Brown,ah ah ah things aint way used back day man mea...,girl 193 love 186 baby...,39489
Bon Scott,aint necessarily aint necessarily tings dat yo...,sookie 24 hang 18 baby...,1472
Brian Wilson,s wonderful s marvelous care s awful nice s pa...,la 209 im 156 love...,78608
Britney Spears,ladies gentlemen interrupt program dance music...,oh 786 im 642 baby ...,143252
Chris Cornell,start another day together million miles us tr...,im 100 know 93 li...,39392
David Bowie,hey aint over me hope im crazy feel driving wh...,oh 577 im 562 love ...,239470
David Ruffin,day day keep guessing whether mine ooh baby so...,love 71 baby 69 im ...,12156


TO GET word counts for one artist:

In [27]:
df.loc["Mac Miller","word count"]

im             433
like           252
get            207
got            184
shit           183
aint           175
go             159
yeah           147
know           147
fuck           137
love           128
keep           103
come            97
need            95
bitch           92
time            91
life            90
back            89
money           88
let             86
cant            82
mind            81
see             77
make            75
cause           74
one             74
tell            69
say             69
thats           67
hey             67
              ... 
lameness         1
playas           1
youre            1
porno            1
worn             1
foureven         1
assignment       1
potentially      1
gentleman        1
sprinter         1
bachelor         1
dikes            1
flags            1
grave            1
charles          1
losin            1
average          1
powders          1
burgh            1
the              1
lyin             1
stewart     

To get the word count for a specific word:

In [28]:
df.loc["Mac Miller","word count"]["shit"]

183

To get the density of each word:

In [29]:
d = df.loc["Mac Miller","word count"]["shit"] / df.loc["Mac Miller","number of words"]
print(d)

0.0017965835460435893
