In [2]:
import sys
import pandas as pd
import numpy as np
import spotipy 
# Module-level Spotify client shared by the fetch_* helpers below; it is
# constructed without any authentication arguments.
sp=spotipy.Spotify()

In [3]:
from bokeh.embed import file_html

In [4]:
## Get information on each of the albums for a given artist 
def fetch_albums(musician):
    """Look up `musician` on Spotify and return one row per album.

    Parameters
    ----------
    musician : str
        Artist name; the first hit of the artist search is used.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``name``, ``id``, ``release_date`` and ``popularity`` for each
        album returned by ``sp.artist_albums(..., album_type='album',
        country='US')``.  ``None`` (after printing an error message) when the
        search returns no artist.
    """
    artist=sp.search(q='artist:'+musician, type='artist')
    # The hits live under 'items'.  Taking len() of the enclosing object
    # counts its keys instead, so an unknown name was never detected and the
    # items[0] lookup below raised IndexError.
    matches=artist['artists']['items']
    if not matches:
        print('Error: Check Musician Name')
        return None

    ## Pull album information
    albums=sp.artist_albums(matches[0]['id'], album_type='album', country='US')
    records=[]
    for album in albums['items']:
        # Release date and popularity are read from the per-album lookup.
        album_info=sp.album(album['id'])
        records.append({'name': album['name'],
                        'id': album['id'],
                        'release_date': album_info['release_date'],
                        'popularity': album_info['popularity']})

    # Building the frame once avoids growing it cell by cell; the explicit
    # column list fixes the column order and keeps an empty result well-formed.
    return pd.DataFrame(records, columns=['name', 'id', 'release_date', 'popularity'])


In [5]:
def fetch_tracks(df_album):
    """Return one row per track for every album in `df_album`.

    Parameters
    ----------
    df_album : pandas.DataFrame
        Album table with ``name`` and ``id`` columns (the shape returned by
        ``fetch_albums``).  Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        Columns ``album``, ``name``, ``track_id``, ``artist_num``,
        ``popularity``, ``explicit`` and ``artists_on_track``.
        ``artists_on_track`` is the ', '-joined artist names for tracks with
        more than one artist and NaN otherwise.  An empty input yields an
        empty frame that still carries these columns.
    """
    columns=['album', 'name', 'track_id', 'artist_num',
             'popularity', 'explicit', 'artists_on_track']
    print('Querying Album Tracks')

    # Collect plain dicts and build the frame once at the end: growing a
    # DataFrame one cell at a time with .loc re-allocates on every new row.
    records=[]
    for _, album in df_album.iterrows():
        listing=sp.album_tracks(album['id'])
        for item in listing['items']:
            records.append({'album': album['name'],
                            'name': item['name'],
                            'track_id': item['id'],
                            'artist_num': len(item['artists'])})

    for i, record in enumerate(records):
        # Progress marker every 20 tracks (one request is made per track).
        if (i%20==0):
            print(i)
        track=sp.track(record['track_id'])
        record['popularity']=track['popularity']
        record['explicit']=track['explicit']
        if record['artist_num']>1:
            credited=track['artists'][:record['artist_num']]
            record['artists_on_track']=', '.join(a['name'] for a in credited)
        else:
            # Solo tracks keep a missing value, as before.
            record['artists_on_track']=np.nan

    return pd.DataFrame(records, columns=columns)


In [6]:
# Create list of collaborators
import unicodedata

def make_collab_list(df_tracks, main_artist):
    """Summarise how the tracks featuring each collaborator perform.

    Parameters
    ----------
    df_tracks : pandas.DataFrame
        Track table with ``artist_num``, ``artists_on_track`` (a ', '-joined
        string of artist names) and ``popularity`` columns.
    main_artist : str
        Name to leave out of the collaborator list.

    Returns
    -------
    pandas.DataFrame
        One row per collaborator, in order of first appearance, with columns
        ``collaborator`` (ASCII-folded name), ``num_collab`` (number of tracks
        crediting that name), ``mean_pop`` and ``std_pop`` (``np.mean`` /
        ``np.std`` of those tracks' popularity).  Empty input gives an empty
        frame with the same columns.

    Notes
    -----
    Names are recovered by splitting ``artists_on_track`` on ','; an artist
    whose own name contains a comma is therefore split into several entries.
    """
    columns=['collaborator', 'num_collab', 'mean_pop', 'std_pop']

    def _to_ascii(text):
        # Fold accented characters to plain ASCII so the same folded name is
        # used both for listing and for counting (this replaces the former
        # hard-coded 'Beyonce' -> 'Beyo' workaround).
        if isinstance(text, bytes):
            text=text.decode('utf-8')
        return unicodedata.normalize('NFKD', text).encode('ascii','ignore').decode('ascii')

    main_name=_to_ascii(main_artist).strip()
    collaborators_list=[]      # names in order of first appearance
    popularity_by_name={}      # name -> popularity of every track crediting it

    for _, row in df_tracks.iterrows():
        if not (row['artist_num']>1):
            continue
        credited=row['artists_on_track']
        if pd.isnull(credited):
            continue
        seen_on_track=set()
        for part in _to_ascii(credited).split(','):
            name=part.strip()
            if not name or name==main_name or name in seen_on_track:
                continue
            seen_on_track.add(name)
            # Exact-name bookkeeping: a substring test would merge distinct
            # artists such as 'Sean' and 'Big Sean'.
            if name not in popularity_by_name:
                collaborators_list.append(name)
                popularity_by_name[name]=[]
            popularity_by_name[name].append(float(row['popularity']))

    records=[]
    for name in collaborators_list:
        pops=np.asarray(popularity_by_name[name], dtype=float)
        records.append({'collaborator': name,
                        'num_collab': len(pops),
                        'mean_pop': np.mean(pops),
                        'std_pop': np.std(pops)})

    # Built outside the track loop so the result exists even when
    # `df_tracks` has no rows.
    return pd.DataFrame(records, columns=columns)

In [6]:
# Album table for Kanye West; it is the input to fetch_tracks below.
kw_albums=fetch_albums('Kanye West')

In [7]:
kw_albums

Unnamed: 0,name,id,release_date,popularity
0,The Life Of Pablo,0WAuEfa5Lmg72xfydLVcca,2016-04-04,95
1,Yeezus,7D2NdGvBHIavgLhmcwhluK,2013-06-18,78
2,Watch The Throne (Explicit Version),7mCeLbChyegbRwwKK5shJs,2011-08-12,71
3,Watch The Throne (Edited Version),0eZRIt3Ht1IuMXsr0QR8OY,2011-08-12,42
4,Watch The Throne [Deluxe Edition (Explicit)],1YwzJz7CrV9fd9Qeb6oo1d,2011-08-12,80
5,Watch The Throne [Deluxe Edition (Edited)],4P63UgNDUcF11MnWzyvVrh,2011-08-12,51
6,My Beautiful Dark Twisted Fantasy (Explicit Ve...,20r762YmB5HeofjMCiPMLv,2010-11-19,81
7,My Beautiful Dark Twisted Fantasy (Edited Vers...,6klUp8sQyRXGuJhqZu4PG3,2010-11-22,51
8,808s & Heartbreak,2JK89jt4unItFroOr0kT3g,2008-11-24,75
9,Graduation (Explicit Version),3SZr5Pco2oqKFORCP3WNj9,2007-09-11,81


In [8]:
# One row per track across all albums in kw_albums (fetch_tracks issues one
# request per album and one per track, printing a counter every 20 tracks).
tracks=fetch_tracks(kw_albums)

Querying Album Tracks
0
20
40
60
80
100
120
140
160
180
200
220


There are multiple versions of individual albums (explicit/clean, standard/deluxe). First, let's see whether a track being explicit affects its popularity.

In [9]:
# Bin track popularity (25 equal-width bins over 0-100) separately for
# explicit and clean tracks, and keep the bin centres for plotting.
explicit_popularity=tracks.loc[tracks['explicit']==True, 'popularity']
clean_popularity=tracks.loc[tracks['explicit']==False, 'popularity']

ex_hist, ex_edges = np.histogram(explicit_popularity, bins=25, range=[0,100])
ex_center=0.5*(ex_edges[1:]+ex_edges[:-1])

cl_hist, cl_edges = np.histogram(clean_popularity, bins=25, range=[0,100])
cl_center=0.5*(cl_edges[1:]+cl_edges[:-1])

# Counts and centres side by side; `columns` pins the column order.
histo=pd.DataFrame({'explicit': ex_hist,
                    'ex_center': ex_center,
                    'clean': cl_hist,
                    'cl_center': cl_center},
                   columns=['explicit', 'ex_center', 'clean', 'cl_center'])

In [11]:
# Stacked bar chart of the explicit / clean popularity histograms in `histo`,
# followed by the mean and np.std of popularity for each group.
from bokeh.plotting import figure, show, vplot
from bokeh.charts import Histogram,Bar, output_file, show, output_notebook
from bokeh.charts.attributes import cat, color
from bokeh.charts.operations import blend

# NOTE(review): this figure is replaced by the Bar chart assigned to `p` on
# the next statement and is never drawn on.
p = figure(width=300, height=300)

# blend() stacks the 'explicit' and 'clean' count columns under a new
# 'typer' label, which drives both the stacking and the red/blue colouring.
p = Bar(histo, label='ex_center', 
        values=blend('explicit', 'clean', labels_name='typer'),  
        stack=cat(columns='typer', sort=False),
        color=color(columns='typer', palette=['red', 'blue'],sort=False), 
        legend='top_right', xlabel='Popularity', ylabel='Number',
        tooltips=[('number', '@typer'), ('popularity', '@ex_center')])


# Summary statistics are taken over all rows of `tracks`, split on the
# 'explicit' flag.
print 'Mean Popularity (Explicit): ', np.mean(tracks.loc[np.where(tracks['explicit']==True)[0],'popularity']),'+-',np.std(tracks.loc[np.where(tracks['explicit']==True)[0],'popularity'])
print 'Mean Popularity (Clean): ', np.mean(tracks.loc[np.where(tracks['explicit']==False)[0],'popularity']),'+-',np.std(tracks.loc[np.where(tracks['explicit']==False)[0],'popularity'])

output_notebook()

# Display is disabled here; the chart is only built.
#show(p)

Mean Popularity (Explicit):  58.9596774194 +- 12.2332170706
Mean Popularity (Clean):  31.871559633 +- 12.1656011646


In [31]:
# Outline histogram of the same explicit / clean popularity counts, drawn
# with quad glyphs straight from the np.histogram counts and bin edges.
from bokeh.plotting import figure, show, vplot
from bokeh.charts import Histogram,Bar, output_file, show, output_notebook
from bokeh.charts.attributes import cat, color
from bokeh.charts.operations import blend
from bokeh.models import ColumnDataSource, HoverTool, BoxZoomTool, ResetTool    

p = figure(width=400, height=300)

# One rectangle per bin: left/right are consecutive bin edges, top is the count.
p.quad(top=ex_hist, bottom=0, left=ex_edges[:-1],right=ex_edges[1:],\
       fill_color="white",line_color="red", legend="Explicit")
# Drawn second, so it sits on top; only this layer sets fill_alpha.
p.quad(top=cl_hist, bottom=0, left=cl_edges[:-1], right=cl_edges[1:],\
      fill_color="white", line_color="blue", fill_alpha=0.2, legend="Clean")

p.legend.location="top_right"
p.xaxis.axis_label="Popularity"
p.yaxis.axis_label="Number of Tracks"

# Same summary statistics as printed for the bar-chart version of this plot.
print 'Mean Popularity (Explicit): ', np.mean(tracks.loc[np.where(tracks['explicit']==True)[0],'popularity']),'+-',np.std(tracks.loc[np.where(tracks['explicit']==True)[0],'popularity'])
print 'Mean Popularity (Clean): ', np.mean(tracks.loc[np.where(tracks['explicit']==False)[0],'popularity']),'+-',np.std(tracks.loc[np.where(tracks['explicit']==False)[0],'popularity'])

output_notebook()

show(p)

Mean Popularity (Explicit):  58.9596774194 +- 12.2332170706
Mean Popularity (Clean):  31.871559633 +- 12.1656011646


In [12]:
from bokeh.embed import components
# Split the current figure `p` into its script part and its placeholder <div>.
script, div = components(p)

print(div)

# The context manager closes the file even if the write raises.
with open('p1.js', 'w') as f:
    f.write(script)


<div class="plotdiv" id="50ccee83-62a6-4b3b-aabe-5fc03d240c39"></div>


Overall, the explicit versions of Kanye songs are significantly more popular than the clean versions. I suppose that makes 
sense in a go-big-or-go-home sort of way. If you're going to listen to Kanye, listen to KANYE. 

However, the distributions show that some clean versions of songs are as popular as other explicit versions. Let's look at how the difference in popularity between song versions varies by song. 

In [13]:
## Need to clean up song titles such that I can group them cleanly in pandas
pd.set_option('display.max_rows', 200)
# regex=False matches the literal text '(Edited)'.  Read as a regular
# expression, the parentheses form a capture group, so the pattern matched
# any title containing 'Edited' and those titles were then truncated too.
indexer=np.where(tracks['name'].str.contains('(Edited)', regex=False)==True)[0]
tracks['ed_name']=tracks['name']
# NOTE(review): the fixed 24-character cut assumes every edited title ends in
# the same suffix; confirm against the actual track names.
tracks.loc[indexer, 'ed_name']=tracks.loc[indexer, 'name'].str[:-24]
# Drop any whitespace left at the end after the cut.
tracks['ed_name']=tracks['ed_name'].str.rstrip()




In [14]:
# `tracks_temp` is another name for `tracks` (no copy is made), so the two
# helper columns added here also appear on `tracks`.
tracks_temp=tracks
tracks_temp['pop_max']=tracks['popularity']
tracks_temp['pop_diff']=tracks['popularity']


def _popularity_spread(values):
    """Gap between the most and least popular version of one title."""
    return np.max(values)-np.min(values)


grouped=tracks_temp[['ed_name', 'album','popularity', 'pop_max', 'pop_diff']].groupby('ed_name')

# Per cleaned title: keep the first name/album/popularity seen, the highest
# popularity, and the spread across versions; then renumber the rows from 0.
pop_diff=grouped.agg({'ed_name' : 'first', 'album': 'first', 'popularity' : 'first', 'pop_max' : 'max', 'pop_diff' : _popularity_spread}).reset_index(drop=True)

In [15]:
# Keep only titles whose versions differ in popularity; a spread of zero
# means a single version (or versions with identical popularity).
pop_diff_ed=pop_diff[pop_diff['pop_diff']!=0]

In [16]:
from bokeh.charts import Scatter, output_file, show
from bokeh.models import ColumnDataSource, HoverTool, BoxZoomTool, ResetTool    
from scipy import stats 

# One point per multi-version song: x is the highest popularity among its
# versions, y is the gap between its most and least popular version.
source = ColumnDataSource(
        data=dict(
            x=pop_diff_ed['pop_max'].astype(int),
            y=pop_diff_ed['pop_diff'].astype(int),
            album=pop_diff_ed['album'].astype(str),
            song=pop_diff_ed['ed_name'].astype(object),
            per=pop_diff_ed['pop_diff']/pop_diff_ed['pop_max'],
        )
    )

TOOLS = [BoxZoomTool(), ResetTool(),HoverTool(tooltips=[("song", "@song"),])]

p = figure(plot_width=600, plot_height=500, tools=TOOLS)
p.xaxis.axis_label = "Popularity (Explicit Version)"
p.yaxis.axis_label = "Difference in Popularity"

# Least-squares trend line evaluated at x = 0..99.
slope, intercept, r_value, p_value, std_err = stats.linregress(source.data['x'],source.data['y'])
# The builtin `float` gives the same dtype as the `np.float` alias, which is
# deprecated and absent from newer NumPy releases.
xvalue=np.arange(0, 100, dtype=float)
yvalue=slope*xvalue+intercept

p.line(xvalue, yvalue, line_dash="4 4", line_width=2, color='red')


p.circle('x', 'y', size=10, color='black', source=source)

#show(p)


<bokeh.models.renderers.GlyphRenderer at 0x1115a7b10>

In [17]:
from bokeh.embed import components
# Split the current figure `p` into its script part and its placeholder <div>.
script, div = components(p)

print(div)

# The context manager closes the file even if the write raises.
with open('p2.js', 'w') as f:
    f.write(script)


<div class="plotdiv" id="9439fc97-d77b-466b-852f-214bdff06354"></div>


In general, the difference in popularity between the explicit and clean versions increases as songs get more popular. I suppose this makes sense, as the popularity index takes into account the number of plays. 

The biggest discrepancy is for the most popular song, 'Ni**as in Paris.' And for that song, I can't even imagine the clean version!

For the rest of this analysis, I'm going to use just the explicit versions of tracks when available. There is also a Deluxe version of one album, 'Watch the Throne'. I'll remove that one as well for now, and stick with the regular explicit version. 

In [18]:
# Explicit tracks from albums whose name does not contain 'Deluxe',
# renumbered from 0.
is_explicit_track=tracks['explicit']==True
not_deluxe_album=tracks['album'].str.contains('Deluxe')==False
explicit=tracks.loc[is_explicit_track & not_deluxe_album].reset_index(drop=True)

Now that I've trimmed all the tracks down to the explicit versions, I want to see how Kanye alone fares against Kanye and Friends. Let's see how the distribution of popularity compares for these two categories. 

In [29]:
# Popularity histograms (25 equal-width bins over 0-100) for explicit tracks
# with a single artist versus tracks with more than one artist.
solo_popularity=explicit.loc[explicit['artist_num']==1, 'popularity']
joint_popularity=explicit.loc[explicit['artist_num']>1, 'popularity']

ex_hist, ex_edges = np.histogram(solo_popularity, bins=25, range=[0,100])
ex_center=0.5*(ex_edges[1:]+ex_edges[:-1])
## Fractional: 
#ex_hist=(ex_hist.astype(float))/float(sum(ex_hist))

cl_hist, cl_edges = np.histogram(joint_popularity, bins=25, range=[0,100])
cl_center=0.5*(cl_edges[1:]+cl_edges[:-1])
## Fractional: 
#cl_hist=(cl_hist.astype(float))/float(sum(cl_hist))

# Counts and centres side by side; `columns` pins the column order.
histo=pd.DataFrame({'alone': ex_hist,
                    'ex_center': ex_center,
                    'collaboration': cl_hist,
                    'cl_center': cl_center},
                   columns=['alone', 'ex_center', 'collaboration', 'cl_center'])


In [31]:
from bokeh.plotting import figure, show, vplot
from bokeh.charts import Histogram,Bar, output_file, show, output_notebook
from bokeh.charts.attributes import cat, color
from bokeh.charts.operations import blend

p = figure(width=300, height=300)

p = Bar(histo, label='ex_center', 
        values=blend('alone', 'collaboration', labels_name='typer'),  
        stack=cat(columns='typer', sort=False),
        color=color(columns='typer', palette=['red', 'blue'], sort=False), 
        legend='top_right', xlabel='Popularity', ylabel='Number',
        tooltips=[('number', '@typer'), ('popularity', '@ex_center')])

print 'Average Track Popularity (Kanye Alone): ', np.mean(explicit.loc[np.where(explicit['artist_num']==1)[0],'popularity']), '+-', np.std(tracks.loc[np.where(tracks['artist_num']==1)[0],'popularity'])
print 'Average Track Popularity (with Collaborators): ', np.mean(explicit.loc[np.where(explicit['artist_num']!=1)[0],'popularity']), '+-', np.std(tracks.loc[np.where(tracks['artist_num']!=1)[0],'popularity'])

output_notebook()
#show(p)

Average Track Popularity (Kanye Alone):  61.1818181818 +- 19.8423146715
Average Track Popularity (with Collaborators):  56.5476190476 +- 15.8097893281


In [32]:
from bokeh.embed import components
# Split the current figure `p` into its script part and its placeholder <div>.
script, div = components(p)

print(div)

# The context manager closes the file even if the write raises.
with open('p3.js', 'w') as f:
    f.write(script)


<div class="plotdiv" id="f648aa69-f7e3-4c56-9150-24be663d6398"></div>


In general, the distributions of popularity for Kanye with and without collaborators are pretty similar, although Kanye alone is slightly more popular. 

Let's see which other musicians have the largest effect on song popularity... 

In [22]:
# Per-collaborator summary built from the explicit, non-Deluxe tracks only.
collab_info=make_collab_list(explicit, 'Kanye West')

In [23]:
# Running number 0..n-1 per collaborator, used as the x position when plotting.
collab_info['count']=list(range(len(collab_info)))

Now, I'm going to look at the mean popularity of songs for each collaborator and compare it to the popularity of Kanye alone, e.g., who drags him down and who raises him up. 

In [27]:
# Scatter of mean track popularity per collaborator, with the mean and the
# mean +- np.std of Kanye's single-artist explicit tracks as reference lines.
from bokeh.charts import Scatter, output_file, show
from bokeh.models import ColumnDataSource, HoverTool, BoxZoomTool, ResetTool    

# NOTE(review): y is truncated to an integer here, while the gold highlight
# below uses the untruncated 'mean_pop', so the two markers for JAY Z can sit
# at slightly different heights.
source = ColumnDataSource(
        data=dict(
            x=collab_info['count'],
            y=collab_info['mean_pop'].astype(int),
            person=collab_info['collaborator'].astype(object),
        )
    )

TOOLS = [BoxZoomTool(), ResetTool(), HoverTool(tooltips=[("person","@person")])]

# Baseline: popularity of explicit tracks credited to a single artist.
kanye_mean=np.mean(explicit.loc[np.where(explicit['artist_num']==1)[0],'popularity'])
kanye_std=np.std(explicit.loc[np.where(explicit['artist_num']==1)[0],'popularity'])

p = figure(plot_width=600, plot_height=500, tools=TOOLS)
p.xaxis.axis_label = "Collaborator"
p.yaxis.axis_label = "Mean Popularity of Collaboration"
# Larger gold marker drawn first so it shows as a ring behind JAY Z's black dot.
p.circle(collab_info.loc[np.where(collab_info['collaborator']=='JAY Z')[0],'count'], collab_info.loc[np.where(collab_info['collaborator']=='JAY Z')[0],'mean_pop'], size=15, color='gold')
p.circle('x', 'y', size=10, color='black', source=source)
# The x values are only running numbers, so their tick labels are hidden.
p.xaxis.major_label_text_font_size = '0pt'

# NOTE(review): the reference lines end at a hard-coded x of 45; confirm this
# still covers every collaborator (len(collab_info)) when the data changes.
p.line([-1,45],[kanye_mean, kanye_mean], line_width=2, color='red')
p.line([-1,45],[(kanye_mean+kanye_std), (kanye_mean+kanye_std)], line_dash="4 4", line_width=2, color='red')
p.line([-1,45],[(kanye_mean-kanye_std), (kanye_mean-kanye_std)], line_dash="4 4", line_width=2, color='red')


#show(p)

<bokeh.models.renderers.GlyphRenderer at 0x10105fd90>

In [28]:
from bokeh.embed import components
# Split the current figure `p` into its script part and its placeholder <div>.
script, div = components(p)

print(div)

# The context manager closes the file even if the write raises.
with open('p4.js', 'w') as f:
    f.write(script)


<div class="plotdiv" id="3bcfcc65-5dcc-4f64-9d4d-1221b9e19046"></div>


Most of the collaborative tracks are well within the range of popularity for Kanye alone, except for that one little guy. Which leads to the all-important question: do Kanye fans hate Chris Martin?!

The song 'Homecoming' is from the album 'Graduation.' 

In [33]:
# Albums whose name contains neither 'Deluxe' nor 'Edited', renumbered from 0,
# plus a running album counter used as the x position when plotting.
album_names=kw_albums['name']
keep_album=(album_names.str.contains('Deluxe')==False) & (album_names.str.contains('Edited')==False)
explicit_albums=kw_albums.loc[keep_album].reset_index(drop=True)
explicit_albums['count']=list(range(len(explicit_albums)))

In [34]:
# Release year = first four characters of the release date, as an integer.
explicit_albums['year']=explicit_albums['release_date'].astype(str).str.slice(0, 4).astype(int)

In [35]:
# Scatter of album popularity against the running album counter, with the
# mean and mean +- np.std across the explicit albums as reference lines.
from bokeh.charts import Scatter, output_file, show
from bokeh.models import ColumnDataSource, HoverTool, BoxZoomTool, ResetTool    

source = ColumnDataSource(
        data=dict(
            x=explicit_albums['count'],
            y=explicit_albums['popularity'],
            album=explicit_albums['name'].astype(object),
        )
    )

TOOLS = [BoxZoomTool(), ResetTool(), HoverTool(tooltips=[("album","@album")])]

meaner=np.mean(explicit_albums['popularity'])
stder=np.std(explicit_albums['popularity'])

p = figure(plot_width=600, plot_height=500, tools=TOOLS)
p.xaxis.axis_label = "Album"
p.yaxis.axis_label = "Popularity"
# Cyan ring behind the album at row label 5.
# NOTE(review): row 5 is 'Graduation (Explicit Version)' in the album table
# printed in this notebook; the hard-coded label breaks if the list changes.
p.circle(explicit_albums.loc[5,'count'], explicit_albums.loc[5,'popularity'], size=15, color='cyan')
p.circle('x', 'y', size=10, color='black', source=source)
# The x values are only running numbers, so their tick labels are hidden.
p.xaxis.major_label_text_font_size = '0pt'

# Horizontal reference lines: mean (solid) and mean +- std (dashed).
p.line([-1,10],[meaner, meaner], line_width=2, color='red')
p.line([-1,10],[(meaner+stder), (meaner+stder)], line_dash="4 4", line_width=2, color='red')
p.line([-1,10],[(meaner-stder), (meaner-stder)], line_dash="4 4", line_width=2, color='red')


#show(p)

<bokeh.models.renderers.GlyphRenderer at 0x101058250>

In [39]:
# DataFrame.sort is deprecated and absent from later pandas releases;
# sort_values is the replacement.  It returns a sorted copy for display and
# leaves `explicit` unchanged.
explicit.sort_values(['album', 'name'])

Unnamed: 0,album,name,track_id,artist_num,popularity,explicit,artists_on_track,ed_name,pop_max,pop_diff
60,Graduation (Explicit Version),Barry Bonds,7kXINLuqpicfE1sDCZ3Xwv,2,56,True,"Kanye West, Lil Wayne",Barry Bonds,56,56
66,Graduation (Explicit Version),Big Brother,19PtHjgLtiKGsrRoC1f9IF,1,55,True,,Big Brother,55,55
59,Graduation (Explicit Version),Can't Tell Me Nothing,4ImL3v98u2BLkwnyQDjfRm,1,69,True,,Can't Tell Me Nothing,69,69
55,Graduation (Explicit Version),Champion,7hNx9Dynz2fRO41L9AEVA8,1,62,True,,Champion,62,62
61,Graduation (Explicit Version),Drunk and Hot Girls,1xMT0HHQbQFmGJXDq1ApgX,2,55,True,"Kanye West, Mos Def",Drunk and Hot Girls,55,55
63,Graduation (Explicit Version),Everything I Am,1XQhZctQWzkznclbmbE7FQ,2,59,True,"Kanye West, DJ Premier",Everything I Am,59,59
62,Graduation (Explicit Version),Flashing Lights,0ua0Go4NN4Td7l0Zre6Ce3,1,66,True,,Flashing Lights,66,66
58,Graduation (Explicit Version),Good Life,1fLdeDTrJWNkwOeFyAVLvF,2,67,True,"Kanye West, T-Pain",Good Life,67,67
54,Graduation (Explicit Version),Good Morning,27eO3EGKIUU7yug1eOxUZu,1,63,True,,Good Morning,63,63
65,Graduation (Explicit Version),Homecoming,2iaCM7WvOknQI1230hA9eK,2,14,True,"Kanye West, Chris Martin",Homecoming,14,14


In [36]:
from bokeh.embed import components
# Split the current figure `p` into its script part and its placeholder <div>.
script, div = components(p)

print(div)

# The context manager closes the file even if the write raises.
with open('p5.js', 'w') as f:
    f.write(script)


<div class="plotdiv" id="df57716e-80ec-4840-974a-fe8b1a420da6"></div>


Here I have marked 'Graduation' with a cyan border. As you can see, the album itself is above average in popularity. Additionally, the average is probably skewed a bit high, as 'Life of Pablo' is a fairly new release and so is probably quite popular at the moment. 

In [217]:
# Scatter of album popularity against release year, with the mean and
# mean +- np.std across the explicit albums as reference lines.
from bokeh.charts import Scatter, output_file, show
from bokeh.models import ColumnDataSource, HoverTool, BoxZoomTool, ResetTool    

source = ColumnDataSource(
        data=dict(
            x=explicit_albums['year'],
            y=explicit_albums['popularity'],
            album=explicit_albums['name'].astype(object),
        )
    )

TOOLS = [BoxZoomTool(), ResetTool(), HoverTool(tooltips=[("album","@album")])]

meaner=np.mean(explicit_albums['popularity'])
stder=np.std(explicit_albums['popularity'])

p = figure(plot_width=600, plot_height=500, tools=TOOLS)
p.xaxis.axis_label = "Release Year"
p.yaxis.axis_label = "Popularity"
# Cyan ring behind the album at row label 5.
# NOTE(review): row 5 is 'Graduation (Explicit Version)' in the album table
# printed in this notebook; the hard-coded label breaks if the list changes.
p.circle(explicit_albums.loc[5,'year'], explicit_albums.loc[5,'popularity'], size=15, color='cyan')
p.circle('x', 'y', size=10, color='black', source=source)
# NOTE(review): this hides the x tick labels, so the release years themselves
# are not readable even though the axis is labelled "Release Year" — confirm
# that is intended.
p.xaxis.major_label_text_font_size = '0pt'

# Horizontal reference lines across 2000-2017: mean (solid), mean +- std (dashed).
p.line([2000,2017],[meaner, meaner], line_width=2, color='red')
p.line([2000,2017],[(meaner+stder), (meaner+stder)], line_dash="4 4", line_width=2, color='red')
p.line([2000,2017],[(meaner-stder), (meaner-stder)], line_dash="4 4", line_width=2, color='red')


#show(p)

<bokeh.models.renderers.GlyphRenderer at 0x11494bad0>

There is no clear correlation between the age of an album and its popularity. But 'Life of Pablo' is so recent that I'm guessing its value is a bit biased. I mean, it's good, but I'm not sure it's 2-sigma good. 

Anyway, you can see that 'Graduation' as an album is pretty popular. So what's up with that Chris Martin collaboration?!

In [216]:
### Want to see how it did in the RIAA rankings. 

Unnamed: 0,name,id,release_date,popularity,count,year
0,The Life Of Pablo,0WAuEfa5Lmg72xfydLVcca,2016-04-04,95,0,2016
1,Yeezus,7D2NdGvBHIavgLhmcwhluK,2013-06-18,78,1,2013
2,Watch The Throne (Explicit Version),7mCeLbChyegbRwwKK5shJs,2011-08-12,70,2,2011
3,My Beautiful Dark Twisted Fantasy (Explicit Ve...,20r762YmB5HeofjMCiPMLv,2010-11-19,81,3,2010
4,808s & Heartbreak,2JK89jt4unItFroOr0kT3g,2008-11-24,75,4,2008
5,Graduation (Explicit Version),3SZr5Pco2oqKFORCP3WNj9,2007-09-11,81,5,2007
6,Late Registration (Explicit Version),5ll74bqtkcXlKE7wwkMq4g,2005-09-30,79,6,2005
7,The College Dropout (Explicit),3ff2p3LnR6V7m6BinwhNaQ,2004-02-13,77,7,2004


In [270]:
# Download the Kanye West discography article and parse it into `soup`.
from bs4 import BeautifulSoup
# urllib2 is the Python 2 HTTP module.
import urllib2
wiki = "https://en.wikipedia.org/wiki/Kanye_West_discography#Singles"
header = {'User-Agent': 'Mozilla/5.0'} #Needed to prevent 403 error on Wikipedia
req = urllib2.Request(wiki,headers=header)
# NOTE(review): the response object is never closed explicitly.
page = urllib2.urlopen(req)
# NOTE(review): no parser is named, so BeautifulSoup picks one itself; naming
# one would keep the parse consistent across machines.
soup = BeautifulSoup(page)

In [296]:
# The list of singles is the 5th table on the wikipage
# NOTE(review): the positional index [4] depends on the article's current
# layout; re-check it if the page changes.
singles = soup.findAll("table", { "class" : "wikitable plainrowheaders" })[4]

In [542]:
# Collect the title of each single from the rows of the `singles` table.
# Only the 'title' column is filled in; the chart-rank and RIAA extraction
# below is disabled (it sits inside a string literal and never runs).
wiki_data=pd.DataFrame()

rows=singles.findAll("tr")
# Cell used as the "no Billboard ranking" marker by the disabled code below;
# it is taken from the first <td> of row 12.
nuller=(rows[12].findAll('td')[0])

# Rows 0-1 and the last row are skipped; the frame is labelled by the row
# number i, so its index starts at 2.
for i in range(2,len(rows)-1): 
    row=rows[i]
    # Song title: the 'title' attribute of the first link in the row header.
    wiki_data.loc[i,'title']=row.findAll("th")[0].findAll("a")[0].get('title')
    cells=row.findAll("td")
    # Unfinished draft of the US chart ranking / RIAA rating extraction, kept
    # as an inert string expression.
    """
    if ((cells[0]!=nuller)): 
        if ((cells[0].renderContents()>2000) & (cells[1]!=nuller)): 
            wiki_data.loc[i, 'us_rank']=int(cells[1].renderContents())
        else if (): 
            wiki_data.loc[i, 'us_rank']=int(cells[0].renderContents())

    ## pull out RIAA Rating
    #if (len(cells)>11): 
    #    if (cells[11].li!=None): 
            temp=(unicode(cells).strip())
            if (temp.find(u'RIAA: Gold')>-1): 
                wiki_data.loc[i,'riaa']='G'
            if (temp.find(u'RIAA: Platinum')>-1): 
                wiki_data.loc[i,'riaa']='P'
            if (temp.find(u'RIAA: 2x Platinum')>-1): 
                wiki_data.loc[i,'riaa']='2P'
            if (temp.find(u'RIAA: 3x Platinum')>-1): 
                wiki_data.loc[i,'riaa']='3P'
            if (temp.find(u'RIAA: 4x Platinum')>-1): 
                wiki_data.loc[i,'riaa']='4P'
            if (temp.find(u'RIAA: 5x Platinum')>-1): 
                wiki_data.loc[i,'riaa']='5P'
            if (temp.find(u'RIAA: 6x Platinum')>-1): 
                wiki_data.loc[i,'riaa']='6P'
            if (temp.find(u'RIAA: 7x Platinum')>-1): 
                wiki_data.loc[i,'riaa']='7P'
    """


In [497]:
"""
print i
row=rows[i]
cells=(row.findAll('td'))
print cells[1]
if (cells!=test) : 
    barf=cells[0]
    print 'Barf', barf
"""

"\nprint i\nrow=rows[i]\ncells=(row.findAll('td'))\nprint cells[1]\nif (cells!=test) : \n    barf=cells[0]\n    print 'Barf', barf\n"

In [543]:
## Giving up at the moment 
[2, ], ['G', '']

Unnamed: 0,title
2,Through the Wire
3,Slow Jamz
4,All Falls Down
5,Jesus Walks (Kanye West song)
6,The New Workout Plan
7,Diamonds from Sierra Leone
8,Gold Digger (Kanye West song)
9,Heard 'Em Say
10,Touch the Sky (Kanye West song)
11,Twista


I was trying to scrape Wikipedia to see whether I could compare songs on popularity and RIAA rating. Homecoming is certified Platinum. 

In [546]:
# Running number 0..n-1 per explicit track, used as the x position when plotting.
explicit['count']=list(range(len(explicit)))

In [551]:
from bokeh.charts import Scatter, output_file, show
from bokeh.models import ColumnDataSource, HoverTool, BoxZoomTool, ResetTool    

# One point per explicit track: x is the running track number, y its popularity.
source = ColumnDataSource(
        data=dict(
            x=explicit['count'],
            y=explicit['popularity'],
            track=explicit['name'].astype(object),
        )
    )

TOOLS = [BoxZoomTool(), ResetTool(), HoverTool(tooltips=[("Track","@track")])]

meaner=np.mean(explicit['popularity'])
stder=np.std(explicit['popularity'])

p = figure(plot_width=600, plot_height=500, tools=TOOLS)
p.xaxis.axis_label = "track"
p.yaxis.axis_label = "Popularity"
# Cyan ring behind 'Homecoming', drawn first so the black dot sits on top.
p.circle(explicit.loc[np.where(explicit['name']=='Homecoming')[0],'count'], explicit.loc[np.where(explicit['name']=='Homecoming')[0],'popularity'], size=15, color='cyan')
p.circle('x', 'y', size=10, color='black', source=source)
# The x values are only running numbers, so their tick labels are hidden.
p.xaxis.major_label_text_font_size = '0pt'

# The reference lines have to span every track ('count' runs from 0 to
# len(explicit)-1).  The former fixed end point of 10 stopped the lines after
# the first few tracks.
x_span=[-1, len(explicit)]
p.line(x_span,[meaner, meaner], line_width=2, color='red')
p.line(x_span,[(meaner+stder), (meaner+stder)], line_dash="4 4", line_width=2, color='red')
p.line(x_span,[(meaner-stder), (meaner-stder)], line_dash="4 4", line_width=2, color='red')


#show(p)

<bokeh.models.renderers.GlyphRenderer at 0x116bd1b10>

Kanye brags that Jay Z followed his lead by putting Chris Martin on a track (Kingdom Come, Beach Chair). Is this track similarly reviled? 

In [40]:
# Same pipeline as for Kanye: albums -> tracks -> explicit tracks from albums
# whose name does not contain 'Deluxe', renumbered from 0.
jz_albums=fetch_albums('JAY Z')
jz_tracks=fetch_tracks(jz_albums)
jz_keep=(jz_tracks['explicit']==True) & (jz_tracks['album'].str.contains('Deluxe')==False)
jz_explicit=jz_tracks.loc[jz_keep].reset_index(drop=True)

Querying Album Tracks
0
20
40
60
80
100
120
140
160
180
200
220
240
260


In [49]:
# Scatter of popularity for every explicit JAY Z track, with 'Beach Chair'
# highlighted and the mean and mean +- np.std drawn as reference lines.
# Running number 0..n-1 per track, used as the x position.
jz_explicit['count']=pd.Series(range(0, len(jz_explicit)))

source = ColumnDataSource(
        data=dict(
            x=jz_explicit['count'],
            y=jz_explicit['popularity'],
            track=jz_explicit['name'].astype(object),
        )
    )

TOOLS = [BoxZoomTool(), ResetTool(), HoverTool(tooltips=[("Track","@track")])]

meaner=np.mean(jz_explicit['popularity'])
stder=np.std(jz_explicit['popularity'])

p = figure(plot_width=600, plot_height=500, tools=TOOLS)
p.xaxis.axis_label = "track"
p.yaxis.axis_label = "Popularity"
# Cyan ring behind 'Beach Chair', drawn first so the black dot sits on top.
p.circle(jz_explicit.loc[np.where(jz_explicit['name']=='Beach Chair')[0],'count'], jz_explicit.loc[np.where(jz_explicit['name']=='Beach Chair')[0],'popularity'], size=15, color='cyan')
p.circle('x', 'y', size=10, color='black', source=source)
# The x values are only running numbers, so their tick labels are hidden.
p.xaxis.major_label_text_font_size = '0pt'

# NOTE(review): the reference lines end at a hard-coded x of 130; confirm this
# still covers every track (len(jz_explicit)) when the data changes.
p.line([-1,130],[meaner, meaner], line_width=2, color='red')
p.line([-1,130],[(meaner+stder), (meaner+stder)], line_dash="4 4", line_width=2, color='red')
p.line([-1,130],[(meaner-stder), (meaner-stder)], line_dash="4 4", line_width=2, color='red')


show(p)

Nope! 'Beach Chair' is actually a bit more popular than average for Jay Z. And just a note, apparently people don't like it when Hova goes a cappella...

In [50]:
from bokeh.embed import components
# Split the current figure `p` into its script part and its placeholder <div>.
script, div = components(p)

print(div)

# The context manager closes the file even if the write raises.
with open('p6.js', 'w') as f:
    f.write(script)


<div class="plotdiv" id="3f97368a-c199-4286-9add-dd5a441587a7"></div>
