# Hypothesis: A track with a higher average billboard rank will remain on the billboard chart longer.

If a track has a low average rank in the Billboard, it is highly unlikely to stay on the billboard for more than 19 weeks.

In [1]:
import numpy as np
import scipy.stats as stats
import csv
import seaborn as sns
import pandas as pd
import math
%matplotlib inline

#Used to ignore annoying warnings.
pd.options.mode.chained_assignment = None

In [2]:
#Read in the data from the file.
df = pd.read_csv('assets/billboard.csv', encoding='latin-1')
flag = True

#Renames the first 9 weeks so that I can then sort it easier later on.
df.rename(columns={'x1st.week': 'x01st.week', \
                   'x2nd.week': 'x02nd.week', \
                  'x3rd.week': 'x03rd.week', \
                  'x4th.week': 'x04th.week', \
                  'x5th.week': 'x05th.week', \
                  'x6th.week': 'x06th.week', \
                  'x7th.week': 'x07th.week', \
                  'x8th.week': 'x08th.week', \
                  'x9th.week': 'x09th.week'}, inplace=True)

In [3]:
#Function converting columns to INT or Float if there's a NaN
def columnInt(ghost):
    k=0
    for y in ghost:
        df[ghost[k]] = pd.to_numeric(df[ghost[k]], errors='coerce')
        k+=1

#Turns all * to NaN
def replaceNulls(dang):
    for y in dang:
        for x in df[y]:
            if x == '*':
                return np.nan
            else:
                return x

#Converts time into int(seconds).
def timeSec():
    j=0
    for x in df['time']:
        seconds = int(x[2:4])
        seconds = seconds + int(x[0]) * 60
        df['time'][j] = seconds
        j+=1
    columnInt(df.columns[3:4].values)

#Converts unicode to string.
def uniStr(awwheck):
    j=0
    p=0
    for y in awwheck:
        for x in df[awwheck[p]]:
            df[awwheck[p]][j] = x.encode('ascii','ignore')
            j+=1
        p+=1
        j=0

#Combines R & B and R&B
def genreFix():
    j=0
    for x in df['genre']:
        if x == 'R & B':
            df['genre'][j] = 'R&B'
            j+=1
        else:
            j+=1


#Adds the average rank of the track on the chart to the dataframe
def add_rank():
    track_rank_mean = []
    for x in range(317):
         track_rank_mean.append(np.mean(dftest[x][7:83]))

    track_rank_mean = pd.Series(track_rank_mean).rename("Rank Average")
    track_rank_mean = track_rank_mean.to_frame().T


#Adds the count of weeks the track was on the top 100 chart.
def add_count():
    test_list = []
    relation_list = []
    for x in range(317):
        track_count = 0

        for y in dftest[x][weeks].values.tolist():
            if y > 0:
                track_count+=1

        test_list.append(track_count)
    ghj = pd.Series(test_list).rename("Rank Count")




In [4]:
#Flag is here to make sure I don't run this section more than once without reading in the CSV
if flag == True:
    #Runs the time into INT(seconds) function
    timeSec()

    #Stores the weeks columns in weeks
    weeks = df.columns[7:83]
    #Stores the genre, date.entered, and date.peaked into hk
    hk = df.columns[4:7]
    hj = df.columns[1:3]
    
    #Runs the replaceNulls function
    replaceNulls(weeks)
    
    #Runs the column to INT function
    columnInt(weeks)
    
    #Turns the column from Unicode to String
    uniStr(hk)
    uniStr(hj)
    
    #Runs the genre fix function.
    genreFix()
    
    df['genre'] = df['genre'].astype(str)
    
    #Changes the format of the data and is the primary dataframe used.
    dftest = df.T
    track_rank_mean = []
    add_rank()
    dftest = pd.concat([dftest, track_rank_mean])
    
    add_count()
    dftest = pd.concat([dftest, ghj.to_frame().T])
    
    flag = False

TypeError: cannot concatenate a non-NDFrame object

In [None]:
df.head()

In [None]:
#Function to make a scatter plot.
def scatter_graph(title, hjj, xlab, ylab):
    ax = hjj.plot(x='Rank Count', y='Rank Average', kind='scatter', color='dodgerblue',\
                     figsize=(15,7), s=250, fontsize=20)
    ax.set_title(title, fontsize=24, y=1.01)
    ax.set_ylim(0,101)
    ax.set_xlim(0,60)
    ax.set_xlabel(xlab, fontsize=18)
    ax.set_ylabel(ylab, fontsize=18)
    ax.invert_yaxis();

In [None]:
#Takes the Rank Count and Rank Average rows and puts it into a new DataFrame then graphs it.
test_dff = dftest[83:85].T
scatter_graph("Average Track Rank by Number of Weeks on Chart", test_dff,\
              "Number of Weeks on Chart", "Average Rank of Track on Chart")

In [None]:
#Stores the tracks with over 20 weeks on the chart into tester_dff
tester_dff = test_dff[test_dff['Rank Count']>20]
test_dff

In [None]:
#Stores the tracks with less than 21 weeks on the chart into teste_dff
teste_dff = test_dff[test_dff['Rank Count']<=20]

In [None]:
#Found the average of the tracks above and below 20 weeks
print np.mean(tester_dff['Rank Average'])
print np.mean(teste_dff['Rank Average'])

In [None]:
#Calculated the number of tracks each genre had with over 20 weeks.
dtestf = pd.concat([test_dff, df['genre'].to_frame()], axis=1)
pam = dtestf[dtestf['Rank Count']>20]
test_array = np.unique(pam['genre'], return_counts=True)
genre_name = pd.Series(test_array[0]).rename("Names")
genre_number = pd.Series(test_array[1]).rename("Number")

testert = [genre_name, genre_number]
genre_df = pd.DataFrame(testert).T
genre_df

In [None]:
#Graphed the above data into bar graph.
ax = genre_df.plot(x='Names', y='Number', kind="bar", color='dodgerblue',\
                 figsize=(15,7), fontsize=20)
ax.set_title("Genres of Tracks on Chart Over 20 Weeks", fontsize=24, y=1.01)
ax.set_ylim(0,30)
ax.set_xticklabels(test_array[0],rotation=45)
ax.set_xlabel("Genres", fontsize=18)
ax.set_ylabel("Number of Tracks", fontsize=18);

In [None]:
testsdf = dftest[7:83]

In [None]:
#First graph I made with the data which showed a mysterious falling off around week 20
ax = testsdf.plot(figsize=(15,7), fontsize=10, legend=False)

ax.set_title("Tracks in Top 100", fontsize=21, y=1.01)

ax.set_xticklabels([int(y) for y in ax.get_xticks()])
ax.set_ylim(0,100)

ax.invert_yaxis();

In [None]:
hanks = []

i=0
for y in weeks:
    for x in df[y]:
        if x > 0:
            i+=1
    hanks.append(i)
    i=0
tanks = pd.Series(hanks)

In [None]:
#graphed the track decay rate and found a large drop between weeks 19 and 20
ax = tanks.plot(color='dodgerblue', figsize=(15,7), fontsize=20)
ax.set_title("Track Fall Off Rate", fontsize=25, y=1.01)
ax.set_ylabel("Number of Tracks", fontsize=20)
ax.set_xlabel("Number of Weeks on Chart", fontsize=20)
ax.set_ylim(1,320);

In [None]:
#Used a pivot_table to find the averages of each genre per week.
t_hanks = pd.pivot_table(df, index=['genre'], values=weeks[0:20])
t_hanks2 = t_hanks.iloc[:,:].T
t_hanks2.head()

In [None]:
pr = t_hanks2.columns[:]
genre_average_on_chart = np.mean(t_hanks2[pr])

In [None]:
#Graphed the genre averages
ax = genre_average_on_chart.plot(figsize=(15,7), kind="bar", fontsize=10)

ax.set_title("Average Rank of Genre on Billboard", fontsize=21, y=1.01)
ax.set_ylim(0,101)