# Billboard Top 100 Charts Analysis

Frank Chen

In [1]:
# Import all libraries
import pandas as pd
import numpy as np
import seaborn as sns

# Standard plotly imports
import plotly.graph_objects as go

In [None]:
# Load the two source CSV files into DataFrames
bboard_data = pd.read_csv(filepath_or_buffer='../data/hot_100.csv')
yt_data = pd.read_csv(filepath_or_buffer='../data/yt_us_videos.csv')

In [None]:
# Preview the last five rows of the chart data
bboard_data.tail(n=5)

# Initial Data Cleaning

The first step is to split the `WeekID` field into separate `year`, `month`, and `day` columns to make later analysis easier.

In [None]:
# Convert the WeekID strings (month/day/year) into pandas datetimes so the
# individual date parts can be extracted afterwards.
bboard_data = bboard_data.assign(
    WeekID=pd.to_datetime(bboard_data['WeekID'], format='%m/%d/%Y')
)

In [None]:
# Add year / month / day columns derived from the parsed WeekID
bboard_data = bboard_data.assign(
    year=bboard_data['WeekID'].dt.year,
    month=bboard_data['WeekID'].dt.month,
    day=bboard_data['WeekID'].dt.day,
)
# Persist the cleaned data. The row index is written as well (pandas default),
# so it comes back as an 'Unnamed: 0' column when this file is read again.
bboard_data.to_csv(path_or_buf='cleaned_bboard_data.csv')

# Analysis

We will first focus our analysis on the songs that reached no. 1 on the Billboard Hot 100.

### I. Analysis of Songs in terms of Popularity and Relevance

This dataset provides potential for rich analysis into both the popularity **and** relevance of no. 1 songs on the Billboard charts. We define the two terms below:

**Popularity**: how much of its time on the chart the song spent at no. 1. We represent this as a fraction between 0 and 1: the number of weeks the song was at no. 1 divided by the total number of weeks it spent on the chart.

**Relevance**: how long the song stayed in the Billboard charts. We represent this as the total number of weeks the song spent on the chart.

I will prepare a chart that represents three dimensions of data: the x-axis shows the songs, the y-axis shows the popularity of each song, and each data point is a bubble sized by the relevance of the song.

#### Step 1: Data Preparation

In [None]:
# Start the analysis from the cleaned data saved earlier
bboard_data = pd.read_csv(filepath_or_buffer='cleaned_bboard_data.csv')

In [None]:
# Use a pivot table to count, for every song title, how many rows it has at
# each chart position (one output column per 'Week Position' value).
# stackoverflow link: https://stackoverflow.com/questions/54527134/counting-column-values-based-on-values-in-other-columns-for-pandas-dataframes
# NOTE(review): the index is the song title only, so different songs that share
# a title are tallied together -- confirm whether 'Performer' should be part of
# the key (the later merge and de-duplication also key on 'Song' alone).
bboard_data['count'] = 1  # helper column: every row contributes 1 to the tally
result = bboard_data.pivot_table(
    index=['Song'], columns='Week Position', values='count',
    fill_value=0,  # positions a song never held become 0 instead of NaN
    # Name the aggregation as a string: passing np.sum is deprecated in recent
    # pandas (it emits a FutureWarning) and 'sum' produces the same result.
    aggfunc='sum',
)
# save result to csv for future use
result.to_csv('song_position_count.csv')

In [None]:
# Load the per-song position counts saved by the pivot step
song_position_count = pd.read_csv(filepath_or_buffer="song_position_count.csv")

In [None]:
# Narrow the counts down to the song title and its tally at position 1
# (the column label is the string '1' because it was read back from a CSV header)
song_position_count = song_position_count.loc[:, ['Song', '1']]
song_position_count.to_csv(path_or_buf='song_no1_count.csv')

In [None]:
# Attach each song's no. 1 count to every chart row with a left join on 'Song'
song_no1_count = pd.read_csv(filepath_or_buffer='song_no1_count.csv')
bboard_data = bboard_data.merge(song_no1_count, how='left', on='Song')

In [None]:
# Drop the two leftover saved-index columns; the merge suffixed them _x / _y
# stackoverflow link: https://stackoverflow.com/questions/14940743/selecting-excluding-sets-of-columns-in-pandas
bboard_data = bboard_data.drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y'])

In [None]:
# Reduce to one row per song title: after an ascending sort on 'Weeks on Chart',
# keeping the last duplicate retains the row with the highest week count.
# stackoverflow link: https://stackoverflow.com/questions/50283775/python-pandas-keep-row-with-highest-column-value
bboard_data_tmp = (
    bboard_data
    .sort_values(by='Weeks on Chart')
    .drop_duplicates(subset=['Song'], keep='last')
)

In [None]:
# Keep just the five columns the visualization needs, in this order
bboard_data_tmp = bboard_data_tmp.loc[
    :, ['Song', 'Performer', 'Weeks on Chart', '1', 'year']
]

In [None]:
# Give the columns presentation-friendly names ('Song' and 'Performer' stay as they are)
# stackoverflow link: https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
bboard_data_tmp = bboard_data_tmp.rename(columns={
    'Weeks on Chart': 'Relevance (Total Weeks on Chart)',
    '1': 'Count of no. 1',
    'year': 'Year',
})
# Popularity = weeks at no. 1 divided by total weeks on chart, rounded to 2 decimals
# stackoverflow link: https://stackoverflow.com/questions/26133538/round-a-single-column-in-pandas
bboard_data_tmp['Popularity'] = (
    bboard_data_tmp['Count of no. 1']
    .div(bboard_data_tmp['Relevance (Total Weeks on Chart)'])
    .round(2)
)

In [None]:
# Write the popularity / relevance table to disk so later steps can reload it
bboard_data_tmp.to_csv(path_or_buf='bboard_song_pop_relevance.csv')

#### Step 2: Data Visualization

In [2]:
# Load the saved popularity / relevance table and discard the saved-index column
bboard_song_pop_relevance = (
    pd.read_csv(filepath_or_buffer='bboard_song_pop_relevance.csv')
    .drop(columns=['Unnamed: 0'])
)

In [3]:
# Restrict to the 100 rows with the highest 'Count of no. 1' and save them
bboard_song_pop_relevance_top100 = bboard_song_pop_relevance.nlargest(
    n=100, columns='Count of no. 1'
)
bboard_song_pop_relevance_top100.to_csv(
    path_or_buf='bboard_song_pop_relevance_top100.csv'
)

In [42]:
# Create a bubble chart: x = song, y = popularity (also used for the color),
# bubble area = relevance (total weeks on chart).
# plotly bubble chart docs: https://plot.ly/python/bubble-charts/
# plotly colorscale: https://plot.ly/python/v3/matplotlib-colorscales/
relevance_weeks = bboard_song_pop_relevance_top100['Relevance (Total Weeks on Chart)']

# Scale bubbles by area, using the sizeref formula from the plotly bubble-chart
# docs, so that the most relevant song is drawn max_bubble_px pixels wide.
max_bubble_px = 40
bubble_sizeref = 2.0 * relevance_weeks.max() / (max_bubble_px ** 2)

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=bboard_song_pop_relevance_top100['Song'],
    y=bboard_song_pop_relevance_top100['Popularity'],
    mode='markers',
    marker=dict(
        # The size is given once, here. Previously a constant size=16 in this
        # dict was silently overridden by a separate marker_size= argument.
        size=relevance_weeks,
        sizemode='area',
        sizeref=bubble_sizeref,
        sizemin=4,  # keep songs with few chart weeks visible
        cmax=1,
        cmin=0,
        color=bboard_song_pop_relevance_top100['Popularity'],
        colorbar=dict(
            title="Popularity"
        ),
        colorscale="magma"
    ),
    text=bboard_song_pop_relevance_top100['Performer'],
    # %{marker.size} shows the raw relevance value, not the scaled pixel size
    hovertemplate = "<b>%{x}</b><br><i>%{text}</i><br><br>Popularity: %{y}<br>Relevance: %{marker.size}",
))

fig.update_layout(
    title={
        'text': "Billboard #1 Songs in terms of Popularity & Relevance",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.update_xaxes(title_text='Song', tickangle=45)
fig.update_yaxes(title_text='Popularity')

fig.show()

#### Song Performance as a measure of how long the song remained in its peak position

#### Song Performance as a measure of velocity (how fast the song climbed to its peak position)

#### Song Performance as other streaming mediums are introduced (ex. YouTube)