In [1]:
import pandas as pd
import numpy as np

import plotly.offline as po
import plotly.graph_objs as go

from copy import deepcopy

# Enable inline plotly rendering for this notebook; connected=True makes the
# output load plotly.js from the web instead of embedding it in the file.
po.init_notebook_mode(connected=True)

In [2]:
# Load the raw chart export; the Date column is parsed into datetimes on read.
original_data = pd.read_csv(
    "data.csv",
    parse_dates=["Date"],
)
# Echo (rows, columns) as the cell output.
original_data.shape

(3441197, 7)

In [3]:
# Remove all rows with Region == "global".
# Boolean-mask indexing already returns a new DataFrame, so original_data stays
# untouched without deep-copying every row first.
df = original_data[original_data["Region"] != "global"]
df.shape

(3367397, 7)

In [4]:
# From https://www.kaggle.com/connerbrown/visualization-and-exploration
# Drop the rows that are missing a track name or an artist.
# Missing URLs are only reported by the count below; those rows are kept.

df_nans = df.isnull()
print(df_nans.sum())

# Index labels of the rows with a missing track name / artist.
track_name_nans = df.index[df_nans['Track Name']]
artist_nans = df.index[df_nans['Artist']]

# Percentage of rows lacking a track name that also lack an artist.
# This is computed on the row-aligned boolean masks: comparing the two label
# indexes element-wise raises as soon as they differ in length.
track_name_nan_count = df_nans['Track Name'].sum()
both_nan_count = (df_nans['Track Name'] & df_nans['Artist']).sum()
if track_name_nan_count:
    nans_overlap = both_nan_count / track_name_nan_count * 100.0
else:
    # Nothing is missing, so the ratio is undefined.
    nans_overlap = float('nan')
print("\nPercent Overlap: {}%".format(nans_overlap))

# Drop on both columns so the result is right even if the overlap is not 100%.
df = df.dropna(subset = ['Track Name', 'Artist'])
df.shape

Position        0
Track Name    641
Artist        641
Streams         0
URL             8
Date            0
Region          0
dtype: int64

Percent Overlap: 100.0%


(3366756, 7)

In [5]:
# Preview the first ten rows of df.
df.head(10)

Unnamed: 0,Position,Track Name,Artist,Streams,URL,Date,Region
0,1,Reggaetón Lento (Bailemos),CNCO,19272,https://open.spotify.com/track/3AEZUABDXNtecAO...,2017-01-01,ec
1,2,Chantaje,Shakira,19270,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,2017-01-01,ec
2,3,Otra Vez (feat. J Balvin),Zion & Lennox,15761,https://open.spotify.com/track/3QwBODjSEzelZyV...,2017-01-01,ec
3,4,Vente Pa' Ca,Ricky Martin,14954,https://open.spotify.com/track/7DM4BPaS7uofFul...,2017-01-01,ec
4,5,Safari,J Balvin,14269,https://open.spotify.com/track/6rQSrBHf7HlZjtc...,2017-01-01,ec
5,6,La Bicicleta,Carlos Vives,12843,https://open.spotify.com/track/0sXvAOmXgjR2QUq...,2017-01-01,ec
6,7,Ay Mi Dios,IAmChino,10986,https://open.spotify.com/track/6stYbAJgTszHAHZ...,2017-01-01,ec
7,8,Andas En Mi Cabeza,Chino & Nacho,10653,https://open.spotify.com/track/5mey7CLLuFToM2P...,2017-01-01,ec
8,9,Traicionera,Sebastian Yatra,9807,https://open.spotify.com/track/5J1c3M4EldCfNxX...,2017-01-01,ec
9,10,Shaky Shaky,Daddy Yankee,9612,https://open.spotify.com/track/58IL315gMSTD37D...,2017-01-01,ec


### Top 10 tracks worldwide throughout 2017, with their total stream counts

In [6]:
# Total streams per track name across all rows of df, largest first.
# Only Streams is aggregated: df also holds text and datetime columns, and
# pandas 2.0+ no longer silently drops such columns in groupby().sum().
# NOTE(review): rows are keyed by track name alone, so different songs that
# share a title are pooled together - confirm this is intended.
df_tracks_streams = (
    df.groupby('Track Name')[['Streams']]
    .sum()
    .sort_values(by = 'Streams', ascending = False)
)
df_tracks_streams.head(10)

Unnamed: 0_level_0,Streams
Track Name,Unnamed: 1_level_1
Shape of You,1503176575
Despacito - Remix,910422437
Despacito (Featuring Daddy Yankee),717960928
Something Just Like This,693853930
Unforgettable,688520127
HUMBLE.,649063841
rockstar,628662532
I'm the One,617776515
It Ain't Me (with Selena Gomez),586223442
XO TOUR Llif3,578715699


### Top 10 artists (or groups) with the highest stream counts across all their tracks combined, along with the stream counts of each of their tracks.

In [7]:
# Total streams per artist across all rows of df, largest first.
# Only Streams is aggregated: df also holds text and datetime columns, and
# pandas 2.0+ no longer silently drops such columns in groupby().sum().
df_artists_streams = (
    df.groupby('Artist')[['Streams']]
    .sum()
    .sort_values(by = 'Streams', ascending = False)
)
df_artists_streams.head(10)

Unnamed: 0_level_0,Streams
Artist,Unnamed: 1_level_1
Ed Sheeran,4353885528
Drake,2285102445
The Chainsmokers,2081716050
Post Malone,1865412162
Luis Fonsi,1760377876
Kendrick Lamar,1753570898
J Balvin,1251838343
Calvin Harris,1152938608
Imagine Dragons,1145526622
DJ Khaled,1110441889


### Top 10 tracks in December, 2017 for each continent (North America, Europe, Asia, South America, Oceania)

In [8]:
# Keep only the chart rows dated December 2017. The year is checked together
# with the month so that rows from any other December cannot be mixed in.
is_dec_2017 = (df['Date'].dt.year == 2017) & (df['Date'].dt.month == 12)
# .copy() yields an independent frame, so Region can be reassigned below
# without writing into a slice of df.
df_Dec = df[is_dec_2017].copy()
# Upper-case the region codes; the 'index' column of countries (used as the
# merge key) holds upper-case codes.
df_Dec['Region'] = df_Dec['Region'].str.upper()

# Country metadata keyed by country code; reset_index() turns the code into a
# regular 'index' column.
countries = pd.read_json("countries.json", orient = "index").reset_index()
countries_index_continent = countries[['index', 'continent']]
countries.head(10)

Unnamed: 0,index,capital,continent,currency,languages,name,native,phone
0,AD,Andorra la Vella,EU,EUR,[ca],Andorra,Andorra,376
1,AE,Abu Dhabi,AS,AED,[ar],United Arab Emirates,دولة الإمارات العربية المتحدة,971
2,AF,Kabul,AS,AFN,"[ps, uz, tk]",Afghanistan,افغانستان,93
3,AG,Saint John's,,XCD,[en],Antigua and Barbuda,Antigua and Barbuda,1268
4,AI,The Valley,,XCD,[en],Anguilla,Anguilla,1264
5,AL,Tirana,EU,ALL,[sq],Albania,Shqipëria,355
6,AM,Yerevan,AS,AMD,"[hy, ru]",Armenia,Հայաստան,374
7,AO,Luanda,AF,AOA,[pt],Angola,Angola,244
8,AQ,,AN,,[],Antarctica,Antarctica,672
9,AR,Buenos Aires,SA,ARS,"[es, gn]",Argentina,Argentina,54


In [9]:
# Attach the continent code to every December row. A left join keeps rows whose
# region has no match in countries; their continent is NaN.
# NOTE(review): groupby drops NaN keys by default, so such rows vanish from the
# totals below - confirm every region (and the "NA" continent code) maps cleanly.
df_Dec = df_Dec.merge(countries_index_continent, left_on = 'Region', right_on = 'index', how = "left")\
    .drop('index', axis = 1)
# Streams per (continent, track name). Only Streams is aggregated: the frame
# also holds text and datetime columns, and pandas 2.0+ no longer silently
# drops such columns in groupby().sum().
df_Dec_group = df_Dec.groupby(['continent', 'Track Name'])[['Streams']].sum().reset_index()

In [10]:
# Ten most-streamed tracks per continent. The result is indexed by
# (continent, original row label) with Track Name and Streams as columns.
# The value columns are selected before apply(), so the lambda never touches the
# grouping column; relying on apply() to pass it through (and dropping it
# afterwards) is deprecated in recent pandas.
df_Dec_continent_track_top10 = df_Dec_group.groupby('continent')[['Track Name', 'Streams']]\
    .apply(lambda x: x.sort_values('Streams', ascending=False).head(10))

df_Dec_continent_track_top10

Unnamed: 0_level_0,Unnamed: 1_level_0,Track Name,Streams
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AS,388,Havana,14704046
AS,989,Too Good At Goodbyes,12151470
AS,731,Perfect,12126620
AS,1073,Wolves,11745050
AS,670,New Rules,11535430
AS,1043,What Lovers Do (feat. SZA),10880513
AS,733,Perfect Duet (Ed Sheeran & Beyoncé),9004183
AS,555,"Let Me Go (with Alesso, Florida Georgia Line &...",8279975
AS,1109,Young Dumb & Broke,8241744
AS,283,Dusk Till Dawn - Radio Edit,8202373


### Ranking changes of Ed Sheeran's "Shape of You" alongside its stream count changes.

In [11]:
# Rows for Ed Sheeran's "Shape of You" in every region, the global chart included.
# Matching on the artist as well as the title keeps out any other track that
# merely shares the name. Boolean indexing returns a new frame, so
# original_data is not modified and no deep copy of the full table is needed.
is_soy = (original_data['Track Name'] == "Shape of You") & (original_data['Artist'] == "Ed Sheeran")
df_soy = original_data[is_soy]

# Global-chart rows only, in chronological order for plotting.
df_soy_global = df_soy[df_soy['Region'] == "global"].sort_values(by = 'Date', ascending = True)
df_soy_global.head(10)

Unnamed: 0,Position,Track Name,Artist,Streams,URL,Date,Region
3117643,1,Shape of You,Ed Sheeran,6151345,https://open.spotify.com/track/7qiZfU4dY1lWllz...,2017-01-06,global
3117843,1,Shape of You,Ed Sheeran,6602494,https://open.spotify.com/track/7qiZfU4dY1lWllz...,2017-01-07,global
3118043,1,Shape of You,Ed Sheeran,6557531,https://open.spotify.com/track/7qiZfU4dY1lWllz...,2017-01-08,global
3118243,1,Shape of You,Ed Sheeran,7261007,https://open.spotify.com/track/7qiZfU4dY1lWllz...,2017-01-09,global
3118443,1,Shape of You,Ed Sheeran,7145757,https://open.spotify.com/track/7qiZfU4dY1lWllz...,2017-01-10,global
3118643,1,Shape of You,Ed Sheeran,7193934,https://open.spotify.com/track/7qiZfU4dY1lWllz...,2017-01-11,global
3118843,1,Shape of You,Ed Sheeran,7081620,https://open.spotify.com/track/7qiZfU4dY1lWllz...,2017-01-12,global
3119043,1,Shape of You,Ed Sheeran,7275946,https://open.spotify.com/track/7qiZfU4dY1lWllz...,2017-01-13,global
3119243,1,Shape of You,Ed Sheeran,7093868,https://open.spotify.com/track/7qiZfU4dY1lWllz...,2017-01-14,global
3119443,1,Shape of You,Ed Sheeran,6451891,https://open.spotify.com/track/7qiZfU4dY1lWllz...,2017-01-15,global


In [12]:
# Chart position (right-hand axis, y2) and stream count (left-hand axis) of the
# track on the global chart, drawn over a shared date axis.
ranking_plot = go.Scatter(x = df_soy_global['Date'], y = df_soy_global['Position'], name = "ranking", yaxis = "y2")
streams_plot = go.Scatter(x = df_soy_global['Date'], y = df_soy_global['Streams'], name = "streams")
layout = go.Layout(
    title = '"Shape of You" on the global chart: position and streams',
    xaxis = dict(title = "Date"),
    yaxis = dict(title = "Streams"),
    yaxis2 = dict(
        title = "Chart position",
        anchor = "x",
        overlaying = "y",
        side = "right",
        # Position 1 is the best rank, so flip the axis to draw it at the top.
        autorange = "reversed",
    ),
)
fig = go.Figure(data = [ranking_plot, streams_plot], layout = layout)
po.iplot(fig)