In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px 

# Clean up data

In [6]:
df = pd.read_csv('../data/cleaned_global_17-24_Spotify.csv')
df.head()

Unnamed: 0,artist_names,track_name,peak_streams,total_streams,max_peak_rank,min_peak_rank,max_days_on_chart,best_day_streams,first_appearance,uri,...,release_date,album_total_tracks,album_name,spotify_uri,relase-chart_days,chart-best_days,release-best_days,relase-chart_days_bins,is_colab,artist_count
0,The Weeknd,Blinding Lights,8453567,4552239347,8,1,1858,8453567,2019-11-29,spotify:track:0sf12qNH5qcw8qpgymFOqD,...,2019-11-29,1,Blinding Lights,spotify:track:0sf12qNH5qcw8qpgymFOqD,0,112,112,0 days,False,1
1,Harry Styles,As It Was,16103849,3640335748,40,1,1002,16103849,2022-03-31,spotify:track:4LRPiXqCikLlN15c3yImP7,...,2022-03-31,1,As It Was,spotify:track:4LRPiXqCikLlN15c3yImP7,0,1,1,0 days,False,1
2,Lewis Capaldi,Someone You Loved,3974554,3625415177,198,4,2166,3974554,2019-01-07,spotify:track:2TIlqbIneP0ZY1O0EzYLlc,...,2018-11-08,4,Breach,spotify:track:2TIlqbIneP0ZY1O0EzYLlc,60,296,356,4+ weeks,False,1
3,Ed Sheeran,Shape of You,9891056,3607629552,1,1,2407,9891056,2017-01-06,spotify:track:7qiZfU4dY1lWllzX7mPBI3,...,2017-01-06,16,÷ (Deluxe),spotify:track:7qiZfU4dY1lWllzX7mPBI3,0,56,56,0 days,False,1
4,"Post Malone, Swae Lee",Sunflower - Spider-Man: Into the Spider-Verse,5033261,3467020076,196,1,2187,5033261,2018-10-18,spotify:track:1A6OTy97kk0mMdm78rHsm8,...,2018-10-18,1,Sunflower (Spider-Man: Into the Spider-Verse),spotify:track:1A6OTy97kk0mMdm78rHsm8,0,92,92,0 days,True,2


## Fix release_date

In [7]:
df['first_appearance'] = pd.to_datetime(df['first_appearance'])
df['best_day_date'] = pd.to_datetime(df['best_day_date'])

In [8]:
invalid_rows = df[df['release_date'].str.len() != 10]
invalid_rows.head()

Unnamed: 0,artist_names,track_name,peak_streams,total_streams,max_peak_rank,min_peak_rank,max_days_on_chart,best_day_streams,first_appearance,uri,...,release_date,album_total_tracks,album_name,spotify_uri,relase-chart_days,chart-best_days,release-best_days,relase-chart_days_bins,is_colab,artist_count


In [9]:
df.loc[df['release_date'].str.len() < 10, 'release_date']  += '-01-01'
df.loc[df['release_date'].str.len() != 10, 'release_date'] = df['release_date'].str[:-3]

In [10]:
invalid_rows = df[df['release_date'].str.len() != 10]
invalid_rows.head()

Unnamed: 0,artist_names,track_name,peak_streams,total_streams,max_peak_rank,min_peak_rank,max_days_on_chart,best_day_streams,first_appearance,uri,...,release_date,album_total_tracks,album_name,spotify_uri,relase-chart_days,chart-best_days,release-best_days,relase-chart_days_bins,is_colab,artist_count


In [11]:
df['release_date'] = pd.to_datetime(df['release_date'])

In [12]:
df.query('release_date > first_appearance')

Unnamed: 0,artist_names,track_name,peak_streams,total_streams,max_peak_rank,min_peak_rank,max_days_on_chart,best_day_streams,first_appearance,uri,...,release_date,album_total_tracks,album_name,spotify_uri,relase-chart_days,chart-best_days,release-best_days,relase-chart_days_bins,is_colab,artist_count


Set release data to first appearance if first appearance was before release.

In [13]:
df.loc[df['release_date'] > df['first_appearance'], 'release_date'] = df['first_appearance']
df.query('release_date > first_appearance')

Unnamed: 0,artist_names,track_name,peak_streams,total_streams,max_peak_rank,min_peak_rank,max_days_on_chart,best_day_streams,first_appearance,uri,...,release_date,album_total_tracks,album_name,spotify_uri,relase-chart_days,chart-best_days,release-best_days,relase-chart_days_bins,is_colab,artist_count


In [14]:
df['relase-chart_days'] = abs((df['release_date'] - df['first_appearance']).dt.days)
df['chart-best_days'] = abs((df['best_day_date'] - df['first_appearance']).dt.days)
df['release-best_days'] = abs((df['best_day_date'] - df['release_date']).dt.days)
df 

Unnamed: 0,artist_names,track_name,peak_streams,total_streams,max_peak_rank,min_peak_rank,max_days_on_chart,best_day_streams,first_appearance,uri,...,release_date,album_total_tracks,album_name,spotify_uri,relase-chart_days,chart-best_days,release-best_days,relase-chart_days_bins,is_colab,artist_count
0,The Weeknd,Blinding Lights,8453567,4552239347,8,1,1858,8453567,2019-11-29,spotify:track:0sf12qNH5qcw8qpgymFOqD,...,2019-11-29,1,Blinding Lights,spotify:track:0sf12qNH5qcw8qpgymFOqD,0,112,112,0 days,False,1
1,Harry Styles,As It Was,16103849,3640335748,40,1,1002,16103849,2022-03-31,spotify:track:4LRPiXqCikLlN15c3yImP7,...,2022-03-31,1,As It Was,spotify:track:4LRPiXqCikLlN15c3yImP7,0,1,1,0 days,False,1
2,Lewis Capaldi,Someone You Loved,3974554,3625415177,198,4,2166,3974554,2019-01-07,spotify:track:2TIlqbIneP0ZY1O0EzYLlc,...,2018-11-08,4,Breach,spotify:track:2TIlqbIneP0ZY1O0EzYLlc,60,296,356,4+ weeks,False,1
3,Ed Sheeran,Shape of You,9891056,3607629552,1,1,2407,9891056,2017-01-06,spotify:track:7qiZfU4dY1lWllzX7mPBI3,...,2017-01-06,16,÷ (Deluxe),spotify:track:7qiZfU4dY1lWllzX7mPBI3,0,56,56,0 days,False,1
4,"Post Malone, Swae Lee",Sunflower - Spider-Man: Into the Spider-Verse,5033261,3467020076,196,1,2187,5033261,2018-10-18,spotify:track:1A6OTy97kk0mMdm78rHsm8,...,2018-10-18,1,Sunflower (Spider-Man: Into the Spider-Verse),spotify:track:1A6OTy97kk0mMdm78rHsm8,0,92,92,0 days,True,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9100,"Kygo, Maty Noyes",Stay (feat. Maty Noyes),347597,347597,185,185,1,347597,2017-01-01,spotify:track:2FiSTH0GYpIioUgjfzMIja,...,2016-05-13,15,Cloud Nine,spotify:track:2FiSTH0GYpIioUgjfzMIja,233,0,233,4+ weeks,True,2
9101,"Skrillex, Rick Ross",Purple Lamborghini (with Rick Ross),343807,343807,187,187,1,343807,2017-01-01,spotify:track:6JyuJFedEvPmdWQW0PkbGJ,...,2016-07-22,1,Purple Lamborghini (with Rick Ross),spotify:track:6JyuJFedEvPmdWQW0PkbGJ,163,0,163,4+ weeks,True,2
9102,M.I.A.,Paper Planes,341003,341003,192,192,1,341003,2017-01-01,spotify:track:1kusepF3AacIEtUTYrw4GV,...,2007-01-01,12,Kala,spotify:track:1kusepF3AacIEtUTYrw4GV,3653,0,3653,4+ weeks,False,1
9103,Maroon 5,Sugar,335115,335115,196,196,1,335115,2017-01-02,spotify:track:494OU6M7NOf4ICYb4zWCf5,...,2015-05-18,16,V (Deluxe),spotify:track:494OU6M7NOf4ICYb4zWCf5,595,0,595,4+ weeks,False,1


Add baskets to df so Analysis is easier 

In [15]:

conditions = [
    (df['relase-chart_days'] == 0),
    (df['relase-chart_days'] <= 7),
    (df['relase-chart_days'] <= 14),
    (df['relase-chart_days'] <= 21),
    (df['relase-chart_days'] <= 28),
    (df['relase-chart_days'] > 28),
]

labels = ['0 days', '<1 week', '1-2 weeks', '2-3 weeks', '3-4 weeks', '4+ weeks']

df['relase-chart_days_bins'] = np.select(conditions, labels)

df.head()



Unnamed: 0,artist_names,track_name,peak_streams,total_streams,max_peak_rank,min_peak_rank,max_days_on_chart,best_day_streams,first_appearance,uri,...,release_date,album_total_tracks,album_name,spotify_uri,relase-chart_days,chart-best_days,release-best_days,relase-chart_days_bins,is_colab,artist_count
0,The Weeknd,Blinding Lights,8453567,4552239347,8,1,1858,8453567,2019-11-29,spotify:track:0sf12qNH5qcw8qpgymFOqD,...,2019-11-29,1,Blinding Lights,spotify:track:0sf12qNH5qcw8qpgymFOqD,0,112,112,0 days,False,1
1,Harry Styles,As It Was,16103849,3640335748,40,1,1002,16103849,2022-03-31,spotify:track:4LRPiXqCikLlN15c3yImP7,...,2022-03-31,1,As It Was,spotify:track:4LRPiXqCikLlN15c3yImP7,0,1,1,0 days,False,1
2,Lewis Capaldi,Someone You Loved,3974554,3625415177,198,4,2166,3974554,2019-01-07,spotify:track:2TIlqbIneP0ZY1O0EzYLlc,...,2018-11-08,4,Breach,spotify:track:2TIlqbIneP0ZY1O0EzYLlc,60,296,356,4+ weeks,False,1
3,Ed Sheeran,Shape of You,9891056,3607629552,1,1,2407,9891056,2017-01-06,spotify:track:7qiZfU4dY1lWllzX7mPBI3,...,2017-01-06,16,÷ (Deluxe),spotify:track:7qiZfU4dY1lWllzX7mPBI3,0,56,56,0 days,False,1
4,"Post Malone, Swae Lee",Sunflower - Spider-Man: Into the Spider-Verse,5033261,3467020076,196,1,2187,5033261,2018-10-18,spotify:track:1A6OTy97kk0mMdm78rHsm8,...,2018-10-18,1,Sunflower (Spider-Man: Into the Spider-Verse),spotify:track:1A6OTy97kk0mMdm78rHsm8,0,92,92,0 days,True,2


In [16]:
bin_counts = df['relase-chart_days_bins'].value_counts().reindex(
    ['0 days', '<1 week', '1-2 weeks', '2-3 weeks', '3-4 weeks', '4+ weeks']
)

fig = px.pie(
    names=bin_counts.index,
    values=bin_counts.values,
    title='Track Counts per Release Age Group',
    color=bin_counts.index,
    color_discrete_sequence=px.colors.qualitative.Vivid
)

fig.update_traces(
    textinfo='percent+label',
)

fig.update_layout(
    template="plotly_dark"
)

fig.show()


In [17]:

mean_days_per_bin = df.groupby('relase-chart_days_bins')['max_days_on_chart'].mean().reindex(
    ['0 days', '<1 week', '1-2 weeks', '2-3 weeks', '3-4 weeks', '4+ weeks']
)

overall_mean = df['max_days_on_chart'].mean()

fig = px.bar(
    x=mean_days_per_bin.index,
    y=mean_days_per_bin.values,
    text=mean_days_per_bin.round(1),
    labels={'x': 'Time Baskets', 'y': 'Average Max Days on Chart'},
    title='Average Max Days on Chart per Time Basket',
    color=mean_days_per_bin.index,
    color_discrete_sequence=px.colors.qualitative.Vivid
)

fig.add_hline(
    y=overall_mean,
    line_dash="dash",
    line_color="red",
    annotation_text=f"Overall Mean: {overall_mean:.1f}",
    annotation_position="top left"
)

fig.update_traces(textposition='outside')
fig.update_layout(
    xaxis_title="Time Baskets",
    yaxis_title="Average Max Days on Chart",
    template="plotly_dark",
)

fig.show()


In [18]:

mean_days_per_bin = df.groupby('relase-chart_days_bins')['max_days_on_chart'].median().reindex(
    ['0 days', '<1 week', '1-2 weeks', '2-3 weeks', '3-4 weeks', '4+ weeks']
)

overall_mean = df['max_days_on_chart'].median()

fig = px.bar(
    x=mean_days_per_bin.index,
    y=mean_days_per_bin.values,
    text=mean_days_per_bin.round(1),
    labels={'x': 'Time Baskets', 'y': 'Average Max Days on Chart'},
    title='Average Max Days on Chart per Time Basket',
    color=mean_days_per_bin.index,
    color_discrete_sequence=px.colors.qualitative.Vivid
)

fig.add_hline(
    y=overall_mean,
    line_dash="dash",
    line_color="red",
    annotation_text=f"Overall Mean: {overall_mean:.1f}",
    annotation_position="top left"
)

fig.update_traces(textposition='outside')
fig.update_layout(
    xaxis_title="Time Baskets",
    yaxis_title="Average Max Days on Chart",
    template="plotly_dark",
)

fig.show()


In [19]:
mean_days_per_bin = df.groupby('relase-chart_days_bins')['total_streams'].median().reindex(
    ['0 days', '<1 week', '1-2 weeks', '2-3 weeks', '3-4 weeks', '4+ weeks']
)

overall_mean = df['total_streams'].median()

fig = px.bar(
    x=mean_days_per_bin.index,
    y=mean_days_per_bin.values,
    text=mean_days_per_bin.round(1),
    labels={'x': 'Time Baskets', 'y': 'Average Max Days on Chart'},
    title='Average Min Peak Rank per Time Basket',
    color=mean_days_per_bin.index,
    color_discrete_sequence=px.colors.qualitative.Vivid
)

fig.add_hline(
    y=overall_mean,
    line_dash="dash",
    line_color="red",
    annotation_text=f"Overall Mean: {overall_mean:.1f}",
    annotation_position="top left"
)

fig.update_traces(textposition='outside')
fig.update_layout(
    xaxis_title="Time Baskets",
    yaxis_title="Average Max Days on Chart",
    template="plotly_dark",
)

fig.show()


In [20]:
fig = px.violin(
    df,
    x='relase-chart_days_bins',
    y='max_days_on_chart',
    box=True,  
    points='all',  
    color='relase-chart_days_bins',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    title='Distribution of Max Days on Chart per Time Basket (with Boxplot and Points)',
)

overall_median = df['max_days_on_chart'].median()
fig.add_hline(
    y=overall_median,
    line_dash="dash",
    line_color="red",
    annotation_text=f"Overall Median: {overall_median:.1f}",
    annotation_position="top left"
)

fig.update_layout(
    xaxis_title="Time Baskets",
    yaxis_title="Max Days on Chart",
    yaxis=dict(range=[0, 1000]),  
    template="plotly_dark",
)

fig.show()


In [21]:
df['is_colab'] = df['artist_names'].str.contains(',')
df['artist_count'] = df['artist_names'].str.count(',') + 1
df['artist_count'] = np.where(df['artist_count'] > 3, 4, df['artist_count'])
df

Unnamed: 0,artist_names,track_name,peak_streams,total_streams,max_peak_rank,min_peak_rank,max_days_on_chart,best_day_streams,first_appearance,uri,...,release_date,album_total_tracks,album_name,spotify_uri,relase-chart_days,chart-best_days,release-best_days,relase-chart_days_bins,is_colab,artist_count
0,The Weeknd,Blinding Lights,8453567,4552239347,8,1,1858,8453567,2019-11-29,spotify:track:0sf12qNH5qcw8qpgymFOqD,...,2019-11-29,1,Blinding Lights,spotify:track:0sf12qNH5qcw8qpgymFOqD,0,112,112,0 days,False,1
1,Harry Styles,As It Was,16103849,3640335748,40,1,1002,16103849,2022-03-31,spotify:track:4LRPiXqCikLlN15c3yImP7,...,2022-03-31,1,As It Was,spotify:track:4LRPiXqCikLlN15c3yImP7,0,1,1,0 days,False,1
2,Lewis Capaldi,Someone You Loved,3974554,3625415177,198,4,2166,3974554,2019-01-07,spotify:track:2TIlqbIneP0ZY1O0EzYLlc,...,2018-11-08,4,Breach,spotify:track:2TIlqbIneP0ZY1O0EzYLlc,60,296,356,4+ weeks,False,1
3,Ed Sheeran,Shape of You,9891056,3607629552,1,1,2407,9891056,2017-01-06,spotify:track:7qiZfU4dY1lWllzX7mPBI3,...,2017-01-06,16,÷ (Deluxe),spotify:track:7qiZfU4dY1lWllzX7mPBI3,0,56,56,0 days,False,1
4,"Post Malone, Swae Lee",Sunflower - Spider-Man: Into the Spider-Verse,5033261,3467020076,196,1,2187,5033261,2018-10-18,spotify:track:1A6OTy97kk0mMdm78rHsm8,...,2018-10-18,1,Sunflower (Spider-Man: Into the Spider-Verse),spotify:track:1A6OTy97kk0mMdm78rHsm8,0,92,92,0 days,True,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9100,"Kygo, Maty Noyes",Stay (feat. Maty Noyes),347597,347597,185,185,1,347597,2017-01-01,spotify:track:2FiSTH0GYpIioUgjfzMIja,...,2016-05-13,15,Cloud Nine,spotify:track:2FiSTH0GYpIioUgjfzMIja,233,0,233,4+ weeks,True,2
9101,"Skrillex, Rick Ross",Purple Lamborghini (with Rick Ross),343807,343807,187,187,1,343807,2017-01-01,spotify:track:6JyuJFedEvPmdWQW0PkbGJ,...,2016-07-22,1,Purple Lamborghini (with Rick Ross),spotify:track:6JyuJFedEvPmdWQW0PkbGJ,163,0,163,4+ weeks,True,2
9102,M.I.A.,Paper Planes,341003,341003,192,192,1,341003,2017-01-01,spotify:track:1kusepF3AacIEtUTYrw4GV,...,2007-01-01,12,Kala,spotify:track:1kusepF3AacIEtUTYrw4GV,3653,0,3653,4+ weeks,False,1
9103,Maroon 5,Sugar,335115,335115,196,196,1,335115,2017-01-02,spotify:track:494OU6M7NOf4ICYb4zWCf5,...,2015-05-18,16,V (Deluxe),spotify:track:494OU6M7NOf4ICYb4zWCf5,595,0,595,4+ weeks,False,1


In [22]:
median_streams = df.groupby('artist_count')['total_streams'].median().reset_index()

fig = px.bar(
    median_streams,
    x='artist_count',     
    y='total_streams',    
    title="Median Total Streams by Artist Count",
    labels={'artist_count': 'Number of Artists', 'total_streams': 'Median Total Streams'},
    color='artist_count', 
    color_discrete_sequence=px.colors.qualitative.Vivid
)
fig.update_traces(textposition='outside')
fig.update_layout(
    xaxis_title="Artist Count",
    yaxis_title="Median Total Streams",
    template="plotly_dark",
)

fig.show()


In [23]:
import plotly.express as px

median_streams = df.groupby('is_colab')['total_streams'].median().reset_index()

fig = px.bar(
    median_streams,
    x='is_colab',
    y='total_streams',
    title="Median Total Streams by Collaboration Status",
    labels={'is_colab': 'Collaboration', 'total_streams': 'Median Total Streams'},
    color='is_colab', 
    color_discrete_sequence=px.colors.qualitative.Vivid
)

fig.update_layout(
    xaxis_title="Collaboration",
    yaxis_title="Median Total Streams",
    template="plotly_dark"
)

fig.show()


In [24]:

colab_counts = df['is_colab'].value_counts().reset_index()
colab_counts.columns = ['is_colab', 'count']

fig = px.pie(
    colab_counts,
    names='is_colab',
    values='count',
    title="Collaboration Count Breakdown",
    color_discrete_sequence=px.colors.qualitative.Vivid
)

fig.update_traces(textinfo='percent+label')
fig.update_layout(template="plotly_dark")

fig.show()


In [25]:
df.to_csv('cleaned_global_17-19.csv', index=False)

In [26]:
df = pd.read_csv('/Users/milanmuntenjon/Desktop/data-science-project/data/cleaned_global_17-24_Spotify.csv')

In [27]:
df_colab = df.query('is_colab == True')

In [28]:
import pandas as pd
import networkx as nx
import plotly.graph_objects as go
from itertools import combinations


edges = []
for artists in df_colab["artist_names"]:
    artist_list = [artist.strip() for artist in artists.split(",")]  
    if len(artist_list) > 1:
        edges.extend(combinations(artist_list, 2))  
print(edges)

[('Post Malone', 'Swae Lee'), ('The Kid LAROI', 'Justin Bieber'), ('Billie Eilish', 'Khalid'), ('The Weeknd', 'Daft Punk'), ('Post Malone', '21 Savage'), ('Shawn Mendes', 'Camila Cabello'), ('Elton John', 'Dua Lipa'), ('Elton John', 'PNAU'), ('Dua Lipa', 'PNAU'), ('The Chainsmokers', 'Coldplay'), ('David Guetta', 'Bebe Rexha'), ('Lady Gaga', 'Bradley Cooper'), ('Lil Nas X', 'Jack Harlow'), ('Bad Bunny', 'Chencho Corleone'), ('Drake', 'Wizkid'), ('Drake', 'Kyla'), ('Wizkid', 'Kyla'), ('Dua Lipa', 'DaBaby'), ('Bad Bunny', 'JHAYCO'), ('Bizarrap', 'Quevedo'), ('The Weeknd', 'JENNIE'), ('The Weeknd', 'Lily-Rose Depp'), ('JENNIE', 'Lily-Rose Depp'), ('SAINt JHN', 'Imanbek'), ('24kGoldn', 'iann dior'), ('Lady Gaga', 'Bruno Mars'), ('FloyyMenor', 'Cris Mj'), ('Sam Smith', 'Kim Petras'), ('Doja Cat', 'SZA'), ('Marshmello', 'Bastille'), ('Tyler', 'The Creator'), ('Tyler', 'Kali Uchis'), ('The Creator', 'Kali Uchis'), ('Rema', 'Selena Gomez'), ('Camila Cabello', 'Young Thug'), ('Metro Boomin', 'T

In [29]:
print(len(edges))

8876


In [30]:
import pandas as pd
import networkx as nx
import plotly.graph_objects as go
from itertools import combinations
from pyvis.network import Network

G = nx.Graph()
G.add_edges_from(edges)

important_nodes = [node for node in G.nodes if G.degree(node) >= 3]
G_filtered = G.subgraph(important_nodes)

from networkx.algorithms.community import greedy_modularity_communities
communities = greedy_modularity_communities(G_filtered)
community_dict = {}
for i, community in enumerate(communities):
    for node in community:
        community_dict[node] = i

net = Network(notebook=True, height='1000px', width='100%', bgcolor='#222222', font_color='white', cdn_resources='remote')
net.force_atlas_2based(gravity=-50, central_gravity=0.01, spring_length=200, spring_strength=0.05, damping=0.4)

for node in G_filtered.nodes:
    net.add_node(node, title=node, size=10 + 5 * G_filtered.degree(node), color=f'hsl({community_dict[node] * 30}, 100%, 50%)')

for edge in G_filtered.edges:
    net.add_edge(edge[0], edge[1], color='gray')

net.show("artist_collab_network.html")


artist_collab_network.html


network on collabirating artist: