In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px 
import networkx as nx
import plotly.graph_objects as go
from itertools import combinations
from pyvis.network import Network

# This notebook is an EDA for the research question(s): <br> How do collaborations of music artists influence the success of their songs in comparison to solo songs on Spotify? <br> How long does it take for a song to enter the charts after its release, and is there a connection to the length of time the song stays in the charts on Spotify?

## 

### 1.) Clean up & transformation

In [3]:
# Load the cleaned chart export and parse the two chart-event columns as datetimes.
df = pd.read_csv('../data/cleaned_global_17-24_Spotify.csv')
for event_col in ('first_appearance', 'best_day_date'):
    df[event_col] = pd.to_datetime(df[event_col])

# Normalise the release_date strings to 10 characters before parsing them.
# Step 1: every value shorter than 10 characters gets '-01-01' appended
#         (author's note: these are values that only contained the year).
too_short = df['release_date'].str.len() < 10
df.loc[too_short, 'release_date'] = df.loc[too_short, 'release_date'] + '-01-01'

# Step 2: every value whose length is still not 10 loses its last three characters
#         (author's note: a "(1)" suffix left over from duplicated files).
# The order matters: step 2 also sees the strings produced by step 1.
not_ten_chars = df['release_date'].str.len() != 10
df.loc[not_ten_chars, 'release_date'] = df['release_date'].str[:-3]

df['release_date'] = pd.to_datetime(df['release_date'])

# Where release_date is later than first_appearance, overwrite it with first_appearance.
released_after_charting = df['release_date'] > df['first_appearance']
df.loc[released_after_charting, 'release_date'] = df['first_appearance']

# Absolute day differences between release, first chart appearance and best chart day.
# NOTE: 'relase-chart_days' keeps its original spelling because later cells read that name.
release_to_chart = df['release_date'] - df['first_appearance']
chart_to_best = df['best_day_date'] - df['first_appearance']
release_to_best = df['best_day_date'] - df['release_date']

df['relase-chart_days'] = abs(release_to_chart.dt.days)
df['chart-best_days'] = abs(chart_to_best.dt.days)
df['release-best_days'] = abs(release_to_best.dt.days)

In [4]:
# Bucket the release-to-chart delay into labelled intervals for the charts below.
# np.select uses the FIRST matching condition, so each `<=` bound only catches
# the rows that the earlier, tighter bounds did not already claim.
conditions = [
    (df['relase-chart_days'] == 0),
    (df['relase-chart_days'] <= 7),
    (df['relase-chart_days'] <= 14),
    (df['relase-chart_days'] <= 21),
    (df['relase-chart_days'] <= 28),
    (df['relase-chart_days'] > 28),
]

labels = ['0 days', '<1 week', '1-2 weeks', '2-3 weeks', '3-4 weeks', '4+ weeks']

# Rows matching no condition (e.g. a missing day difference, where every comparison
# is False) receive the default. np.select's implicit default is the integer 0, which
# cannot be combined with string labels on recent NumPy releases (TypeError) and
# produced a misleading '0' label on older ones, so an explicit string is passed.
df['relase-chart_days_bins'] = np.select(conditions, labels, default='unknown')




### 2.) Visualisations

In [5]:
# Donut chart: number of tracks per release-to-chart interval, in interval order.
interval_order = ['0 days', '<1 week', '1-2 weeks', '2-3 weeks', '3-4 weeks', '4+ weeks']
bin_counts = df['relase-chart_days_bins'].value_counts().reindex(interval_order)

fig = px.pie(
    names=bin_counts.index,
    values=bin_counts.values,
    color=bin_counts.index,
    color_discrete_sequence=px.colors.qualitative.Vivid,
    title='Track Counts per Release Age Group',
)

# hole=0.6 turns the pie into a donut; each slice shows its share and its label.
fig.update_traces(hole=0.6, textinfo='percent+label')
fig.update_layout(template="plotly_dark")

fig.show()


In [6]:
# Bar chart: mean of max_days_on_chart for each release-to-chart interval,
# with the overall mean drawn as a dashed reference line.
interval_order = ['0 days', '<1 week', '1-2 weeks', '2-3 weeks', '3-4 weeks', '4+ weeks']

days_by_interval = df.groupby('relase-chart_days_bins')['max_days_on_chart']
mean_days_per_bin = days_by_interval.mean().reindex(interval_order)
overall_mean = df['max_days_on_chart'].mean()

axis_labels = {'x': 'Time Baskets', 'y': 'Average Max Days on Chart'}

fig = px.bar(
    x=mean_days_per_bin.index,
    y=mean_days_per_bin.values,
    text=mean_days_per_bin.round(1),
    color=mean_days_per_bin.index,
    color_discrete_sequence=px.colors.qualitative.Vivid,
    labels=axis_labels,
    title='Average Max Days on Chart per Time Basket',
)

fig.add_hline(
    y=overall_mean,
    line_color="red",
    line_dash="dash",
    annotation_position="top left",
    annotation_text=f"Overall Mean: {overall_mean:.1f}",
)

# Put the rounded value above each bar instead of inside it.
fig.update_traces(textposition='outside')
fig.update_layout(
    template="plotly_dark",
    xaxis_title="Time Baskets",
    yaxis_title="Average Max Days on Chart",
)

fig.show()


In [7]:
# Bar chart: MEDIAN of max_days_on_chart for each release-to-chart interval,
# with the overall median drawn as a dashed reference line.
# The title, axis labels and annotation previously said "Average"/"Mean" although
# the values are medians; all texts and variable names now say "Median".
median_days_per_bin = df.groupby('relase-chart_days_bins')['max_days_on_chart'].median().reindex(
    ['0 days', '<1 week', '1-2 weeks', '2-3 weeks', '3-4 weeks', '4+ weeks']
)

overall_median = df['max_days_on_chart'].median()

fig = px.bar(
    x=median_days_per_bin.index,
    y=median_days_per_bin.values,
    text=median_days_per_bin.round(1),
    labels={'x': 'Time Baskets', 'y': 'Median Max Days on Chart'},
    title='Median Max Days on Chart per Time Basket',
    color=median_days_per_bin.index,
    color_discrete_sequence=px.colors.qualitative.Vivid
)

fig.add_hline(
    y=overall_median,
    line_dash="dash",
    line_color="red",
    annotation_text=f"Overall Median: {overall_median:.1f}",
    annotation_position="top left"
)

# Put the rounded value above each bar instead of inside it.
fig.update_traces(textposition='outside')
fig.update_layout(
    xaxis_title="Time Baskets",
    yaxis_title="Median Max Days on Chart",
    template="plotly_dark",
)

fig.show()


In [8]:
# Bar chart: MEDIAN of total_streams for each release-to-chart interval,
# with the overall median drawn as a dashed reference line.
# The texts were copied from the days-on-chart plots ("Average Min Peak Rank",
# "Average Max Days on Chart", "Overall Mean"); they now describe the plotted data.
median_streams_per_bin = df.groupby('relase-chart_days_bins')['total_streams'].median().reindex(
    ['0 days', '<1 week', '1-2 weeks', '2-3 weeks', '3-4 weeks', '4+ weeks']
)

overall_median_streams = df['total_streams'].median()

fig = px.bar(
    x=median_streams_per_bin.index,
    y=median_streams_per_bin.values,
    text=median_streams_per_bin.round(1),
    labels={'x': 'Time Baskets', 'y': 'Median Total Streams'},
    title='Median Total Streams per Time Basket',
    color=median_streams_per_bin.index,
    color_discrete_sequence=px.colors.qualitative.Vivid
)

fig.add_hline(
    y=overall_median_streams,
    line_dash="dash",
    line_color="red",
    annotation_text=f"Overall Median: {overall_median_streams:.1f}",
    annotation_position="top left"
)

# Put the rounded value above each bar instead of inside it.
fig.update_traces(textposition='outside')
fig.update_layout(
    xaxis_title="Time Baskets",
    yaxis_title="Median Total Streams",
    template="plotly_dark",
)

fig.show()


In [None]:
# Violin chart: distribution of max_days_on_chart per release-to-chart interval,
# with an embedded box plot, every data point drawn, and the overall median as a
# dashed reference line.
bin_col = 'relase-chart_days_bins'
bin_order = ['0 days', '<1 week', '1-2 weeks', '2-3 weeks', '3-4 weeks', '4+ weeks']

fig = px.violin(
    df,
    x=bin_col,
    y='max_days_on_chart',
    color=bin_col,
    color_discrete_sequence=px.colors.qualitative.Vivid,
    category_orders={bin_col: bin_order},
    box=True,
    points='all',
    title='Distribution of "Days in Chart" per Release-Interval',
)

overall_median = df['max_days_on_chart'].median()
fig.add_hline(
    y=overall_median,
    line_color="red",
    line_dash="dash",
    annotation_position="top left",
    annotation_text=f"Overall Median: {overall_median:.1f}",
)

fig.update_layout(
    template="plotly_dark",
    xaxis_title="Release until First Chart Appearance Interval",
    yaxis_title="Days in Chart",
    # The y-axis view is limited to 0-200; values above that are outside the visible range.
    yaxis=dict(range=[0, 200]),
)

fig.show()


In [None]:
# Poster version of the violin chart: same data as the dark-theme violin plot, but
# with no plot title, no legend, no margins, a transparent background and bold
# black axes.
bin_col = 'relase-chart_days_bins'
bin_order = ['0 days', '<1 week', '1-2 weeks', '2-3 weeks', '3-4 weeks', '4+ weeks']

fig = px.violin(
    df,
    x=bin_col,
    y='max_days_on_chart',
    color=bin_col,
    color_discrete_sequence=px.colors.qualitative.Vivid,
    category_orders={bin_col: bin_order},
    box=True,
    points='all',
)

overall_median = df['max_days_on_chart'].median()
fig.add_hline(
    y=overall_median,
    line_color="red",
    line_dash="dash",
    annotation_position="top left",
    annotation_text=f"Overall Median: {overall_median:.1f}",
)

# Settings shared by both axes: visible axis line, black grid and zero line.
axis_frame = dict(
    showline=True,
    linecolor='black',
    gridcolor='black',
    zerolinecolor='black',
)
# Font used for both axis titles.
axis_title_font = dict(size=16, family="Arial", color="black", weight="bold")
transparent = 'rgba(0, 0, 0, 0)'

fig.update_layout(
    showlegend=False,
    template='plotly',
    xaxis=dict(
        **axis_frame,
        tickangle=-30,
        title=dict(
            text="Release until First Chart Appearance Interval",
            font=dict(axis_title_font),
        ),
    ),
    yaxis=dict(
        **axis_frame,
        range=[0, 200],
        dtick=50,
        title=dict(
            text="Days on Chart",
            font=dict(axis_title_font),
        ),
    ),
    font=dict(size=14, color='black', weight="bold"),
    margin=dict(l=0, r=0, t=0, b=0),
    plot_bgcolor=transparent,
    paper_bgcolor=transparent,
)

fig.show()


In [None]:
# Derive collaboration features from the comma-separated artist_names column:
#   is_colab     - True when the name string contains a comma
#   artist_count - number of commas + 1, with every value above 3 stored as 4
artist_names = df['artist_names']
df['is_colab'] = artist_names.str.contains(',')

artists_per_track = artist_names.str.count(',') + 1
df['artist_count'] = np.where(artists_per_track > 3, 4, artists_per_track)

In [None]:
# not in use
# Bar chart: median total_streams per number of credited artists.

median_streams = df.groupby('artist_count')['total_streams'].median().reset_index()

# artist_count is numeric, and plotly express maps numeric colour columns to a
# continuous colour scale, which silently ignores color_discrete_sequence.
# Converting the (local) group keys to strings makes the colours discrete.
# The bucket 4 holds every track with MORE than 3 artists, so it is labelled '4+'.
median_streams['artist_count'] = [
    '4+' if count >= 4 else str(int(count))
    for count in median_streams['artist_count']
]

fig = px.bar(
    median_streams,
    x='artist_count',
    y='total_streams',
    title="Median Total Streams by Artist Count",
    labels={'artist_count': 'Number of Artists', 'total_streams': 'Median Total Streams'},
    color='artist_count',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    # Without bar text the textposition setting below has nothing to place.
    text_auto='.3s',
)
fig.update_traces(textposition='outside')
fig.update_layout(
    xaxis_title="Artist Count",
    yaxis_title="Median Total Streams",
    template="plotly_dark",
)

fig.show()


In [None]:
# Bar chart: median total_streams for solo tracks vs. collaborations.
streams_by_collab = df.groupby('is_colab')['total_streams']
median_streams = streams_by_collab.median().reset_index()

fig = px.bar(
    median_streams,
    x='is_colab',
    y='total_streams',
    color='is_colab',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    labels={'is_colab': 'Collaboration', 'total_streams': 'Median Total Streams'},
    title="Median Total Streams by Collaboration Status",
)

# Relabel the boolean x-axis ticks with readable names.
collab_axis = dict(
    title="Collaboration",
    tickmode='array',
    tickvals=[False, True],
    ticktext=['Solo', 'Collab'],
)
fig.update_layout(
    template="plotly_dark",
    xaxis=collab_axis,
    yaxis_title="Median Total Streams",
)

fig.show()


In [None]:
# Pie chart: share of solo tracks vs. collaborations.
colab_counts = df['is_colab'].value_counts().reset_index()
colab_counts.columns = ['is_colab', 'count']

fig = px.pie(
    colab_counts,
    values='count',
    names='is_colab',
    template="plotly_dark",
    color_discrete_sequence=px.colors.qualitative.Vivid,
    title="Collaboration Count Breakdown",
)

# Swap the raw True/False slice labels for readable names; any other value
# falls back to its string form.
label_map = {True: 'Collab', False: 'Solo'}
pie_trace = fig.data[0]
pie_trace.labels = [label_map.get(raw, str(raw)) for raw in pie_trace.labels]

fig.update_traces(textinfo='percent+label')

fig.show()


### 2.2) Create the HTML page for the collaboration network

In [None]:


# Build the edge list of the collaboration network: one (artist, artist) tuple for
# every pair of artists credited on the same track.

# `df_colab` was not defined by any earlier cell (NameError on a fresh kernel), so
# it is derived here: all rows whose artist_names contains a comma. na=False
# excludes rows with a missing artist_names value, which have no string to split.
df_colab = df[df['artist_names'].str.contains(',', na=False)]

edges = []
for artists in df_colab["artist_names"]:
    # Skip blank names (e.g. from a trailing comma) so they do not become nodes.
    artist_list = [name.strip() for name in artists.split(",") if name.strip()]
    if len(artist_list) > 1:
        edges.extend(combinations(artist_list, 2))

# Show a summary and a small sample instead of printing the whole edge list.
print(f"{len(edges)} collaboration edges, first 5: {edges[:5]}")

In [None]:
# An LLM helped with this graph.
#
# Builds the collaboration graph from `edges`, keeps only artists with at least
# three connections, colours them by detected community and writes the interactive
# network to artist_collab_network.html (the page shown in the IFrame).
from networkx.algorithms.community import greedy_modularity_communities

G = nx.Graph()
G.add_edges_from(edges)

# Keep nodes whose degree in the FULL graph is >= 3.
important_nodes = [node for node, degree in G.degree() if degree >= 3]
G_filtered = G.subgraph(important_nodes)

# Map every remaining node to the index of the community it was assigned to.
communities = greedy_modularity_communities(G_filtered)
community_dict = {
    node: community_idx
    for community_idx, members in enumerate(communities)
    for node in members
}

net = Network(
    notebook=True,
    height='1000px',
    width='100%',
    bgcolor='#222222',
    font_color='white',
    cdn_resources='remote',
)
net.force_atlas_2based(
    gravity=-50,
    central_gravity=0.01,
    spring_length=200,
    spring_strength=0.05,
    damping=0.4,
)

# Node size grows with the degree inside the filtered graph; the hue advances by
# 30 degrees per community index.
for node in G_filtered.nodes:
    node_size = 10 + 5 * G_filtered.degree(node)
    node_color = f'hsl({community_dict[node] * 30}, 100%, 50%)'
    net.add_node(node, title=node, size=node_size, color=node_color)

for source, target in G_filtered.edges:
    net.add_edge(source, target, color='gray')

net.show("artist_collab_network.html")


artist_collab_network.html


Network of collaborating artists: