# **Cross-platform Analysis of Most Streamed Spotify Songs 2023**
This analysis looks at the most streamed Spotify songs of 2023 and how they perform across different music streaming platforms.

### 1. Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn3, venn3_circles
import seaborn as sns
import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

### 2. Load the dataset

In [None]:
# Read the CSV with latin-1 decoding and preview the first rows
df = pd.read_csv(
    "/kaggle/input/top-spotify-songs-2023/spotify-2023.csv",
    encoding='latin-1',
)
df.head()

In [None]:
# df.shape is (rows, columns); unpack it straight into the message
print("The dataset contains {} rows and {} columns.".format(*df.shape))

In [None]:
# Print a summary of the DataFrame (columns, non-null counts and dtypes)
# so that columns with missing values or unexpected dtypes are easy to spot.
df.info()

### 3. Data Cleaning

In [None]:
# Count the missing entries in every column
df.isna().sum()

**Missing values in columns:**
* in_shazam_charts = 50 
* key = 95

In [None]:
# Fill the gaps column by column in a single call:
#   * 'key' holds text, so missing entries get the placeholder label 'missing_key'
#   * 'in_shazam_charts' is meant to be numeric, so missing entries become 0
df = df.fillna(value={'key': "missing_key", 'in_shazam_charts': 0})

# Confirm that no missing values remain
df.isna().sum()

Now we don't have any missing values.

**Remove duplicate songs**


In [None]:
# A song is treated as a duplicate when both its title and its artist(s) repeat.
# Only the last expression of a cell is rendered, so the count is printed
# explicitly; as a bare expression on the first line it was silently discarded.
duplicate_count = df.duplicated(subset=['track_name', 'artist(s)_name']).sum()
print(f"Number of duplicate rows: {duplicate_count}")

# keep=False flags every occurrence, so both copies of each duplicated song are shown
df[df.duplicated(subset=['track_name', 'artist(s)_name'], keep=False)]

In [None]:
# Remove duplicates based on 'track_name' and 'artist(s)_name', keeping the first
# occurrence. .copy() yields an independent frame so that later column
# assignments act on their own data rather than on a slice of the raw frame.
df_cleaned = df.drop_duplicates(subset=['track_name', 'artist(s)_name'], keep='first').copy()

# Every later cell reads from `df`, so point it at the de-duplicated frame;
# otherwise the rows dropped above would still be part of the whole analysis.
# Re-running this cell is harmless: de-duplicating twice changes nothing.
df = df_cleaned

# Print the number of rows after removing duplicates
print(f"Number of rows after removing duplicates: {df_cleaned.shape[0]}")

**Data type conversion**

Converting specific columns to numeric data type (float64 or int64), coercing any errors encountered (such as non-numeric strings) to NaN (Not a Number).

In [None]:
# Columns that should hold numbers but may have been read as text
numeric_columns = ['streams', 'in_deezer_playlists', 'in_shazam_charts']

for column in numeric_columns:
    # Strip thousands separators (e.g. "1,234") from string entries first:
    # pd.to_numeric cannot parse them, so errors='coerce' would otherwise turn
    # valid large counts into NaN. Non-string entries are passed through as-is.
    without_separators = df[column].map(
        lambda value: value.replace(',', '') if isinstance(value, str) else value
    )
    # Whatever is still non-numeric (e.g. corrupted entries) becomes NaN
    df[column] = pd.to_numeric(without_separators, errors='coerce')

df.info()

### 4. Exploratory Data Analysis (EDA)

In [None]:
# Bar chart of how many songs exist for each value of 'artist_count'
fig, ax = plt.subplots(figsize=(6, 5))
sns.countplot(data=df, x='artist_count', palette="rocket", width=0.5, saturation=2, ax=ax)
ax.set_title('Number of Artists per Song', fontsize=10)
ax.set_xlabel('Number of Artists')
ax.set_ylabel('Song Count')
fig.tight_layout()
plt.show()

In [None]:
# Group by artist and sum up their streams, keeping the 10 largest totals
artist_streams = df.groupby('artist(s)_name')['streams'].sum().sort_values(ascending=False).head(10)

# The x-axis is labelled in billions, so scale the raw totals to match the label;
# plotting raw counts would leave the axis off by a factor of 1e9.
streams_in_billions = (artist_streams / 1e9).values

# Plot the artists with the most streams
plt.figure(figsize=(6, 5))
sns.barplot(x=streams_in_billions, y=artist_streams.index, palette="rocket", width=0.5, saturation=2, orient='h')
plt.title('Top 10 Artists based on Total Streams', fontsize=10)
plt.xlabel('Total Streams in billions')
plt.ylabel('Artist Name')
plt.tight_layout()
plt.show()

In [None]:
# Number of songs per artist entry; value_counts sorts descending, so the
# first ten rows are the ten most frequent entries
top_artists = df['artist(s)_name'].value_counts().iloc[:10]

# Horizontal bar chart of the ten artists with the most songs
fig, ax = plt.subplots(figsize=(6, 5))
sns.barplot(
    x=top_artists.values,
    y=top_artists.index,
    palette="rocket",
    width=0.5,
    saturation=2,
    orient='h',
    ax=ax,
)
ax.set_title('Top 10 Artists based on Most Songs', fontsize=10)
ax.set_xlabel('Number of Songs')
ax.set_ylabel('Artist Name')
fig.tight_layout()
plt.show()

In [None]:
# Chart columns of the four platforms (Spotify, Apple, Deezer and Shazam)
heatmap_data = df[[
    'in_spotify_charts',
    'in_apple_charts',
    'in_deezer_charts',
    'in_shazam_charts',
]]

# Pairwise correlation between the chart columns, drawn as an annotated heatmap
# with the colour scale pinned to the full [-1, 1] correlation range
fig, ax = plt.subplots(figsize=(8, 7))
sns.heatmap(
    heatmap_data.corr(),
    annot=True,
    cmap='rocket_r',
    fmt='.2f',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Correlation Heatmap of Chart Appearances')
fig.tight_layout()
plt.show()

High correlation values suggest that the same songs are popular across multiple platforms, while low correlation values suggest platform-specific popularity.

The correlation coefficient measures the strength and direction of the linear relationship between two variables. It ranges from -1 to 1, where:

* 1 indicates a perfect positive correlation
* -1 indicates a perfect negative correlation
* 0 indicates no correlation

Strongest Correlation: 
* The highest correlation is between Spotify and Deezer charts (0.60), suggesting that songs popular on Spotify are likely to be popular on Deezer as well.

Moderate Correlation:
* Spotify and Apple have a moderate correlation (0.55), indicating that there is a significant overlap between songs popular on these two platforms.
* Spotify and Shazam also have a moderate correlation (0.51), implying that songs frequently appearing on Spotify charts are often recognized on Shazam.

Lower Correlation:
* Apple and Deezer have lower correlation (0.38) which is still positive but weaker compared to Spotify's correlations with other platforms.
* Apple and Shazam have the same correlation as Apple and Deezer (0.38).
* Deezer and Shazam have the lowest correlation (0.31), indicating less overlap between these platforms.

There is a noticeable overlap in song popularity between Spotify, Deezer, and Apple, with Spotify showing a stronger correlation with Deezer. Shazam has the weakest correlations with other platforms, indicating that songs recognized on Shazam might not always align with the most popular songs on streaming platforms.

In [None]:
# How many tracks appear on each platform's charts.
# A track counts as "on the chart" when its chart column is greater than 0,
# the same membership rule the Venn-diagram cell uses. Summing the column
# values only equals a track count if the columns are strict 0/1 flags, so
# the tracks are counted explicitly to match the 'Number of Tracks' axis.
chart_columns = {
    'Spotify': 'in_spotify_charts',
    'Apple Music': 'in_apple_charts',
    'Deezer': 'in_deezer_charts',
    'Shazam': 'in_shazam_charts',
}

platform_counts = {
    'Platform': list(chart_columns),
    'Count': [(df[column] > 0).sum() for column in chart_columns.values()],
}

# Convert to DataFrame
platform_counts_df = pd.DataFrame(platform_counts)

# Plotting
plt.figure(figsize=(8, 5))
sns.barplot(data=platform_counts_df, x='Platform', y='Count', palette='rocket')
plt.xlabel('Platform')
plt.ylabel('Number of Tracks')
plt.title('Distribution of Tracks Among Platform Charts')
plt.tight_layout()
plt.show()

In [None]:


# Track names that appear on each platform's charts (chart column above 0)
spotify_tracks = set(df.loc[df['in_spotify_charts'] > 0, 'track_name'])
apple_tracks = set(df.loc[df['in_apple_charts'] > 0, 'track_name'])
deezer_tracks = set(df.loc[df['in_deezer_charts'] > 0, 'track_name'])
shazam_tracks = set(df.loc[df['in_shazam_charts'] > 0, 'track_name'])

# Three-way Venn diagram for Spotify, Apple and Deezer; Shazam is left out
# because venn3 takes exactly three sets
plt.figure(figsize=(10, 7))
venn3(
    subsets=[spotify_tracks, apple_tracks, deezer_tracks],
    set_labels=('Spotify', 'Apple Music', 'Deezer'),
)
plt.title('Overlap of Tracks Among Spotify, Apple Music, and Deezer Charts')
plt.show()


In [None]:
# Sum each platform's playlist column; Shazam has no playlist column in this
# dataset, so it is entered with a fixed 0
playlist_counts = {
    'Platform': ['Spotify', 'Apple Music', 'Deezer', 'Shazam'],
    'Number of Tracks': [
        df[column].sum()
        for column in ('in_spotify_playlists', 'in_apple_playlists', 'in_deezer_playlists')
    ] + [0],
}

# Tabular form expected by seaborn
playlist_counts_df = pd.DataFrame(playlist_counts)

# One bar per platform
fig, ax = plt.subplots(figsize=(6, 5))
sns.barplot(data=playlist_counts_df, x='Platform', y='Number of Tracks', palette='rocket', ax=ax)
ax.set_xlabel('Platform')
ax.set_ylabel('Number of Tracks')
ax.set_title('Distribution of Tracks in Playlists Across Platforms')
fig.tight_layout()
plt.show()


This code will produce a bar plot that shows the total number of tracks included in playlists across Spotify, Apple Music, and Deezer. 
Shazam is included in the plot with a value of 0, indicating it does not have playlist data in this context. 
This visualization helps compare the distribution of tracks in playlists across these platforms.

In [None]:

# Creating sets of track names for each platform's playlists
# (a track belongs to a platform when its playlist count is above 0).
# Note: these names are reused from the charts cell and are overwritten here.
spotify_tracks = set(df[df['in_spotify_playlists'] > 0]['track_name'])
apple_tracks = set(df[df['in_apple_playlists'] > 0]['track_name'])
deezer_tracks = set(df[df['in_deezer_playlists'] > 0]['track_name'])

# Venn diagram for Spotify, Apple, and Deezer playlists
# (there is no Shazam playlist column to include)
plt.figure(figsize=(6, 5))
venn3([spotify_tracks, apple_tracks, deezer_tracks],
      ('Spotify', 'Apple Music', 'Deezer'))
# The title names playlists: this cell compares playlist membership, not charts
plt.title('Overlap of Tracks Among Spotify, Apple Music, and Deezer Playlists')
plt.show()