# PROJECT SETUP

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# LOADING DATA

In [None]:
# Location of the Kaggle "spotify-2023.csv" file. Set the SPOTIFY_CSV environment
# variable to load a copy stored elsewhere; when it is unset, the original local
# path is used, so existing setups keep working unchanged.
SPOTIFY_CSV_PATH = os.environ.get(
    "SPOTIFY_CSV",
    r"C:\Users\Ian Mazzola\Desktop\Data Analyst\Datasets\spotify-2023.csv",
)

# NOTE(review): the "latin" codec is kept from the original call; presumably the
# file does not decode cleanly as UTF-8 -- confirm against the raw file.
spotify = pd.read_csv(SPOTIFY_CSV_PATH, encoding = "latin")

### This dataset contains a comprehensive list of the most famous songs of 2023 as listed on Spotify. The dataset offers a wealth of features beyond what is typically available in similar datasets. It provides insights into each song's attributes, popularity, and presence on various music platforms. The dataset includes information such as track name, artist(s) name, release date, Spotify playlists and charts, streaming statistics, Apple Music presence, Deezer presence, Shazam charts, and various audio features. It's available on Kaggle (https://www.kaggle.com/datasets/nelgiriyewithana/top-spotify-songs-2023)

In [None]:
# Preview the first five rows of the raw dataset.
spotify.head(n = 5)

In [None]:
# Print the dataset dimensions as a (rows, columns) tuple.
print("Spotify:", spotify.shape)

In [None]:
# Summarize the columns: names, non-null counts and dtypes.
spotify.info()

# MISSING VALUES

In [None]:
# Number of missing values in each column.
spotify.isna().sum()

### We have two columns with NULL values. These NULL values represent the absence of information. We want to keep them to reflect the reality of the dataset. They could also be useful later for data imputation tasks.

# DATATYPE CORRECTION

In [None]:
### We have a wrong datatype in streams. It should be int64 because it represents the number of streams of a song.
### While the column is stored as object, a descending sort compares the values as text rather than as numbers,
### so this table is not a real "most streamed" ranking; it is kept only because entries starting with letters sort to the top.

top_streams_spotify = spotify[["track_name", "artist(s)_name", "streams"]].sort_values(by = "streams", ascending = False).head(10)

### Coercing the column to numbers pinpoints the offending rows (and their index labels) directly:
### a value that is present but cannot be parsed as a number becomes NaN after the coercion.
streams_as_number = pd.to_numeric(spotify["streams"], errors = "coerce")
non_numeric_streams = spotify.loc[streams_as_number.isna() & spotify["streams"].notna(), ["track_name", "artist(s)_name", "streams"]]
non_numeric_streams

### We see that the song "Love Grows(Where My Rosemary Goes)" has a non-numeric stream value. This is likely the reason why the datatype of streams is object. According to its Spotify page, this song had a stream count of 212,913,596 as of 7:16 pm on November 6th, 2023, so we correct our data with that value.

In [None]:
# Overwrite the stream count of row label 574 with the manually looked-up figure.
# NOTE(review): presumably 574 is the row with the non-numeric "streams" entry -- confirm against the loaded data.
CORRUPTED_STREAMS_ROW = 574
CORRECTED_STREAM_COUNT = 212913596
spotify.loc[CORRUPTED_STREAMS_ROW, "streams"] = CORRECTED_STREAM_COUNT

### Now we change the datatype of streams to int64.

In [None]:
# Cast only the "streams" column to 64-bit integers; every other column keeps its dtype.
spotify = spotify.astype({"streams": "int64"})

In [None]:
# Re-inspect the column dtypes after converting "streams".
spotify.info()

### We can see that "in_deezer_playlist" and "in_shazam_charts" have object Dtype. We won't change them to int64 because we will work only with Spotify as the streaming platform.

# TOP 10 STREAMED SONGS IN 2023 ON SPOTIFY

In [None]:
# Ten tracks with the highest stream counts, highest first.
top_songs_spotify = (
    spotify[["track_name", "artist(s)_name", "streams"]]
    .sort_values(by = "streams", ascending = False)
    .iloc[:10]
)

# Bar chart with one bar per track; the bar length is the stream count.
graph1 = sns.barplot(data = top_songs_spotify, x = "streams", y = "track_name", palette = "Reds")
graph1.set(
    xlabel = "Streams(in billions)",
    ylabel = "Track Name",
    title = "Top 10 Songs with Most Streams on Spotify",
)

plt.show()

# PERSONAL TASTE OF MUSIC

### The idea of this analysis is to take the data in this dataset for artists that I like to listen to and compare it to the Top Streamed Songs on Spotify. First, we will make a dataframe with the personal taste data.

In [None]:
# Artists that make up the personal selection, in the order their songs are stacked below.
favorite_artists = [
    "Shawn Mendes",
    "Bruno Mars",
    "Ariana Grande",
    "Justin Bieber",
    "Post Malone",
    "The Weeknd",
    "Maroon 5",
]
artist_credits = spotify["artist(s)_name"]

# One frame per artist: every song whose artist credit contains that name,
# so a collaboration is picked up by each credited artist in the list.
(
    shawn_mendes,
    bruno_mars,
    ariana_grande,
    justin_bieber,
    post_malone,
    the_weeknd,
    maroon_5,
) = [spotify[artist_credits.str.contains(name)] for name in favorite_artists]

# Stack the per-artist frames; a song crediting two selected artists appears twice at this point.
personal_music = pd.concat([shawn_mendes, bruno_mars, ariana_grande, justin_bieber, post_malone, the_weeknd, maroon_5])
personal_music.head(60)

In [None]:
# Three track names came through with an encoding error; map each affected row label
# to its corrected title and write the corrections back.
corrected_track_names = {
    693: "Señorita",
    514: "Here We Go ... Again (feat. Tyler, the Creator)",
    538: "Don't Break My Heart",
}
for row_label, fixed_title in corrected_track_names.items():
    personal_music.loc[row_label, "track_name"] = fixed_title

personal_music.head(60)

### Building the dataframe with this technique can produce duplicates, because featured songs are counted once for each selected artist.
### For example, the song "Die For You - Remix" is counted twice because we selected both Ariana Grande and The Weeknd for this dataframe.

In [None]:
# Remove rows that are exact copies of an earlier row, keeping the first occurrence.
personal_music = personal_music.drop_duplicates(keep = "first")

In [None]:
# Count the rows that are still exact duplicates of an earlier row.
personal_music.duplicated().sum()

## STATISTICS

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
personal_music.describe()

### The mean released year is 2019.51.
### The oldest song in this dataframe is from 2010 and the newest is from 2023.
### The mean artist count is 1.51, which rounds to 2. This suggests that I tend to like featured songs more than solo ones.
### The mean number of appearances in Spotify playlists is 7573.51 with a standard deviation of 8331.96. This suggests that there is a significant variation in popularity among songs.
### The average beats per minute (BPM) is 121.531 with a standard deviation of 25.898.
### The average danceability percentage is 62.23% with a standard deviation of 14.18.
### The average valence percentage is 49.15% with a standard deviation of 24.30.
### The average energy percentage is 63.78% with a standard deviation of 14.95.
### The average acousticness percentage is 23.04% with a standard deviation of 26.34.
### The average instrumentalness percentage is 0.56% with a standard deviation of 3.79.
### The average liveness percentage is 19.78% with a standard deviation of 14.28.
### The average speechiness percentage is 8.20% with a standard deviation of 8.27.

In [None]:
# Ten most streamed tracks within the personal selection, highest first.
personal_top_songs = (
    personal_music[["track_name", "artist(s)_name", "streams"]]
    .sort_values(by = "streams", ascending = False)
    .iloc[:10]
)

# Bar chart with one bar per track; the bar length is the stream count.
graph2 = sns.barplot(data = personal_top_songs, x = "streams", y = "track_name", palette = "Reds")
graph2.set(
    xlabel = "Streams(in billions)",
    ylabel = "Track Name",
    title = "Top 10 Songs from my personal selection with Most Streams on Spotify",
)

plt.show()

## CATEGORICAL VALUES ANALYSIS

In [None]:
# Number of songs per musical key in the personal selection.
sns.countplot(data = personal_music, x = "key").set_title("Key Distribution for Personal Music Selection")

plt.show()

### C# is the most common key among the songs.

In [None]:
# Top Artists with most songs in the Personal Selection:
# the ten most frequent values of the artist credit column and their song counts.

top_personal_artists = personal_music["artist(s)_name"].value_counts().head(10)

plt.figure(figsize = (12,6))
graph3 = sns.barplot(x = top_personal_artists, y = top_personal_artists.index, palette = "Greens")
plt.xlabel("Number of Songs")
plt.ylabel("Artist(s) Name")
plt.title("Top Artists with most songs in the Personal Selection")

# Annotate every bar with its count. Depending on the seaborn version, the bars may be
# split across several containers (one per bar), in which case labelling only
# containers[0] would annotate just the first bar.
for bar_container in graph3.containers:
    graph3.bar_label(bar_container)

plt.show()

### The Weeknd leads my Personal Selection with 22 solo songs, followed by Bruno Mars with 4 solo songs, and by Shawn Mendes, Post Malone and Justin Bieber, each with 3 songs.

## NUMERICAL VALUES ANALYSIS

In [None]:
# Pairwise view of the audio-feature columns of the personal selection.
attribute_columns = personal_music.loc[
    :,
    ["danceability_%", "valence_%", "energy_%", "acousticness_%", "liveness_%", "speechiness_%"],
]

# corner=True draws only the lower triangle of the grid.
graph4 = sns.pairplot(data = attribute_columns, corner = True)
plt.show()

### Energy and Danceability appear to have left-skewed distributions (most values are concentrated at the high end, with a tail toward lower values), suggesting that a significant number of tracks have high energy levels and are more danceable.
### Acousticness, Liveness and Speechiness seem to have right-skewed distributions (most values are concentrated at the low end, with a tail toward higher values), implying a large number of songs with low values of those attributes.

### Danceability and Valence seem to have a positive correlation, suggesting that more danceable songs tend to have a more positive mood.
### Acousticness and Energy appear to have a negative correlation: more energetic songs tend to have lower acousticness values.
### I don't see any other clear relationship between the remaining numerical attributes, which suggests that the Personal Selection contains diverse musical compositions.

# TOP 100 SONGS OF SPOTIFY 

In [None]:
# Comparison baseline for the personal selection: the 100 most streamed songs on Spotify.
top100_spotify = (
    spotify
    .sort_values(by = "streams", ascending = False)
    .iloc[:100]
)

## STATISTICS

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of the Top 100.
top100_spotify.describe()

In [None]:
# The mean released year is 2012.78. This is earlier than in my personal selection.
# The oldest song is from 1975, so the 20th century is represented in the Top 100, whereas it is not in my selection.
# The mean artist count is 1.31. This suggests that the Top 100 leans toward solo songs, while my personal selection leans more toward featured songs.
# The mean number of appearances in Spotify playlists is 20552.62, which is almost three times higher than in my personal selection. This suggests that my musical preferences aren't influenced by the most listened songs.
# The average BPM, danceability, valence, energy, acousticness, instrumentalness, liveness, and speechiness values are generally similar to those of the personal selection. This means that the musical attributes of the songs that I like are similar to the attributes of the Top 100 songs.

## CATEGORICAL VALUES ANALYSIS

In [None]:
# Number of songs per musical key among the 100 most streamed songs.
sns.countplot(data = top100_spotify, x = "key").set_title("Key Distribution for the Most 100 Streamed Songs on Spotify")

plt.show()

### C# is the most popular key. In my personal selection, the most common key is also C#. Maybe we listeners love songs in C#.

In [None]:
# Top Artists with most songs in the Top 100:
# the ten most frequent values of the artist credit column and their song counts.

top100_artists = top100_spotify["artist(s)_name"].value_counts().head(10)

plt.figure(figsize = (12,6))
graph5 = sns.barplot(x = top100_artists, y = top100_artists.index, palette = "Greens")
plt.xlabel("Number of Songs")
plt.ylabel("Artist(s) Name")
plt.title("Top Artists with most songs in the Top 100")

# Annotate every bar with its count. Depending on the seaborn version, the bars may be
# split across several containers (one per bar), in which case labelling only
# containers[0] would annotate just the first bar.
for bar_container in graph5.containers:
    graph5.bar_label(bar_container)

plt.show()

### Here, Ed Sheeran reaches the first place with 6 songs and he is followed by The Weeknd with 5 songs. 

## NUMERICAL VALUES ANALYSIS

In [None]:
# Numerical Values Distribution: pairwise view of the audio-feature columns of the Top 100.

attribute_columns_top = top100_spotify[["danceability_%", "valence_%", "energy_%", "acousticness_%", "liveness_%", "speechiness_%"]]

# Bound to its own name (graph6) so it no longer overwrites `graph4`,
# which holds the pair plot of the personal selection.
graph6 = sns.pairplot(attribute_columns_top, corner = True)
plt.show()

### It seems that most of the Top 100 Songs have low levels of acousticness, liveness and speechiness. We reached a similar conclusion for my personal selection.
### Here, Danceability does not appear to be clearly skewed toward high values. Perhaps "not as skewed as in my personal selection" is a better way to describe the observation.
### I don't see any new observation that I could draw from these distributions.

### The correlations that we found in the personal selection also apply to these graphs.

# The biggest conclusion that I draw from this analysis is that the musical attributes of the most streamed songs on Spotify contribute to developing my personal taste in music. There are a lot of similarities in musical attributes between the Top 100 and my personal preferences.