In [13]:
# Import 'pandas' library for data manipulation
# Import 'pathlib' module to handle file paths
# Import 'matplotlib' library to create visualizations
# Import 'numpy' library for numerical operations
# Import 'scipy.stats' module for linear regression 

import pandas as pd 
from pathlib import Path 

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress

In [11]:
# Location of the dataset: 'spotify-2023.csv', resolved relative to the current working directory.
spotify_csv = Path("spotify-2023.csv")

# Load the CSV into a pandas DataFrame, decoding the file as Latin-1 text.
spotify_df = pd.read_csv(filepath_or_buffer=spotify_csv, encoding="latin-1")

# Preview the first 5 rows of the raw data exactly as loaded
# (no columns have been dropped or renamed at this point).
spotify_df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'spotify-2023.csv'

In [None]:
# List the column labels of the raw DataFrame
spotify_df.columns

In [None]:
# Narrow the raw data down to the columns used in this analysis

columns_needed = [
    "artist(s)_name",
    "track_name",
    "released_year",
    "streams",
    "in_spotify_charts",
    "in_apple_charts",
    "in_deezer_charts",
    "in_shazam_charts",
    "bpm",
    "danceability_%",
]
spotify_columns_df = spotify_df[columns_needed]

# Preview the first 10 rows of the reduced table
spotify_columns_df.head(10)



In [None]:
# Give the selected columns human-readable labels

# original column name -> display name
column_labels = {
    "track_name": "Song Title",
    "artist(s)_name": "Artist",
    "released_year": "Released Year",
    "streams": "Streams",
    "in_spotify_charts": "Spotify Charts",
    "in_apple_charts": "Apple Charts",
    "in_deezer_charts": "Deezer Charts",
    "in_shazam_charts": "Shazam Charts",
    "bpm": "BPM",
    "danceability_%": "Danceability %",
}
spotify_columns_df = spotify_columns_df.rename(columns=column_labels)

spotify_columns_df

In [None]:
# Find missing values: count() reports the number of non-NA entries per column,
# so a column with a lower count than the others contains missing values

spotify_columns_df.count()

In [None]:
# Fill every missing value in the DataFrame (all columns) with 0,
# then re-count the non-NA entries per column to confirm nothing is missing

spotify_columns_df = spotify_columns_df.fillna(0)
spotify_columns_df.count()


In [None]:
# Detecting missing or NA values
# isna() returns a table of booleans with the same shape: True marks an NA value

fillna_values = spotify_columns_df.isna()
fillna_values

In [None]:
# Show the data type (dtype) of each column

spotify_columns_df.dtypes

In [None]:
# Remove the row whose 'Streams' entry is a block of unrelated text instead of a number

value_to_delete = 'BPM110KeyAModeMajorDanceability53Valence75Energy69Acousticness7Instrumentalness0Liveness17Speechiness3'

# True for every row to keep, i.e. whose 'Streams' entry differs from the bad text
has_valid_streams = spotify_columns_df['Streams'].ne(value_to_delete)
spotify_columns_df = spotify_columns_df[has_valid_streams]

spotify_columns_df



In [None]:
# Cast the 'Streams' column to 64-bit integers (an error is raised if any entry cannot be converted)

streams_as_int = spotify_columns_df["Streams"].astype("int64")
spotify_columns_df = spotify_columns_df.assign(Streams=streams_as_int)

# Confirm the new dtype of the column
spotify_columns_df["Streams"].dtype


In [None]:
# Convert the 'Shazam Charts' column to 64-bit integers.
#   1. missing entries become 0
#   2. every entry is turned into text so the thousands separator can be removed
#      (e.g. "1,021" -> "1021")
#   3. the cleaned text is cast to int64
# Going through astype(str) keeps this cell re-runnable: calling the .str accessor
# directly fails with AttributeError once the column already holds integers.
# Building the column first and attaching it with assign() also avoids writing
# into a DataFrame that was produced by filtering another one.

shazam_charts_int = (
    spotify_columns_df['Shazam Charts']
    .fillna(0)
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype("int64")
)
spotify_columns_df = spotify_columns_df.assign(**{'Shazam Charts': shazam_charts_int})

# Confirm the resulting dtypes
spotify_columns_df.dtypes

In [None]:
# Calculating the maximum value from the "Streams" column

highest_streams = spotify_columns_df["Streams"].max()

# Show the largest stream count
print(highest_streams)

In [None]:
# Calculating the minimum value from the "Streams" column

lowest_streams = spotify_columns_df["Streams"].min()

# Show the smallest stream count
print(lowest_streams)

In [None]:
# Calculating descriptive statistics for the "Streams" column

stats_streams = spotify_columns_df["Streams"].describe()

# Show the summary produced by describe()
print(stats_streams)

In [None]:
# Calculating descriptive statistics for all the columns with numerical values

stats_spotify_columns = spotify_columns_df.describe()

# Show the summary table produced by describe()
print(stats_spotify_columns)

In [None]:
# Take the first 5 rows of the DataFrame in its current row order.
# NOTE: nothing is sorted here, so these are not necessarily the 5 most-streamed songs.

spotify_columns_top5_df = spotify_columns_df.head(5)

spotify_columns_top5_df

In [None]:
# Top 5 songs: sort by 'Streams' in descending order (ascending=False) and keep only
# the first five rows, so that `spotify_columns_top5_df` really contains the five
# most-streamed songs rather than the whole sorted table.

spotify_columns_top5_df = spotify_columns_df.sort_values("Streams", ascending=False).head(5)

spotify_columns_top5_df


In [None]:
# Take the last 5 rows of the DataFrame in its current row order.
# NOTE: nothing is sorted here, so these are not necessarily the 5 least-streamed songs.

spotify_columns_bottom5_df = spotify_columns_df.tail(5)

spotify_columns_bottom5_df

In [None]:
# Bottom 5 songs: sort by 'Streams' in ascending order (ascending=True) and keep only
# the first five rows, so that `spotify_columns_bottom5_df` really contains the five
# least-streamed songs rather than the whole sorted table.

spotify_columns_bottom5_df = spotify_columns_df.sort_values("Streams", ascending=True).head(5)

spotify_columns_bottom5_df

In [None]:
# Data Visualization: bar chart of the stream counts of the top 5 songs.

# 'data' holds two parallel lists:
#   'Song Title' - the x-axis labels: song title, a line break, then the artist name(s)
#   'Streams'    - the stream count of the song at the same position

data = {
    'Song Title': [
        'Blinding Lights \n By The Weeknd',
        'Shape of You \n By Ed Sheeran',
        'Someone You Loved \n By Lewis Capaldi',
        'Dance Monkey \n By Tones and I',
        'Sunflower-Spider-Man \n By Post Malone, Swae Lee',
    ],
    'Streams': [3703895074, 3562543890, 2887241814, 2864791672, 2808096550],
}

# Create a 10 x 6 inch figure and draw one sky-blue bar per song
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(data['Song Title'], data['Streams'], color='skyblue')

# Rotate the x-axis labels by 45 degrees (right-aligned) so they stay readable,
# and print the y-axis tick labels as plain numbers instead of scientific notation
plt.xticks(rotation=45, ha='right')
ax.ticklabel_format(style='plain', axis='y')

# Axis labels and chart title
ax.set_xlabel('Song Title')
ax.set_ylabel('Streams')
ax.set_title('Streams by Songs-Top 5')

# Fit all elements inside the figure, save it to disk, then display it
fig.tight_layout()
fig.savefig("Barchart-top5")
plt.show()

In [None]:
# Data Visualization: bar chart of the stream counts of the bottom 5 songs.

# 'data' holds two parallel lists:
#   'Song Title' - the x-axis labels: song title, a line break, then the artist name(s)
#   'Streams'    - the stream count of the song at the same position

data = {
    'Song Title': [
        "Que Vuelvas \n By Carin Leon, Grupo Frontera",
        "Jhoome Jo Pathaan \n By Arijit Singh, Vishal Dadlani",
        "QUEMA \n By Sog, Ryan Castro, Peso Pluma",
        "Gol Bolinha, Gol Quadrado 2 \n By Mc Pedrinho, DJ 900",
        "Overdrive \n By Post Malone",
    ],
    'Streams': [2762, 1365184, 11599388, 11956641, 14780425],
}

# Create a 10 x 6 inch figure and draw one sky-blue bar per song
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(data['Song Title'], data['Streams'], color='skyblue')

# Rotate the x-axis labels by 45 degrees (right-aligned) so they stay readable,
# and print the y-axis tick labels as plain numbers instead of scientific notation
plt.xticks(rotation=45, ha='right')
ax.ticklabel_format(style='plain', axis='y')

# Axis labels and chart title, set in one call
ax.set(xlabel='Song Title', ylabel='Streams', title='Streams by Song-Bottom 5')

# Fit all elements inside the figure, save it to disk, then display it
fig.tight_layout()
fig.savefig("Barchart-bottom5")
plt.show()

In [None]:
# Descriptive statistics: largest and smallest value in the 'BPM' column

bpm_column = spotify_columns_df['BPM']

print(f"Max: {bpm_column.max()}")
print(f"Min: {bpm_column.min()}")

In [None]:
# Data Visualization: Pie Chart

# Bin edges used to group songs by BPM.
# pd.cut builds right-closed intervals by default: (59, 89], (89, 119], (119, 149],
# (149, 179], (179, 210]. The first edge is 59 rather than 60 so that a BPM of
# exactly 60 lands in the '60 to 89 bpm' group instead of being left without a group.
# (Assumes BPM values are whole numbers - TODO confirm.)

bins = [59, 89, 119, 149, 179, 210]

# One label per interval, in the same order as the intervals above
groups = ['60 to 89 bpm', '90 to 119 bpm', '120 to 149 bpm', '150 to 179 bpm', '180 to 210 bpm']

# Preview the group assigned to each song
pd.cut(spotify_columns_df['BPM'], bins, labels=groups)


In [None]:
# Count how many songs fall into each BPM group

# Assign every song to a BPM group, then tally the songs per group
bpm_group_per_song = pd.cut(spotify_columns_df['BPM'], bins=bins, labels=groups)
spotify_bpm_counts = bpm_group_per_song.value_counts()

spotify_bpm_counts

In [None]:
# Pie chart of the share of songs in each BPM group

colors = ['lightpink', 'peachpuff', 'palegreen', 'skyblue', 'plum']

# Pull the first wedge slightly out of the pie
explode = (.1,0,0,0,0)

# The wedge labels are taken from the index of `spotify_bpm_counts`, not from `groups`:
# value_counts() orders its result by frequency, so the counts are generally NOT in
# the order of `groups`, and labelling with `groups` would attach the wrong name to a wedge.
plt.pie(spotify_bpm_counts, labels=spotify_bpm_counts.index, autopct="%1.1f%%", shadow=True,
        explode=explode, colors=colors)
plt.title('BPM Distribution')
plt.savefig("BPM-Distribution")

# plt.show must be called; a bare `plt.show` only references the function
plt.show()

In [None]:
# Data Visualization: scatter plot of 'Danceability %' against 'Spotify Charts'
# with a fitted linear regression line

spotifycharts_scatterplot = spotify_columns_df[['Spotify Charts', 'Danceability %']]

# Keep only the rows whose 'Spotify Charts' value is greater than 0 and less than 100
# (this drops the 0 entries, including any that came from filling missing values)

spotifycharts_scatterplot = spotifycharts_scatterplot.loc[(spotifycharts_scatterplot['Spotify Charts'] > 0 ) &
                                                          (spotifycharts_scatterplot['Spotify Charts'] < 100)]

# Set Variables
# These must be assigned BEFORE the regression: otherwise the cell raises NameError on a
# fresh kernel, or silently fits the x/y values left over from another cell.

x_axis = spotifycharts_scatterplot['Spotify Charts']
y_axis = spotifycharts_scatterplot['Danceability %']

# Linear Regression: fit y = slope * x + intercept and compute the fitted y for every x

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_axis, y_axis)
regress_values = x_axis * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Plotting data: points, the regression line in red, and its equation as an annotation

plt.scatter(x_axis, y_axis)
plt.plot(x_axis, regress_values, "r-")
plt.annotate(line_eq, (60,30), fontsize=15,color="red")
plt.title("Danceability across Spotify Charts")
plt.xlabel('Spotify Chart Position')
plt.ylabel('Danceability (%)')
plt.savefig("scatter1")
print(f"The r-squared is: {rvalue**2}")
plt.show()

In [None]:
# Summary statistics of the filtered 'Spotify Charts' / 'Danceability %' data used in the scatter plot
spotifycharts_scatterplot.describe()


In [None]:
# Data Visualization: scatter plot of 'Danceability %' against 'Apple Charts'
# with a fitted linear regression line

# Keep only the two columns being compared
applecharts_scatterplot = spotify_columns_df[['Apple Charts', 'Danceability %']]

# Keep only the rows whose 'Apple Charts' value is greater than 0 and less than 100
apple_chart_values = applecharts_scatterplot['Apple Charts']
applecharts_scatterplot = applecharts_scatterplot.loc[(apple_chart_values > 0) & (apple_chart_values < 100)]

# x and y series for the regression and the plot
x_axis = applecharts_scatterplot['Apple Charts']
y_axis = applecharts_scatterplot['Danceability %']

# Linear regression: fit y = slope * x + intercept, then compute the fitted y for every x
slope, intercept, rvalue, pvalue, stderr = linregress(x_axis, y_axis)
regress_values = x_axis * slope + intercept
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

# Draw the points, the regression line in red, and the line equation at (60, 60)
plt.scatter(x_axis, y_axis)
plt.plot(x_axis, regress_values, "r-")
plt.annotate(line_eq, (60, 60), fontsize=15, color="red")

# Title and axis labels
plt.title("Danceability across Apple Charts")
plt.xlabel('Apple Chart Position')
plt.ylabel('Danceability (%)')

# Save the figure, report r-squared, then display the figure
plt.savefig("scatter2")
print(f"The r-squared is: {rvalue**2}")
plt.show()

In [None]:
# Summary statistics of the filtered 'Apple Charts' / 'Danceability %' data used in the scatter plot
applecharts_scatterplot.describe()

In [None]:
# Data Visualization: scatter plot of 'Danceability %' against 'Deezer Charts'
# with a fitted linear regression line

# Keep only the two columns being compared
deezercharts_scatterplot = spotify_columns_df[['Deezer Charts', 'Danceability %']]

# Keep only the rows whose 'Deezer Charts' value is greater than 0 and less than 100
above_zero = deezercharts_scatterplot['Deezer Charts'].gt(0)
below_hundred = deezercharts_scatterplot['Deezer Charts'].lt(100)
deezercharts_scatterplot = deezercharts_scatterplot.loc[above_zero & below_hundred]

# x and y series for the regression and the plot
x_axis = deezercharts_scatterplot['Deezer Charts']
y_axis = deezercharts_scatterplot['Danceability %']

# Linear regression: fit y = slope * x + intercept, then compute the fitted y for every x
slope, intercept, rvalue, pvalue, stderr = linregress(x_axis, y_axis)
regress_values = x_axis * slope + intercept
line_eq = "".join(["y = ", str(round(slope, 2)), "x + ", str(round(intercept, 2))])

# Draw the points, the regression line in red, and the line equation at (30, 30)
plt.scatter(x_axis, y_axis)
plt.plot(x_axis, regress_values, "r-")
plt.annotate(line_eq, (30, 30), fontsize=15, color="red")

# Title and axis labels
plt.title("Danceability across Deezer Charts")
plt.xlabel('Deezer Chart Position')
plt.ylabel('Danceability (%)')

# Save the figure, report r-squared, then display the figure
plt.savefig("scatter3")
print(f"The r-squared is: {rvalue**2}")
plt.show()

In [None]:
# Summary statistics of the filtered 'Deezer Charts' / 'Danceability %' data used in the scatter plot
deezercharts_scatterplot.describe()


In [None]:
# Data Visualization: scatter plot of 'Danceability %' against 'Shazam Charts'
# with a fitted linear regression line

# Keep only the two columns being compared
shazamcharts_scatterplot = spotify_columns_df[['Shazam Charts', 'Danceability %']]

# Keep only the rows whose 'Shazam Charts' value is greater than 0 and less than 100
in_plot_range = (shazamcharts_scatterplot['Shazam Charts'] > 0) & (shazamcharts_scatterplot['Shazam Charts'] < 100)
shazamcharts_scatterplot = shazamcharts_scatterplot.loc[in_plot_range]

# x and y series for the regression and the plot
x_axis = shazamcharts_scatterplot['Shazam Charts']
y_axis = shazamcharts_scatterplot['Danceability %']

# Linear regression: fit y = slope * x + intercept, then compute the fitted y for every x
slope, intercept, rvalue, pvalue, stderr = linregress(x_axis, y_axis)
regress_values = x_axis * slope + intercept
line_eq = "y = {}x + {}".format(str(round(slope, 2)), str(round(intercept, 2)))

# Draw the points, the regression line in red, and the line equation at (30, 30)
plt.scatter(x_axis, y_axis)
plt.plot(x_axis, regress_values, "r-")
plt.annotate(line_eq, (30, 30), fontsize=15, color="red")

# Title and axis labels
plt.title("Danceability across Shazam Charts")
plt.xlabel('Shazam Chart Position')
plt.ylabel('Danceability (%)')

# Save the figure, report r-squared, then display the figure
plt.savefig("scatter4")
print(f"The r-squared is: {rvalue**2}")
plt.show()

In [None]:
# Summary statistics of the filtered 'Shazam Charts' / 'Danceability %' data used in the scatter plot
shazamcharts_scatterplot.describe()

In [None]:
# Finding the unique values present in the 'Released Year' column
# (unique() returns each distinct value once)

spotify_columns_df["Released Year"].unique()


In [None]:
# Data Aggregation: average number of streams per release year

# Group the rows by each unique value of the 'Released Year' column
releasedyear_linechart_df = spotify_columns_df.groupby(by="Released Year")

# Within every year group, take the mean of the 'Streams' column
streams_per_year_group = releasedyear_linechart_df['Streams']
average_streams_releasedyear = streams_per_year_group.mean()

average_streams_releasedyear


In [None]:
# Data Visualization: line chart of the average streams per release year

fig, ax = plt.subplots()

# The Series index (release year) is used for x, its values (average streams) for y
ax.plot(average_streams_releasedyear)

ax.set_title("Music Popularity over Time")
ax.set_xlabel('Released Year')
ax.set_ylabel('Streams')

# Print the y-axis tick labels as plain numbers instead of scientific notation
ax.ticklabel_format(style='plain', axis='y')

# Save the figure to disk, then display it
fig.savefig("Streams-accrossyr1")
plt.show()



In [None]:
# Restrict the data to songs whose 'Released Year' is greater than 2020
# (2020 itself is excluded), then recompute the average streams per year

released_after_2020 = spotify_columns_df["Released Year"] > 2020
spotify_reducedyear_df = spotify_columns_df.loc[released_after_2020]

# Group the remaining rows by release year and average 'Streams' within each group
releasedyear_linechart_df = spotify_reducedyear_df.groupby(by="Released Year")
average_streams_releasedyear = releasedyear_linechart_df['Streams'].mean()

average_streams_releasedyear

In [None]:
# Line chart of the average streams per release year for the reduced year range
# currently held in `average_streams_releasedyear`

# Release years on the x axis, average streams on the y axis
years = average_streams_releasedyear.index
plt.plot(years, average_streams_releasedyear.values)

# The series plotted here is built from rows with 'Released Year' > 2020, so the
# first year shown is 2021; the title states the range that is actually plotted.
plt.title("Music Popularity over Time (2021-2023)")
plt.xlabel('Released Year')
plt.ylabel('Streams')

# Print the y-axis tick labels as plain numbers instead of scientific notation
plt.ticklabel_format(style='plain', axis='y')

# Put exactly one tick on each release year and label it as a whole number;
# automatic ticks would include fractional positions between the years.
# These explicit labels replace the x-axis ticklabel_format call.
plt.xticks(years, [str(year) for year in years])

plt.savefig("Streams-accrossyr2")
plt.show()