## Spotify 2023 Data Analysis

### Exploratory Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import FuncFormatter
import seaborn as sns
from IPython.display import display
import plotly.express as px
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

# Confirmation message shown once the imports above have run without error.
print('packages imported')

In [None]:
# Load the Spotify 2023 dataset; the file is decoded as latin-1.
# NOTE(review): the path is machine-specific and must be adjusted to run elsewhere.
csv_path = r'C:\Users\cindy\Documents\Projects\spotify_2023.csv'
df = pd.read_csv(csv_path, encoding='latin-1')

# Preview the first rows.
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

#### Preliminary Analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the columns
# that pandas treats as numeric.
df.describe()

In [None]:
# Count the missing values in every column; columns with nulls are filled
# with 0 in the next step.
df.isna().sum()

In [None]:
# Replace the null values by 0 in the two columns handled here.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated in recent pandas and leaves `df` unchanged once copy-on-write
# is active.
df['in_shazam_charts'] = df['in_shazam_charts'].fillna(0)
df['key'] = df['key'].fillna(0)

# Re-check the per-column null counts after filling.
df.isnull().sum()

In [None]:
# Column names, non-null counts and dtypes of the frame.
df.info()

#### Top 10 Most Streamed Tracks

In [None]:
# The 'streams' column is read as object and must become numeric. Values that
# cannot be parsed are coerced to NaN.
df['streams'] = pd.to_numeric(df['streams'], errors='coerce', downcast='integer')

# Rows whose stream count could not be parsed as a number.
non_num = df[df['streams'].isna()]

# Drop exactly those rows (by their own index labels rather than a hard-coded
# label, so re-running the cell or loading a different file cannot remove a
# valid row), then renumber the index.
df.drop(non_num.index, axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

df.shape

In [None]:
# Show the dtype of 'streams' to confirm the column is now numeric.
print(df['streams'].dtype)

In [None]:
# What are the 10 most streamed songs in the dataset?
# Sort by 'streams' in descending order, keep the first ten rows and
# renumber them from 0.
top_10_songs = (
    df.sort_values(by='streams', ascending=False)
    .head(10)
    .reset_index(drop=True)
)

# Show the single most streamed track.
top_10_songs.head(1)

In [None]:
# Horizontal bar chart of the ten most streamed tracks.
plt.figure(figsize=(10, 10))
sns.set(style="whitegrid")
ax = sns.barplot(data=top_10_songs, x='streams', y='track_name', palette="viridis")

# Axis labels and title
ax.set_xlabel("Streams")
ax.set_ylabel("Track Name")
ax.set_title("Top 10 Songs by Streams")

# Custom tick formatter that renders raw counts in billions
def billions_formatter(x, pos):
    """Return tick value *x* scaled to billions, e.g. 2.5e9 -> '2.5B'.

    *pos* is the tick position that matplotlib's FuncFormatter passes in;
    it is not used.
    """
    in_billions = x / 1e9
    return format(in_billions, '.1f') + 'B'

# Render the x-axis tick labels in billions
plt.gca().xaxis.set_major_formatter(FuncFormatter(billions_formatter))

# Write the stream count at the end of each bar. The frame's index runs
# 0..9 after the earlier reset, so the index label doubles as the bar's
# position on the y-axis.
for bar_position, stream_count in top_10_songs['streams'].items():
    plt.text(stream_count, bar_position, f"{stream_count / 1e9:.1f}B streams")


# Show the plot
plt.show()

#### Top 10 Most Streamed Artists

In [None]:
# Turn the comma-separated artist string into a list of names.
# The split pattern is a comma together with any surrounding whitespace
# (a multi-character pattern is treated as a regular expression), so a
# collaborator listed after ", " does not keep a leading space and end up
# counted as a different artist from the same name listed first.
df['artist(s)_name'] = df['artist(s)_name'].str.strip().str.split(r'\s*,\s*')
df['artist(s)_name']

In [None]:
# Separate songs with multiple artists into one row per artist (the index is
# renumbered by ignore_index=True), then drop the 'artist_count' column.
df = (
    df.explode('artist(s)_name', ignore_index=True)
    .drop(columns='artist_count')
)
df.head(3)

In [None]:
# Display every row that is an exact copy of another row (all copies shown).
is_duplicate = df.duplicated(keep=False)
duplicated_rows = df[is_duplicate]
display(duplicated_rows)

In [None]:
# Remove exact duplicate rows (keeping the first occurrence) and renumber
# the index in the same call.
df.drop_duplicates(inplace=True, ignore_index=True)

# Verify that no duplicates remain: the displayed frame should be empty.
dup_rows = df.loc[df.duplicated(keep=False)]
display(dup_rows)
df.shape

In [None]:
# Total streams per artist, returned as a two-column frame
artst_strms = df.groupby('artist(s)_name', as_index=False)['streams'].sum()

# Ten artists with the highest totals, largest first
most_strm_artst = artst_strms.sort_values(by='streams', ascending=False).head(10)

most_strm_artst

In [None]:
# Treemap of the ten most streamed artists (Plotly Express).
fig = px.treemap(
    most_strm_artst,
    path=['artist(s)_name'],   # single-level hierarchy: the artist name
    values='streams',          # tile size: total streams
    color='streams',           # tile colour: total streams
    hover_data=['streams'],    # also show the total on hover
    title='Top 10 Most Streamed Artists (Treemap)',
)

# Print both the artist label and the stream total inside each tile
fig.update_traces(selector={'type': 'treemap'}, textinfo='label+value')

# Render the interactive figure
fig.show()

#### Top 10 Artists with the Most Tracks

In [None]:
# Number of rows (tracks) per artist, most frequent first
artst_trck_cnt = df['artist(s)_name'].value_counts()

# Keep the ten artists with the most tracks
top_10_artst = artst_trck_cnt.iloc[:10]

display(top_10_artst)

In [None]:
# Vertical bar chart of the ten artists with the most tracks
plt.figure(figsize=(8, 6))
top_10_artst.plot.bar()
plt.title('Top 10 Artists with most tracks')
plt.xlabel('Artist')
plt.ylabel('Number of songs')
plt.xticks(rotation=45)   # tilt the artist names so they stay readable
plt.tight_layout()
plt.show()

### Exploring Correlations and Music Attributes

#### Correlation between attributes and streams

In [None]:
# Pairwise correlation between stream counts and the audio attributes.
# NOTE(review): df holds one row per track/artist pair at this point, so a
# track with several artists contributes several rows - confirm this
# weighting is intended.
attribute_cols = [
    'streams',
    'danceability_%',
    'energy_%',
    'valence_%',
    'acousticness_%',
    'instrumentalness_%',
    'liveness_%',
    'speechiness_%',
]
correlation_matrix = df[attribute_cols].corr()

# Annotated heatmap with the colour scale centred on zero
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Danceability plotted against energy, one point per row of df
plt.figure(figsize=(8, 6))
scatter_ax = sns.scatterplot(data=df, x='danceability_%', y='energy_%')
scatter_ax.set(title='Danceability vs. Energy', xlabel='Danceability', ylabel='Energy')
plt.show()

#### Visualization of Music Sounds

In [None]:
# Histogram of tempo (20 bins) with a kernel density estimate overlaid
plt.figure(figsize=(8, 6))
sns.histplot(data=df, x='bpm', bins=20, kde=True, color='skyblue')
plt.title('Distribution of BPM (Beats Per Minute)')
plt.xlabel('BPM')
plt.ylabel('Count')
plt.show()

In [None]:
# Side-by-side count plots: musical key on the left, mode on the right.
plt.figure(figsize=(12, 6))

# (column, palette, title) for each panel, in left-to-right order
panels = (
    ('key', 'Set3', 'Key Distribution'),
    ('mode', 'Set2', 'Mode Distribution'),
)
for panel_no, (column, palette_name, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_no)
    sns.countplot(x=column, data=df, palette=palette_name)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

#### Linear Regression

In [None]:
# Keep only the integer and float columns as candidate regression inputs.
int_cols = df.select_dtypes(include=['int', 'float'])

# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
int_cols.info()

In [None]:
# Predictors: every numeric column except the target
X = int_cols.drop(columns='streams')

# Target: stream counts
y = df['streams']

In [None]:
# define the model: scikit-learn's LinearRegression with default settings
model = LinearRegression()

# train the model on the numeric predictors X against the stream counts y
model.fit(X,y)

In [None]:
# Read the fitted coefficients (one per column of X) and the intercept
coeficientes, intercepto = model.coef_, model.intercept_

# Print the fitted parameters
print("Coeficientes:", coeficientes)
print("Intercepto:", intercepto)

In [None]:
# Playlist and chart columns left out of the reduced model
columns_to_exclude = [
    'in_spotify_playlists',
    'in_spotify_charts',
    'in_apple_playlists',
    'in_apple_charts',
    'in_deezer_charts',
]

# Refit the scikit-learn model on the reduced predictor set
X_2 = X.drop(columns=columns_to_exclude)
model.fit(X_2, y)

# Fit the same regression with statsmodels OLS; add_constant appends the
# intercept column.
X3 = sm.add_constant(X_2)
est = sm.OLS(y, X3)
est2 = est.fit()
print(est2.summary())

In [None]:
# Display floats with two decimals from here on
pd.set_option('display.float_format', '{:.2f}'.format)

# Mean stream count per release year
result = df.groupby("released_year")['streams'].mean()

# Scatter of year vs. mean streams with a fitted regression line
sns.regplot(x=result.index, y=result)

# Show the plot
plt.show()

# Author's reading: there is an outlier around 1973-1978 (probably 1975).

In [None]:
# Inspect the rows released in 1975 (author's note: 2 all-time hits).
df.loc[df['released_year'] == 1975]

In [None]:
# Mean stream count per day of the month of release
result2 = df.groupby("released_day")['streams'].mean()

# Scatter of release day vs. mean streams with a fitted regression line
sns.regplot(x=result2.index, y=result2)

# Show the plot
# Author's reading: releasing later in the month looks more favourable.
plt.show()

In [None]:
# Mean stream count per release month, with a fitted regression line
result3 = df.groupby('released_month')['streams'].mean()
sns.regplot(x=result3.index, y=result3)

# Show the plot
# Author's reading: no real relationship.
plt.show()

In [None]:
# Mean stream count per danceability value, with a fitted regression line
result4 = df.groupby('danceability_%')['streams'].mean()
sns.regplot(x=result4.index, y=result4)

# Show the plot
# Author's reading: the higher the danceability_%, the lower the streams.
plt.show()

In [None]:
# Mean stream count per energy value, with a fitted regression line
result5 = df.groupby('energy_%')['streams'].mean()
sns.regplot(x=result5.index, y=result5)

# Show the plot
# Author's reading: energy does not really have an effect on streams.
plt.show()