<a href="https://colab.research.google.com/github/hejnal/kschool-marketing-digital-geo-bqml/blob/main/notebooks/solutions/exercise2_EDA_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EDA Analysis for Spotify Dataset

## Instructions

Use Jupyter notebook and standard libraries to analyze the data and generate graphs.

To offload the memory consumption, BigQuery DataFrames can be used instead of normal Pandas.

## Install and import Libraries

In [None]:
# Install or upgrade the third-party libraries used in this notebook (--quiet trims pip's output).
!pip install --user --upgrade --quiet bigframes plotly yellowbrick scikit-learn

In [None]:
import seaborn as sns
from yellowbrick.target import FeatureCorrelation
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np

import warnings
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Default figure size (width, height) applied to subsequent matplotlib/seaborn figures.
sns.set(rc={'figure.figsize':(11.7,8.27)})

## [Colab Only] Authenticate

In [None]:
# Colab only: this import is available only inside Google Colab.
from google.colab import auth
# Authenticate the current user; presumably this supplies the credentials used by the BigQuery calls below.
auth.authenticate_user()

## Setup Project and Region and Table Name

In [None]:
import bigframes.pandas as bpd

# Project and location used for every BigQuery call in this notebook.
PROJECT_ID = "clean-silo-405314"  # @param {type:"string"}
REGION = "US"  # @param {type:"string"}
# Close any existing BigQuery DataFrames session before touching the options.
# NOTE(review): presumably required so the options below can be changed when this cell is re-run — confirm.
bpd.close_session()

# Set BigQuery DataFrames options
# Note: The project option is not required in all environments.
# On BigQuery Studio, the project ID is automatically detected.
bpd.options.bigquery.project = PROJECT_ID

# Note: The location option is not required.
# It defaults to the location of the first table or query
# passed to read_gbq(). For APIs where a location can't be
# auto-detected, the location defaults to the "US" location.
bpd.options.bigquery.location = REGION

## Load data directly from BigQuery, using magic bigquery functions or BigQuery DataFrames

### BigFrames option - all aggregations are done in BigQuery

In [None]:
# Read the Spotify table through BigQuery DataFrames, restricted to the listed columns,
# with use_cache=False passed through to read_gbq.
df = bpd.read_gbq(
    'raw_data.spotify_full_dataset',
    columns=[
        "artist_name",
        "track_name",
        "acousticness",
        "danceability",
        "duration_ms",
        "energy",
        "instrumentalness",
        "key",
        "liveness",
        "loudness",
        "mode",
        "popularity",
        "speechiness",
        "tempo",
        "valence",
        "year",
    ],
    use_cache=False,
)


### Magic Keyword option - download data to Pandas, process data in the local memory

In [None]:
# Load the BigQuery IPython extension so the %%bigquery cell magic used below is available.
%load_ext google.cloud.bigquery

In [None]:
%%bigquery df --project $PROJECT_ID --no_query_cache
-- The query result is stored in the variable `df`, replacing the `df` created by the BigFrames cell above.
-- Unlike the BigFrames read, this query also selects genre and time_signature.
SELECT
  artist_name,
  track_name,
  popularity,
  year,
  genre,
  danceability,
  energy,
  key,
  loudness,
  mode,
  speechiness,
  acousticness,
  instrumentalness,
  liveness,
  valence,
  tempo,
  duration_ms,
  time_signature
FROM
  `raw_data.spotify_full_dataset`

## Explore Spotify dataset

### Describe the dataframe

In [None]:
# Summary statistics for df, displayed as the cell output.
df.describe()

### Inspect the data

In [None]:
# Preview the first 5 rows (head() returns 5 rows when called without arguments).
df.head()

Let's check for the null values

In [None]:
# Number of null values in each column.
df.isnull().sum()

Let's see the stats for all the features

In [None]:
# Transposed summary statistics: one row per feature, one column per statistic.
df_stats = df.describe().transpose()
df_stats

In [None]:
# Data type of each column.
df.dtypes

### Histograms

Let's look at the distribution of song popularity for tracks released after 2010.

In [None]:
# Tracks released strictly after 2010.
df_filtered = df.loc[df['year'] > 2010]

# Widen the default figure, then draw the popularity histogram without a KDE curve.
sns.set(rc={'figure.figsize': (14.7, 8.27)})
sns.histplot(data=df_filtered, x='popularity', kde=False)

Popularity without outliers.

In [None]:
from scipy import stats

# Popularity histogram with outliers removed.
# Step 1: keep numeric columns only, for tracks from 2010 onwards with popularity above zero.
numeric_features = df.select_dtypes(np.number)
numeric_features_filtered = numeric_features.loc[
    (numeric_features['year'] >= 2010) & (numeric_features['popularity'] > 0)
]

# Step 2: compute the absolute popularity z-scores once. The column is cast to a plain
# float64 array before being handed to scipy.
popularity_zscores = np.abs(
    stats.zscore(np.array(numeric_features_filtered['popularity'], dtype=np.float64))
)

# Step 3: keep rows whose popularity lies within 3 standard deviations of the mean.
numeric_features_with_no_outliers = numeric_features_filtered[popularity_zscores < 3]

sns.set(rc={'figure.figsize':(14.7,8.27)})
sns.histplot(numeric_features_with_no_outliers['popularity'], kde=False)

Analyse the most recent years (2020–2023).

In [None]:
# Tracks released from 2020 through 2023, both bounds inclusive.
df_filtered = df.loc[df['year'].between(2020, 2023)]

Analyse the number of songs per decade.

In [None]:
def get_decade(year):
    """Return the decade label for ``year``, e.g. 1995 -> '1990s'."""
    # int() truncates the quotient, dropping the last digit of the year.
    return f"{int(year / 10) * 10}s"

# Add (or overwrite) a 'decade' column on df in place, derived from 'year' via get_decade.
df['decade'] = df['year'].apply(get_decade)

# Plot the distribution of tracks across decades.
sns.displot(df['decade'])

### Correlation between features

In [None]:
# Keep only the numeric columns of df
df_numeric = df.select_dtypes(include=np.number)
numeric_columns = df_numeric.columns

sns.set(rc={'figure.figsize': (12.7, 8.27)})
# Pairwise correlations between the numeric columns, drawn as a heatmap
sns.heatmap(df_numeric.corr())

More advanced correlations: energy versus popularity, coloured by mode (major and minor, shown in blue and orange), with a separate panel for each year.

In [None]:
# Reset the seaborn theme and set the default figure size in one call.
sns.set_theme(rc={'figure.figsize': (12.7, 8.27)})

# One panel per year (two panels per row); point colour encodes the 'mode' column.
sns.relplot(
    data=df_filtered,
    x='energy',
    y='popularity',
    hue='mode',
    col='year',
    col_wrap=2,
    height=10,
    aspect=2,
)

More basic correlation in the bar chart.

In [None]:
# Candidate features whose correlation with popularity is plotted as a bar chart.
feature_names = ['acousticness', 'danceability', 'energy', 'instrumentalness',
                 'liveness', 'loudness', 'speechiness', 'tempo', 'valence','duration_ms', 'key', 'mode']

# Take an explicit copy so the dtype conversion below never writes into a slice of df.
X = df[feature_names].copy()
y = df['popularity']

# Convert nullable Int64 feature columns to float64 as a safe option, in a single astype call.
int64_columns = [
    col for col in ['duration_ms', 'key', 'mode']
    if col in X.columns and X[col].dtype == 'Int64'
]
X = X.astype({col: np.float64 for col in int64_columns})

# Labels for the bars, in the same order as the columns of X.
features = np.array(feature_names)

# Instantiate the visualizer
visualizer = FeatureCorrelation(labels=features)

# Set the figure size before fitting.
plt.rcParams['figure.figsize']=(15,15)
visualizer.fit(X, y)     # Fit the data to the visualizer
visualizer.show()


### Timeseries

In [None]:
# Numeric columns of df only; used as the input for the per-year averages in the next cell.
numeric_features = df.select_dtypes(np.number)

In [None]:
# Mean of every numeric feature per year; 'year' stays a regular column (as_index=False).
features_by_year = numeric_features.groupby("year", as_index=False).mean()
sound_features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'valence',
]

# One line per sound feature, plotted against year.
fig = px.line(
    features_by_year,
    x='year',
    y=sound_features,
    height=1000,
    width=1800,
)
fig.show()

## Exercises
For your favourite artist, get some interesting stats about their career: how their songs evolve over time and what makes them successful.

Find answers to the following questions:

* See the feature evolution over time.
* In which years they published their songs (albums) and how many songs were released?
* What is the most popular song by the artist?
* In which year were the songs with the highest average energy levels released?
* What is the name of the most danceable song by your favorite artist (the one in the group)?
* Which feature has the highest correlation with song popularity?


In [None]:
# TODO filter df dataset by the artist

# my_artist_df =

In [None]:
# @title Solution
# Keep only the rows whose artist_name is exactly "Bon Iver".
my_artist_df = df.loc[df["artist_name"] == "Bon Iver"]

In [None]:
# TODO: See the feature evolution over time

# Starting point: per-year mean of the artist's numeric features, plotting 'energy' only.
# Extend sound_features with more columns to compare several features.
numeric_features = my_artist_df.select_dtypes(np.number)
features_by_year = numeric_features.groupby("year", as_index=False).mean()
sound_features = ['energy']

fig = px.line(features_by_year, x='year', y=sound_features, height=1000, width=1800)
fig.show()

In [None]:
# @title Solution
# Per-year mean of the artist's numeric features; 'year' stays a regular column.
numeric_features = my_artist_df.select_dtypes(np.number)
features_by_year = numeric_features.groupby("year", as_index=False).mean()
sound_features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'valence',
]

# One line per sound feature, plotted against year.
fig = px.line(
    features_by_year,
    x='year',
    y=sound_features,
    height=1000,
    width=1800,
)
fig.show()

In [None]:
# TODO: In which years they published their songs (albums) and how many songs were released?

# Number of rows (songs) per year for the artist, ordered by year ascending.
songs_by_year_df = my_artist_df.groupby("year").size().sort_index(ascending=True)

# Bar chart: years on the x-axis, song counts on the y-axis.
plt.bar(songs_by_year_df.index, songs_by_year_df.values)

# Add labels and title
plt.xlabel("Year")
plt.ylabel("Number of Songs")
plt.title("Number of Songs Released by Bon Iver Each Year")

# Show the plot
plt.show()

In [None]:
# @title Solution
# Question: in which years were songs released, and how many per year?

# Number of rows (songs) per year for the artist, ordered by year ascending.
songs_by_year_df = my_artist_df.groupby("year").size().sort_index(ascending=True)

plt.bar(songs_by_year_df.index, songs_by_year_df.values)

# Axis labels and title, set in one call on the current axes.
plt.gca().set(
    xlabel="Year",
    ylabel="Number of Songs",
    title="Number of Songs Released by Bon Iver Each Year",
)

# Render the figure
plt.show()

In [None]:
# TODO: What is the most popular song by the artist?
# Placeholder value so this cell is valid Python and "Run All" does not stop here;
# replace None with your own expression.
most_popular_song = None  # TODO: use idxmax() as the index of the max element

In [None]:
# @title Solution
# Index label of the row with the highest popularity, then the row itself.
most_popular_idx = my_artist_df['popularity'].idxmax()
most_popular_song = my_artist_df.loc[most_popular_idx]
print(most_popular_song)

In [None]:
# TODO: In which year were the songs with the highest average energy levels released?

# use groupby and agg() function.

In [None]:
# @title Solution
# Year with the highest average energy: mean energy per year, sorted descending, top row only.
(
    my_artist_df
    .groupby("year")
    .agg({"energy": "mean"})
    .sort_values(by="energy", ascending=False)
    .head(1)
)

In [None]:
# TODO: What is the name of the most danceable song by your favorite artist (the one in the group)?

# similar to the most popular song

In [None]:
# @title Solution
# Most danceable song: look up the row at the index label where danceability is highest.
most_danceable_idx = my_artist_df['danceability'].idxmax()
most_danceable_song = my_artist_df.loc[most_danceable_idx]
print(most_danceable_song)

In [None]:
# TODO: Which feature has the highest correlation with song popularity?

# Filter to numeric columns: collect the numeric column names first, then select those columns.
df_bon_iver_numeric_columns = my_artist_df.select_dtypes(include=np.number).columns
df_bon_iver_numeric = my_artist_df[df_bon_iver_numeric_columns]

# Next step (left as the exercise): call corr() on df_bon_iver_numeric, ignore the
# 'popularity' and 'year' index entries, sort by the 'popularity' column and keep 1 row.

In [None]:
# @title Solution
# Which feature has the highest correlation with song popularity?

# Restrict the artist's data to its numeric columns
df_bon_iver_numeric_columns = my_artist_df.select_dtypes(include=np.number).columns
df_bon_iver_numeric = my_artist_df[df_bon_iver_numeric_columns]

# Correlation of each feature with popularity (rows for popularity itself and year removed),
# sorted so the largest value comes first; keep the top row only.
(
    df_bon_iver_numeric
    .corr()
    .drop(['popularity', 'year'])
    .loc[:, ["popularity"]]
    .sort_values(by="popularity", ascending=False)
    .head(1)
)