# Spotify Analysis
The goal of this notebook is to clean and analyse music streaming data sourced from Spotify. For the steps to obtain the data, see: https://support.stats.fm/docs/import/spotify-import/

In [None]:
# Import required libraries

import os
import json
import pandas as pd
import numpy as np
import plotly.express as px

## Data Import
Spotify provides the data in JSON format; however, it is often supplied as multiple files split by year. These files need to be concatenated into a single dataset.

In [None]:
def read_jsons_to_df(data_dir_path):
    """
    Reads every ``.json`` file in the provided directory with ``pd.read_json``
    and concatenates the resulting tables row-wise into a single dataframe
    with a fresh 0..n-1 index.
    Assumes all jsons are the same format (provided from Spotify).

    Files are read in sorted filename order so that the row order of the
    result is reproducible; ``os.listdir`` returns entries in arbitrary order.

    Parameters:
        data_dir_path: path to the directory with the json files

    Returns:
        dataframe with all concatenated jsons

    Raises:
        ValueError: if the directory contains no ``.json`` files
    """
    # List all .json files in the directory, in a deterministic order
    json_files = sorted(f for f in os.listdir(data_dir_path) if f.endswith('.json'))

    # Fail early with a clear message instead of letting pd.concat raise
    # its generic "No objects to concatenate" error on an empty list
    if not json_files:
        raise ValueError(f"No .json files found in directory: {data_dir_path!r}")

    # Read each file into its own dataframe
    df_list = [
        pd.read_json(os.path.join(data_dir_path, filename))
        for filename in json_files
    ]

    # Concatenate all dataframes into one final dataframe
    return pd.concat(df_list, ignore_index=True)

In [None]:
# Load every JSON export found in the data folder into one dataframe
df = read_jsons_to_df(data_dir_path='data/')

## Data Cleansing

In [None]:
# Parse the ts column into pandas datetimes
parsed_ts = pd.to_datetime(df['ts'])
df['ts'] = parsed_ts

In [None]:
# Add columns for Year, Month and YYYY-MM.
# Year and Month are read straight from the datetime accessor instead of
# being formatted to text and parsed back into integers.
df['Year'] = df['ts'].dt.year.astype(int)
df['Month'] = df['ts'].dt.month.astype(int)
df['YYYY-MM'] = df['ts'].dt.strftime('%Y-%m')

In [None]:
# Derive minutes, hours and days played from the milliseconds column
df['mins_played'] = df['ms_played'].div(60000)
df['hours_played'] = df['mins_played'].div(60)
df['days_played'] = df['hours_played'].div(24)

In [None]:
# Give the artist, album and song columns short, readable names
short_names = {
    'master_metadata_album_artist_name': 'artist',
    'master_metadata_album_album_name': 'album',
    'master_metadata_track_name': 'song',
}
df.rename(columns=short_names, inplace=True)

## Analysis

### Lists of Top Artists, Albums & Songs

In [None]:
# All-time listening totals per artist, most-played first
list_time_by_artist = (
    df.groupby("artist")[["mins_played", "hours_played", "days_played"]]
    .sum()
    .sort_values(by="mins_played", ascending=False)
    .round(2)
)

list_time_by_artist

In [None]:
# Top played albums of all time.
# Group on artist AND album: album titles are not unique across artists
# (e.g. several artists have a "Greatest Hits"), so grouping on the title
# alone would merge unrelated albums into a single row.
# The result is indexed by (artist, album).
list_time_by_album = (
    df.groupby(["artist", "album"])[["mins_played", "hours_played", "days_played"]]
    .sum()
    .sort_values("mins_played", ascending=False)
    .round(2)
)

list_time_by_album

In [None]:
# Top played songs of all time.
# Group on artist AND song: track titles are not unique across artists,
# so grouping on the title alone would add together the listening time of
# different songs that merely share a name.
# The result is indexed by (artist, song).
list_time_by_song = (
    df.groupby(["artist", "song"])[["mins_played", "hours_played", "days_played"]]
    .sum()
    .sort_values("mins_played", ascending=False)
    .round(2)
)

list_time_by_song

### Artist Popularity over Time

In [None]:
# Rank artists by total minutes played and keep the first 15
ranked_artists = list_time_by_artist.sort_values(by='mins_played', ascending=False)
top_15_artists = ranked_artists.iloc[:15]

# Keep only the rows of the original dataframe that belong to those artists
is_top_artist = df['artist'].isin(top_15_artists.index)
filtered_df = df.loc[is_top_artist]

In [None]:
# Listening totals per artist and year for the top 15 artists.
# NOTE: despite the variable name, the grouping key is Year, not month.
list_time_by_month = (
    filtered_df.groupby(["artist", "Year"])[["mins_played", "hours_played", "days_played"]]
    .sum()
    .sort_values(by="mins_played", ascending=False)
    .round(2)
    .reset_index()
)

In [None]:
# Order the rows by year before plotting
list_time_by_month = list_time_by_month.sort_values(by='Year')

# Line chart of hours played per year, one line per artist
fig = px.line(
    list_time_by_month,
    x="Year",
    y="hours_played",
    color="artist",
    title="Top 15 Artists: Listening Frequency over Years",
)

# Render the figure
fig.show()

### Distinct Count of Artists Listened to Per Year

In [None]:
# Count the distinct artists listened to in each year.
# The result has two columns: Year and number_artists.
nunique_by_year = (
    df.groupby('Year')['artist']
    .nunique()
    .reset_index(name='number_artists')
)

# Plot the line chart
fig = px.line(nunique_by_year,
              x = "Year",
              y = "number_artists",
              title = 'Number of Distinct Artists Listened to per Year')

# Show the plot
fig.show()

In [None]:
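# Ideas for further analysis:
# - distinct count of artists listened to per year (implemented in the section above)
# - TODO: location of artists? Could possibly be sourced from the Spotify API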