# Spotify Analysis
The goal of this notebook is to clean and analyse music streaming data exported from Spotify. For the steps to request and download the data, see the [stats.fm Spotify import guide](https://support.stats.fm/docs/import/spotify-import/).

In [None]:
# Import required libraries

import seaborn
import os
import json
import pandas as pd
import numpy as np
import matplotlib as mpl

## Data Import
Spotify provides the data in JSON format; however, it is often supplied as multiple files split by year. These files need to be concatenated into a single dataset.

In [None]:
def read_jsons_to_df(data_dir_path):
    """
    Read every ``.json`` file in a directory and stack them into one dataframe.

    Each file is parsed with ``pd.read_json`` and the resulting frames are
    concatenated in ascending filename order, so the row order of the result
    is reproducible between runs and platforms.
    Assumes all jsons are the same format (provided from Spotify).

    Parameters:
        data_dir_path: path to the directory with the json files

    Returns:
        dataframe with all concatenated jsons, with a fresh 0..n-1 index

    Raises:
        ValueError: if the directory contains no ``.json`` files
    """
    # os.listdir returns entries in arbitrary order; sort so that the
    # concatenated rows always appear in the same (filename) order.
    json_files = sorted(f for f in os.listdir(data_dir_path) if f.endswith('.json'))

    # Fail early with an explicit message instead of letting pd.concat raise
    # its generic "No objects to concatenate" error on an empty list.
    if not json_files:
        raise ValueError(f"No .json files found in directory: {data_dir_path!r}")

    # Parse each file into its own dataframe
    df_list = [
        pd.read_json(os.path.join(data_dir_path, filename))
        for filename in json_files
    ]

    # Concatenate all dataframes into one final dataframe
    return pd.concat(df_list, ignore_index=True)

In [None]:
# Load every json file found under data/ into a single dataframe
df = read_jsons_to_df(data_dir_path='data/')

In [None]:
# Convert the 'ts' column to pandas datetime values
ts_parsed = pd.to_datetime(df['ts'])
df['ts'] = ts_parsed

## Analysis

In [None]:
# Top played artists of all time: summed ms_played per album artist,
# ordered from most to least played
listening_time_by_artist = (
    df.groupby("master_metadata_album_artist_name")[["ms_played"]]
    .sum()
    .sort_values(by="ms_played", ascending=False)
)

listening_time_by_artist

In [None]:
# Top played albums of all time: summed ms_played per album name,
# ordered from most to least played
# NOTE(review): grouping uses the album name only, so albums that share a
# name are summed together — confirm this is intended.
listening_time_by_album = (
    df.groupby("master_metadata_album_album_name")[["ms_played"]]
    .sum()
    .sort_values(by="ms_played", ascending=False)
)

listening_time_by_album

In [None]:
# Top played songs of all time: summed ms_played per track name,
# ordered from most to least played
# NOTE(review): grouping uses the track name only, so same-named tracks by
# different artists are summed together — confirm this is intended.
listening_time_by_song = (
    df.groupby("master_metadata_track_name")[["ms_played"]]
    .sum()
    .sort_values(by="ms_played", ascending=False)
)

listening_time_by_song