In [1]:
# Imports
import pandas as pd
from pathlib import Path
from collections import Counter


In [2]:
# Folder that is passed to gather_mydata() and searched for Spotify "StreamingHistory" .json files.
# NOTE(review): this is an absolute, machine-specific Windows path - point it at your own
# Spotify "MyData" export folder before running the notebook on another machine.
SPOTIFY_DATA_FOLDER_DIR = r"C:\Users\JM070903\OneDrive - Jacobs\Documents\Python\Spotify Listening Analysis\Spotify_Analysis_2.0\1. Data\MyData"

In [82]:
# Track names to filter out of the listening history.
# Most of these are white-noise tracks listened to on repeat, so they need to be removed from the df.
tracks_to_remove = [
    "Binaural Beta Sinus Drone II",
    "Binaural Alpha Sinus 100 Hz - 108 Hz",
    "Cabin Sound",
    "Unknown Track",
]
# Artist names to filter out of the listening history: podcasts and unknown artists.
artists_to_remove = [
    "Unknown Artist",
    "Stuff You Should Know",
    "Freakonomics Radio",
    "World War One",
    "The History of WWII Podcast - by Ray Harris Jr",
]

In [83]:
def gather_mydata(folder_dir):
    """
    Returns consolidated dataframe of listening history from Spotify .json files

    Only files directly inside folder_dir whose name ends in ".json" and contains
    "StreamingHistory" are read. Files are read in sorted filename order so that the
    row order of the result is reproducible (glob order depends on the filesystem).
    Note that the sort is lexicographic, so "StreamingHistory10.json" is read before
    "StreamingHistory2.json"; sort on "endTime" downstream if chronological order matters.

    Parameters:
    folder_dir (string or Path): folder directory location of .json Spotify files

    Returns:
    spotify_data_df (pandas dataframe): Dataframe object containing combined and consolidated
        listening history, one row per stream, with the record keys as column names

    Raises:
    ValueError: if folder_dir contains no StreamingHistory .json files
    """

    folder = Path(folder_dir)  # converting to pathlib object for OS-agnostic manipulation

    # Keep streaming-history exports only; other .json files in the export are ignored
    streaming_files = sorted(
        (json_file for json_file in folder.glob("*.json") if "StreamingHistory" in json_file.name),
        key=lambda json_file: json_file.name,
    )
    if not streaming_files:
        # Fail with an actionable message instead of pd.concat's "No objects to concatenate"
        raise ValueError(f"No StreamingHistory .json files found in {folder}")

    # Each file holds a list of stream records -> one Series of dicts per file
    streams_list = [
        pd.read_json(json_filepath, typ="series", encoding="utf8")
        for json_filepath in streaming_files
    ]

    spotify_data_df = pd.concat(streams_list, ignore_index=True, sort=False)
    spotify_data_df = pd.json_normalize(spotify_data_df)  # converts the dict keys to column names

    return spotify_data_df

In [113]:
class CleanDataFrame:
    """
    Chainable cleaner for a Spotify streaming-history dataframe.

    Every cleaning method replaces self.dataframe with a new, cleaned dataframe and
    returns self so calls can be chained; call return_df() at the end of the chain to
    get the dataframe back. The dataframe passed to the constructor is never modified.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def _drop_matching(self, column, values):
        """
        Drops every row whose `column` contains any of `values` as a literal substring.

        Matching is literal (regex=False) so characters such as "(", "." or "+" in a
        name are not interpreted as a regular expression. Rows with a missing value in
        `column` never match (na=False) and are kept.
        """
        column_values = self.dataframe[column]
        keep = pd.Series(True, index=column_values.index)
        for value in values:
            keep &= ~column_values.str.contains(value, regex=False, na=False)
        self.dataframe = self.dataframe[keep]
        return self

    def remove_track(self, tracknames):
        """
        Removes any tracks passed to argument

        A row is removed when its "trackName" contains one of the given names as a
        literal, case-sensitive substring.

        Parameters:
        tracknames (list): list of tracks to remove

        Returns:
        self: CleanDataFrame object whose dataframe has the tracks removed

        """
        return self._drop_matching("trackName", tracknames)

    def remove_artist(self, artists):
        """
        Removes any artists passed to argument

        A row is removed when its "artistName" contains one of the given names as a
        literal, case-sensitive substring.

        Parameters:
        artists (list): list of artists to remove

        Returns:
        self: CleanDataFrame object whose dataframe has the artists removed

        """
        return self._drop_matching("artistName", artists)

    def convert_datetime(self):
        """
        Converts all time based df columns to more convenient types

        "endTime" is parsed into datetime objects (expected format "%Y-%m-%d %H:%M") and
        kept, and "msPlayed" is replaced by a "minutesPlayed" column (msPlayed / 60000).

        Returns:
        self: CleanDataFrame object whose dataframe has the converted columns
        """
        # assign() builds a new frame, so neither the caller's dataframe nor a filtered
        # slice is mutated in place (avoids SettingWithCopyWarning).
        self.dataframe = self.dataframe.assign(
            endTime=pd.to_datetime(self.dataframe["endTime"], format="%Y-%m-%d %H:%M"),
            minutesPlayed=self.dataframe["msPlayed"].divide(60000),
        ).drop(columns=["msPlayed"])  # only the raw milliseconds are superseded; endTime is kept

        return self

    def return_df(self):  # ToDo - shouldn't need this method. The dataframe should be returned when any of the above methods are called.
        """ Returns the dataframe object, rather than a CleanDataFrame object. """
        return self.dataframe


In [114]:
# Load every StreamingHistory .json file from the data folder into one dataframe
streams_df = gather_mydata(SPOTIFY_DATA_FOLDER_DIR)

In [115]:
# Wrap the raw streams, filter out the unwanted tracks and artists, then unwrap the dataframe
clean_data = CleanDataFrame(streams_df)
streams_df = (
    clean_data
    .remove_track(tracks_to_remove)
    .remove_artist(artists_to_remove)
    .return_df()
)

Unnamed: 0,endTime,artistName,trackName,msPlayed
2,2019-09-16 07:09,The Growlers,California,3606
3,2019-09-16 07:09,Richard Thompson,1952 Vincent Black Lightning,284
5,2019-09-16 07:10,The Growlers,California,71453
6,2019-09-16 07:16,Dire Straits,Skateaway,398760
7,2019-09-16 07:21,Doc Robinson,Break My Fall,288093
...,...,...,...,...
30716,2020-09-16 21:57,Floating,Gamma Colours,203168
30717,2020-09-16 21:59,Floating,Gamma Colours,79032
30752,2020-09-16 23:57,Fazerdaze,Little Uneasy,116175
30754,2020-09-16 23:58,Slowdive,Star Roving,43682
