In [2]:
# imports
from dotenv import load_dotenv
import os
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from IPython.display import display
import numpy as np

In [3]:
# load env variables (load_dotenv comes from python-dotenv; presumably reads a local .env file -- confirm location)
load_dotenv()

# Spotify API credentials, looked up in the process environment; each is None when the variable is unset
client_id = os.environ.get('SPOTIPY_CLIENT_ID')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET')

In [4]:
# Shared spotipy client, authenticated with the client-credentials values read above.
# Every helper below talks to the Spotify Web API through this `sp` object.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret,
    )
)

In [5]:
# Set pandas to display longer content without truncation
# pd.set_option('display.max_colwidth', None)  # No limit on column width
# pd.set_option('display.max_rows', None)      # No limit on rows displayed

In [6]:
# format a millisecond duration as text
def format_duration(duration_ms):
    """Render a duration given in milliseconds as ``H:MM:SS``.

    Hours are not zero-padded and are not capped at 24; minutes and seconds
    are always two digits. Fractions of a second are dropped.

    Args:
        duration_ms: Duration in milliseconds. Any value accepted by ``int()``
            works (int, float, numpy integer/float).

    Returns:
        str: The formatted duration, e.g. ``"1:02:05"``.
    """
    # Coerce to a plain int first: with a float input the original arithmetic
    # produced float parts, and the ``:02`` format spec rendered them as "3.0".
    total_seconds = int(duration_ms) // 1000
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours}:{minutes:02}:{seconds:02}"

In [7]:
def get_tracks_for_album(album_id):
    """Return every track object of an album, following result pagination.

    Args:
        album_id: Album identifier passed straight to ``sp.album_tracks``.

    Returns:
        list: The ``items`` of every result page, in page order.
    """
    # album_tracks returns a single page of results; returning only that page
    # silently truncated long albums (and under-counted their total duration).
    page = sp.album_tracks(album_id, limit=50)
    tracks = list(page['items'])

    # Keep requesting pages for as long as the current page links to another one.
    while page.get('next'):
        page = sp.next(page)
        if not page:
            break
        tracks.extend(page['items'])

    return tracks

In [8]:
def get_song_features(track_id):
    """Look up the audio features of one track.

    Returns the first element of whatever ``sp.audio_features`` hands back,
    or ``None`` when that result is empty/falsy.
    """
    response = sp.audio_features(track_id)
    if not response:
        return None
    return response[0]

In [9]:
# returns a list of feature dicts; entries are NOT positionally aligned with track_ids
def get_all_audio_features(track_ids, batch_size=100):
    """Collect the audio features for many tracks using batched requests.

    Args:
        track_ids: Iterable of track identifiers.
        batch_size: Number of ids sent per ``sp.audio_features`` call. The
            default of 100 is the documented per-request maximum of the
            Spotify audio-features endpoint.

    Returns:
        list: One features dict per track that had features, in request order.
        Tracks for which the API returned nothing are skipped, so the result
        can be shorter than ``track_ids``.
        NOTE(review): each dict presumably carries the track's own ``id`` key,
        which would let callers map features back to tracks -- confirm.
    """
    ids = list(track_ids)
    features_list = []

    # One request per batch instead of one request per track: same result,
    # far fewer API round-trips. An empty input makes no request at all.
    for start in range(0, len(ids), batch_size):
        batch = sp.audio_features(ids[start:start + batch_size]) or []
        features_list.extend(features for features in batch if features)

    return features_list

In [10]:
# function to calculate the mean of the requested audio features (median/mode are not computed here)
def calculate_average_features(audio_features, feature_list):
    """Average selected audio features across a collection of tracks.

    Args:
        audio_features: List of per-track feature dicts.
        feature_list: Names of the features (columns) to average.

    Returns:
        dict | None: ``{"<feature>_avg": mean}`` for every name in
        ``feature_list``, or ``None`` when ``audio_features`` is empty/falsy.
    """
    if audio_features:
        # one row per track, one column per feature key
        per_track = pd.DataFrame(audio_features)

        # column-wise mean of just the requested features, as plain Python values
        column_means = per_track[feature_list].mean().to_dict()

        # tag each key with "_avg" so it cannot clash with other album fields
        return {f"{name}_avg": value for name, value in column_means.items()}

    return None


In [11]:
# function that takes in album and artist, and outputs the album info
def get_album_info(album_title, artist_name):
    """Search Spotify for an album and collect its summary information.

    Args:
        album_title: Album name used in the ``album:`` search filter.
        artist_name: Artist name used in the ``artist:`` search filter.

    Returns:
        dict | None: ``None`` when the search returns no album. Otherwise a
        dict with the keys ``album_title``, ``album_id``, ``artist_name``,
        ``release_date``, ``total_tracks``, ``duration_ms``, ``duration``,
        ``genres``, ``popularity`` and ``uri``.
    """
    query = f'album:{album_title} artist:{artist_name}'  # spotify api syntax for this search query
    result = sp.search(q=query, type='album', limit=1)  # at most one album comes back

    matches = result['albums']['items']
    if not matches:
        return None

    # first (and only) search hit
    album = matches[0]
    album_id = album['id']

    # detailed album record: this is where popularity and genres are read from
    album_details = sp.album(album_id)

    # album duration = sum of its track durations
    tracks = get_tracks_for_album(album_id)
    total_duration_ms = sum(track['duration_ms'] for track in tracks)

    return {
        'album_title': album['name'],
        'album_id': album_id,
        # just 1 artist for now; could become a list if there are multiple primary artists
        'artist_name': album['artists'][0]['name'],
        'release_date': album['release_date'],
        'total_tracks': album['total_tracks'],
        'duration_ms': total_duration_ms,
        'duration': format_duration(total_duration_ms),
        # Read genres from the detailed record. The original looked them up on
        # the search hit (while fetching album_details "for genre and
        # popularity"), so the value came out as [] whenever the hit had no
        # such key. Still defaults to [] when none are listed.
        'genres': album_details.get('genres', []),
        'popularity': album_details['popularity'],
        'uri': album['uri'],
    }

In [12]:
def read_csv(file_path):
    """Load the CSV at ``file_path`` and hand back the resulting pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

In [15]:
# Main function
def main(csv_file_path='../data/albums.csv', output_dir='../data'):
    """Build per-album data for every album in a CSV and export it as JSON.

    For each row of the input CSV (columns ``album_title`` and
    ``artist_name``) the album is looked up, its tracks' audio features are
    fetched and averaged, and the result is collected into one DataFrame. That
    frame is displayed, and, when it is not empty, written together with a
    one-row summary to ``album_data.json`` and ``summary_analysis.json``
    inside ``output_dir``.

    Args:
        csv_file_path: Path of the input CSV, resolved against the current
            working directory. Defaults to the original hard-coded location.
        output_dir: Directory that receives the two JSON files. Defaults to
            the original hard-coded location.

    Returns:
        None
    """
    # read csv into pandas dataframe
    albums_df = read_csv(csv_file_path)

    # Features averaged per album (hoisted: the list never changes between albums).
    # later, keep track of mode for key and time signature? or need specifics # of a
    # certain key/time sig? will compare at the end across all albums accumulated
    features_to_average = [
        'tempo', 'valence', 'danceability', 'energy',
        'acousticness', 'liveness', 'loudness', 'speechiness',
    ]

    # go through each row of the df (each album) and get the data
    album_data_list = []
    for _, row in albums_df.iterrows():
        album_data = get_album_info(row['album_title'], row['artist_name'])
        if not album_data:
            # album not found by the search: nothing to record for this row
            continue

        # individual track data for the album
        track_ids = [track['id'] for track in get_tracks_for_album(album_data['album_id'])]
        audio_features = get_all_audio_features(track_ids)
        album_data['audio_features'] = audio_features  # store raw audio features with the album

        # calculate avg audio features (e.g. danceability, energy)
        avg_features = calculate_average_features(audio_features, features_to_average)

        # calculate_average_features returns None when there were no audio
        # features; dict.update(None) raises TypeError, so only merge real results.
        if avg_features:
            album_data.update(avg_features)

        # keep in mind the avg of avgs across other albums isnt the same mathematically,
        # if track-level stats (like the mode of the keys, noted above) are wanted later

        album_data_list.append(album_data)

    # convert this album data list to a new df with all the info it pulled
    album_data_df = pd.DataFrame(album_data_list)
    display(album_data_df)

    if album_data_df.empty:
        # Nothing was collected, so nothing is exported -- say so instead of
        # printing the success message.
        print('No album data available.')
        return

    # overall totals / averages across every collected album
    summary_data = {
        'sum_duration': format_duration(album_data_df['duration_ms'].sum()),
        'average_duration': int(album_data_df['duration_ms'].mean()),  # milliseconds
        'sum_tracks': album_data_df['total_tracks'].sum(),
        'average_tracks': album_data_df['total_tracks'].mean(),
        'album_count': album_data_df.shape[0],
    }
    summary_df = pd.DataFrame([summary_data])  # create df from summary data

    # export dfs to JSON
    # orient='records' writes one JSON object per row and leaves the index out.
    # The previous index=False argument is rejected by to_json (ValueError) when
    # combined with the default orient, so the export never succeeded.
    album_data_df.to_json(os.path.join(output_dir, 'album_data.json'), orient='records')
    summary_df.to_json(os.path.join(output_dir, 'summary_analysis.json'), orient='records')

    print("Album data and summary analysis exported successfully.")

In [16]:
# run main() with its defaults: reads the albums CSV, displays the collected album data and writes the JSON exports
main()

FileNotFoundError: [Errno 2] No such file or directory: 'data/albums.csv'