<a href="https://colab.research.google.com/github/jmgang/G2Sprint3Activities/blob/master/Sen's_Copy_of_D5_Activity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Day 5 Exercise - Looking at Spotify playlists data

## Instructions


1. (5 mins) As a group, choose up to 6 music genres and obtain track data from the genre's top 20 most-followed playlists in Spotify.

2. (5 mins) Distribute the data gathering task by assigning 1-2 playlist genres to each person and then pooling all the gathered data in one shared folder.

3. (10 mins) Each group member must compare the audio features of 2 playlist genres using histograms. Identify audio features that best distinguishes each genre from the other (i.e. the feature best to use for a classification model)
4.( 15 mins) Take turns presenting this notebook with your code answer to the whole group.

In [1]:
!pip install spotipy -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/250.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m245.8/250.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.3/250.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import ast
import math
import getpass
import time
import os

from tqdm import tqdm

In [3]:
# Mount GDrive folders
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Set home directory
import os
home_dir = "/content/drive/MyDrive/Colab Notebooks/Sprint 3/"
os.chdir(home_dir)

In [5]:
# Make data/playlist/ directory
os.makedirs('data/playlists', exist_ok=True)

In [6]:
client_id = 'xxxxx'

In [7]:
client_secret = 'xxxxx'

In [8]:
client_credentials_manager = SpotifyClientCredentials(client_id=client_id,
                                                      client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [9]:
def get_playlist_ids_names(KEYWORD):
  playlist_ids = []
  playlist_names = []

  N = 100
  # Get playlist in batches of 50
  for n in np.arange(N//50):
      offset= 50*n
      print("Getting batch %d of search results for keyword: %s ..." % (n,KEYWORD), end='' )
      results = sp.search(q=KEYWORD, type='playlist' , market='PH', offset = offset, limit=50)
      playlist_ids.extend([p['href'].split('/')[5] for p in results['playlists']['items']])
      playlist_names.extend([p['name'] for p in results['playlists']['items']])
      print("  DONE!")
  return playlist_ids, playlist_names

In [10]:
def get_track_audio_features_data(track_ids):
    audio_features_data = []
    audio_features_keys = ['danceability','energy','key','loudness','mode',\
                            'speechiness','acousticness','instrumentalness','liveness',\
                            'valence','tempo','duration_ms']

    total_iterations = math.ceil(len(track_ids) / 100)

    for i in tqdm(range(0, len(track_ids), 100), total=total_iterations, desc='Fetching audio features data '):
        track_ids_chunk = track_ids[i:i + 100]
        track_audio_features = sp.audio_features(track_ids_chunk)

        for audio_feature in track_audio_features:
          audio_feature_data = dict()
          audio_feature_data['track_id'] = audio_feature['id']
          for key in audio_features_keys:
            try:
                audio_feature_data[key] = audio_feature[key]
            except:
                audio_feature_data[key] = None
          audio_features_data.append(audio_feature_data)

        time.sleep(3)
    return audio_features_data

In [11]:
def get_playlist_tracks_data(playlist_info):
  playlist_track_data = []
  for track_data in playlist_info['tracks']:
      relevant_track_data = { key: track_data['track'][key] for key in ['name','popularity','duration_ms'] }
      relevant_track_data['track_id'] = track_data['track']['id']
      relevant_track_data['artist_id'] = [artist['id'] for artist in track_data['track']['artists'] ]
      relevant_track_data['artist_name'] = [artist['name'] for artist in track_data['track']['artists'] ]
      relevant_track_data['num_artists'] = len([artist['id'] for artist in track_data['track']['artists']])
      # If single artist track, convert list to single-element
      relevant_track_data['artist_id'] = relevant_track_data['artist_id'][0] if len(relevant_track_data['artist_id'])==1 \
                                          else relevant_track_data['artist_id']
      relevant_track_data['artist_name'] = relevant_track_data['artist_name'][0] if len(relevant_track_data['artist_name'])==1 \
                                          else relevant_track_data['artist_name']
      relevant_track_data['album_id'] = track_data['track']['album']['uri'].split(":")[2]
      relevant_track_data['release_date'] = track_data['track']['album']['release_date']
      relevant_track_data['playlist_id'] = playlist_info['playlist_id']
      relevant_track_data['playlist_name'] = playlist_info['playlist_name']
      playlist_track_data.append(relevant_track_data)
  return playlist_track_data

In [32]:
# Helper function to get playlist data in dict format
def get_playlist_data(playlist_id):
    playlist_data = sp.playlist(playlist_id)
    track_data = []
    relevant_playlist_data = {'playlist_id': playlist_id}
    relevant_playlist_data['playlist_name'] = playlist_data['name']
    relevant_playlist_data['playlist_total_tracks'] = playlist_data['tracks']['total']
    relevant_playlist_data['owner_id'] = playlist_data['owner']['id']
    relevant_playlist_data['owner_name'] = playlist_data['owner']['display_name']
    relevant_playlist_data['total_followers'] = playlist_data['followers']['total']
    tracks = playlist_data['tracks']
    track_data.extend(tracks['items'])

    # Tracks might contain additional items
    while tracks['next']:
        tracks = sp.next(tracks)
        track_data.extend(tracks['items'])
        time.sleep(0.5)

    relevant_playlist_data['tracks'] = track_data

    return relevant_playlist_data

### 1. Read playlist tracks of 2 genres

#### Genre 1: EDM

In [33]:
#set keyword
KEYWORD1='EDM'

# Get playlists of searched keyword

In [34]:
playlist_ids, playlist_names = get_playlist_ids_names(KEYWORD1)

Getting batch 0 of search results for keyword: EDM ...  DONE!
Getting batch 1 of search results for keyword: EDM ...  DONE!


# Get playlist data with track information

In [35]:
playlist_data_list = []
playlist_track_information = []
progress_bar = tqdm(enumerate(playlist_ids), total=len(playlist_ids), desc='Fetching playlist data ')
for i, playlist_id in progress_bar:
    try:
        relevant_playlist_data = get_playlist_data(playlist_id)
        playlist_data_list.append(relevant_playlist_data)
        playlist_track_information.extend(get_playlist_tracks_data(relevant_playlist_data))
        time.sleep(1)
    except Exception as e:
        # Print track id and error msg
        print(f'Error requesting data for playlist id {playlist_id}: {e}')
        continue

Fetching playlist data :  26%|██▋       | 26/99 [00:52<04:15,  3.50s/it]

Error requesting data for playlist id 2PEmzRQWUweLDajgMBSIYy: 'NoneType' object has no attribute 'split'


Fetching playlist data :  76%|███████▌  | 75/99 [02:57<00:28,  1.17s/it]

Error requesting data for playlist id 37i9dQZF1DX87b1GU2LR5p: 'NoneType' object is not subscriptable


Fetching playlist data : 100%|██████████| 99/99 [03:39<00:00,  2.22s/it]


## Saving Playlist data

In [36]:
playlist_data_df = pd.DataFrame(playlist_data_list)
playlist_data_df.drop('tracks', inplace=True, axis=1)
playlist_data_df = playlist_data_df.sort_values('total_followers',ascending=False)
playlist_data_df  = playlist_data_df[playlist_data_df['playlist_name'].str.lower().str.contains(KEYWORD1.lower())]
playlist_data_df.head()

Unnamed: 0,playlist_id,playlist_name,playlist_total_tracks,owner_id,owner_name,total_followers
19,33PyRULhtc4SRrUE1wbbmp,Tomorrowland 2023 Playlist 💙 EDM HITS,146,11154757151,Fans of Tomorrowland 🎪,492270
92,71z6BdHlnfNj4DKRhuu1Fk,RAGE 💪 EDM WORKOUT MOTIVATION,94,tribaltrap,Tribal Trap,491374
11,3Di88mvYplBtkDBIzGLiiM,EDM Hits 2023,136,fineshark,Christoffer Brants,378893
5,37i9dQZF1DXafD1g5rer7q,EDM Samplings,50,spotify,Spotify,371569
2,37i9dQZF1DX3Kdv0IChEm9,EDM Hits,60,spotify,Spotify,267896


In [37]:
filename = "data/playlists/activity_"+KEYWORD1+"_playlist_data.csv"
playlist_data_df.to_csv(filename,encoding='utf=8',index=False)

## Saving Playlist tracks data

In [40]:
playlist_track_data_df = pd.DataFrame(playlist_track_information)
playlist_track_data_df = playlist_track_data_df.drop_duplicates(subset='track_id').reset_index(drop=True)
playlist_track_data_df  = playlist_track_data_df[playlist_track_data_df['playlist_name'].str.lower().str.contains(KEYWORD1.lower())]
playlist_track_data_df = playlist_track_data_df.reset_index(drop=True)
playlist_track_data_df

Unnamed: 0,name,popularity,duration_ms,track_id,artist_id,artist_name,num_artists,album_id,release_date,playlist_id,playlist_name
0,911,58,346608,6X7HtAzJHEiuvfp499kKY9,1pMFoni6A1enu9OBmaanG2,Montiego,1,2CYIPrkbakrcT3Gdy1HlWd,2023-09-14,534x9sylHYDWODbk9RSJLp,EDM & POP
1,She's in my Head,47,179986,1nHQdojhIIJmYgCIemppW9,"[0YL46Muu8yKzyKoDZ4pTSa, 5exS0bytCYdixgv02DaCm3]","[APE, William Singe]",2,7sHOKa3o4vVjaaAVY2eAJc,2023-04-21,534x9sylHYDWODbk9RSJLp,EDM & POP
2,Surrender,3,166809,1EzxRda4zTJweTgEjXdCDY,1pLIeBmatVccW3t5kppRBe,Wave Pilot,1,6GxLhHHW05gNQWIWf9ur1G,2023-10-06,534x9sylHYDWODbk9RSJLp,EDM & POP
3,Over The Moon,44,200019,3pJuM2Dxu4TB0wxh0XfcqO,"[0aMljpKFM4rWpxxRqhjCmy, 68tR0TsEKX89ID4fyBMgch]","[A/K, James Vickery]",2,2hLY4bZYdimqGFqScWpmkV,2023-05-12,534x9sylHYDWODbk9RSJLp,EDM & POP
4,Toxic - Remix,26,189150,0SGehYAzgNR7J3g3AOTuWu,"[4yFBbJUbc3Xn1KJD8C6IaG, 4dLZTad0O8DMQJTjUorTqk]","[Ricki Ayela, MNDLB5]",2,2PGYrP4zq48G1pau0NXTzB,2023-10-13,534x9sylHYDWODbk9RSJLp,EDM & POP
...,...,...,...,...,...,...,...,...,...,...,...
9967,Unity Step - Remix,17,266181,0CCe8RCQaW0S8jmC9Tkzty,"[1y2DGRvDs9Ice1z5eBOZ3w, 5jLkFPWnofjG4BQ4LBHPk...","[J.Walker of (TLD), J-Chief, Aaron P'reach, Ki...",5,6lSB96Lca21H5As1xRLFq4,2019-03-31,3lO8tnRmWIJJwElxJWyqYZ,EDM 2023
9968,CC4,49,107792,11FLscd48vGzX8l7x6Z90J,1CFCsEqKrCyvAFKOATQHiW,Lecrae,1,4GnEecDMWGwbCKsd3KTFny,2022-11-04,3lO8tnRmWIJJwElxJWyqYZ,EDM 2023
9969,Someone in church (Instrumental),0,67030,0iNeXsD1UuuXNC0OB6T0qg,3msK6zh6BViIJLO8zQ5QvR,Cadenza,1,4aI7cg5QesoKL7lgCchKaJ,2022-05-01,3lO8tnRmWIJJwElxJWyqYZ,EDM 2023
9970,The disciples (Instrumental),0,43702,2G40uystuR6GC9GLE4D55c,3msK6zh6BViIJLO8zQ5QvR,Cadenza,1,4aI7cg5QesoKL7lgCchKaJ,2022-05-01,3lO8tnRmWIJJwElxJWyqYZ,EDM 2023


In [41]:
filename = "data/playlists/activity_"+KEYWORD1+"_playlist_tracks.csv"
playlist_track_data_df.to_csv(filename,encoding='utf=8',index=False)

# Getting Audio Features of Tracks

In [42]:
track_ids = playlist_track_data_df.track_id.unique().tolist()
len(track_ids)

9972

In [43]:
track_audio_features = get_track_audio_features_data(track_ids)
len(track_audio_features)

Fetching audio features data : 100%|██████████| 100/100 [05:25<00:00,  3.25s/it]


9972

In [44]:
track_audio_features_df = pd.DataFrame(track_audio_features)
track_audio_features_df

Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6X7HtAzJHEiuvfp499kKY9,0.802,0.921,4,-7.624,0,0.0522,0.037900,0.92600,0.1110,0.0836,124.994,346608
1,1nHQdojhIIJmYgCIemppW9,0.769,0.602,10,-8.021,1,0.0708,0.340000,0.00841,0.0922,0.3740,120.089,179987
2,1EzxRda4zTJweTgEjXdCDY,0.637,0.643,7,-5.672,0,0.1310,0.004320,0.48700,0.1490,0.0902,150.074,166809
3,3pJuM2Dxu4TB0wxh0XfcqO,0.735,0.676,2,-6.338,1,0.0341,0.000831,0.01220,0.0908,0.3930,124.049,200020
4,0SGehYAzgNR7J3g3AOTuWu,0.756,0.950,8,-4.871,1,0.0456,0.080500,0.54700,0.0773,0.5420,125.028,189150
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9967,0CCe8RCQaW0S8jmC9Tkzty,0.908,0.729,0,-5.220,1,0.0592,0.072800,0.06810,0.1540,0.4950,109.999,266182
9968,11FLscd48vGzX8l7x6Z90J,0.556,0.566,3,-10.649,0,0.1310,0.483000,0.00000,0.4220,0.3590,133.186,109670
9969,0iNeXsD1UuuXNC0OB6T0qg,0.769,0.305,8,-13.168,1,0.1070,0.117000,0.82900,0.1120,0.6270,172.032,67030
9970,2G40uystuR6GC9GLE4D55c,0.574,0.476,10,-17.172,0,0.0728,0.083700,0.74800,0.0724,0.3400,87.977,43703


In [45]:
filename = "data/playlists/activity_"+KEYWORD1+"_tracks_audio_features.csv"
track_audio_features_df.to_csv(filename,encoding='utf=8',index=False)

In [46]:
overlapping_columns = [col for col in track_audio_features_df.columns if col in playlist_track_data_df.columns and col != 'track_id']
track_audio_features_df.drop(columns=overlapping_columns, inplace=True)
overall_playlist_track_data_df = track_audio_features_df.merge(playlist_track_data_df, on='track_id')
overall_playlist_track_data_df

Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,name,popularity,duration_ms,artist_id,artist_name,num_artists,album_id,release_date,playlist_id,playlist_name
0,6X7HtAzJHEiuvfp499kKY9,0.802,0.921,4,-7.624,0,0.0522,0.037900,0.92600,0.1110,...,911,58,346608,1pMFoni6A1enu9OBmaanG2,Montiego,1,2CYIPrkbakrcT3Gdy1HlWd,2023-09-14,534x9sylHYDWODbk9RSJLp,EDM & POP
1,1nHQdojhIIJmYgCIemppW9,0.769,0.602,10,-8.021,1,0.0708,0.340000,0.00841,0.0922,...,She's in my Head,47,179986,"[0YL46Muu8yKzyKoDZ4pTSa, 5exS0bytCYdixgv02DaCm3]","[APE, William Singe]",2,7sHOKa3o4vVjaaAVY2eAJc,2023-04-21,534x9sylHYDWODbk9RSJLp,EDM & POP
2,1EzxRda4zTJweTgEjXdCDY,0.637,0.643,7,-5.672,0,0.1310,0.004320,0.48700,0.1490,...,Surrender,3,166809,1pLIeBmatVccW3t5kppRBe,Wave Pilot,1,6GxLhHHW05gNQWIWf9ur1G,2023-10-06,534x9sylHYDWODbk9RSJLp,EDM & POP
3,3pJuM2Dxu4TB0wxh0XfcqO,0.735,0.676,2,-6.338,1,0.0341,0.000831,0.01220,0.0908,...,Over The Moon,44,200019,"[0aMljpKFM4rWpxxRqhjCmy, 68tR0TsEKX89ID4fyBMgch]","[A/K, James Vickery]",2,2hLY4bZYdimqGFqScWpmkV,2023-05-12,534x9sylHYDWODbk9RSJLp,EDM & POP
4,0SGehYAzgNR7J3g3AOTuWu,0.756,0.950,8,-4.871,1,0.0456,0.080500,0.54700,0.0773,...,Toxic - Remix,26,189150,"[4yFBbJUbc3Xn1KJD8C6IaG, 4dLZTad0O8DMQJTjUorTqk]","[Ricki Ayela, MNDLB5]",2,2PGYrP4zq48G1pau0NXTzB,2023-10-13,534x9sylHYDWODbk9RSJLp,EDM & POP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9967,0CCe8RCQaW0S8jmC9Tkzty,0.908,0.729,0,-5.220,1,0.0592,0.072800,0.06810,0.1540,...,Unity Step - Remix,17,266181,"[1y2DGRvDs9Ice1z5eBOZ3w, 5jLkFPWnofjG4BQ4LBHPk...","[J.Walker of (TLD), J-Chief, Aaron P'reach, Ki...",5,6lSB96Lca21H5As1xRLFq4,2019-03-31,3lO8tnRmWIJJwElxJWyqYZ,EDM 2023
9968,11FLscd48vGzX8l7x6Z90J,0.556,0.566,3,-10.649,0,0.1310,0.483000,0.00000,0.4220,...,CC4,49,107792,1CFCsEqKrCyvAFKOATQHiW,Lecrae,1,4GnEecDMWGwbCKsd3KTFny,2022-11-04,3lO8tnRmWIJJwElxJWyqYZ,EDM 2023
9969,0iNeXsD1UuuXNC0OB6T0qg,0.769,0.305,8,-13.168,1,0.1070,0.117000,0.82900,0.1120,...,Someone in church (Instrumental),0,67030,3msK6zh6BViIJLO8zQ5QvR,Cadenza,1,4aI7cg5QesoKL7lgCchKaJ,2022-05-01,3lO8tnRmWIJJwElxJWyqYZ,EDM 2023
9970,2G40uystuR6GC9GLE4D55c,0.574,0.476,10,-17.172,0,0.0728,0.083700,0.74800,0.0724,...,The disciples (Instrumental),0,43702,3msK6zh6BViIJLO8zQ5QvR,Cadenza,1,4aI7cg5QesoKL7lgCchKaJ,2022-05-01,3lO8tnRmWIJJwElxJWyqYZ,EDM 2023


In [47]:

overall_playlist_track_data_df['duration_mins'] = overall_playlist_track_data_df['duration_ms'] / 60000
#tag genre with keyword
overall_playlist_track_data_df['genre'] = KEYWORD1
overall_playlist_track_data_df.head()

Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,duration_ms,artist_id,artist_name,num_artists,album_id,release_date,playlist_id,playlist_name,duration_mins,genre
0,6X7HtAzJHEiuvfp499kKY9,0.802,0.921,4,-7.624,0,0.0522,0.0379,0.926,0.111,...,346608,1pMFoni6A1enu9OBmaanG2,Montiego,1,2CYIPrkbakrcT3Gdy1HlWd,2023-09-14,534x9sylHYDWODbk9RSJLp,EDM & POP,5.7768,EDM
1,1nHQdojhIIJmYgCIemppW9,0.769,0.602,10,-8.021,1,0.0708,0.34,0.00841,0.0922,...,179986,"[0YL46Muu8yKzyKoDZ4pTSa, 5exS0bytCYdixgv02DaCm3]","[APE, William Singe]",2,7sHOKa3o4vVjaaAVY2eAJc,2023-04-21,534x9sylHYDWODbk9RSJLp,EDM & POP,2.999767,EDM
2,1EzxRda4zTJweTgEjXdCDY,0.637,0.643,7,-5.672,0,0.131,0.00432,0.487,0.149,...,166809,1pLIeBmatVccW3t5kppRBe,Wave Pilot,1,6GxLhHHW05gNQWIWf9ur1G,2023-10-06,534x9sylHYDWODbk9RSJLp,EDM & POP,2.78015,EDM
3,3pJuM2Dxu4TB0wxh0XfcqO,0.735,0.676,2,-6.338,1,0.0341,0.000831,0.0122,0.0908,...,200019,"[0aMljpKFM4rWpxxRqhjCmy, 68tR0TsEKX89ID4fyBMgch]","[A/K, James Vickery]",2,2hLY4bZYdimqGFqScWpmkV,2023-05-12,534x9sylHYDWODbk9RSJLp,EDM & POP,3.33365,EDM
4,0SGehYAzgNR7J3g3AOTuWu,0.756,0.95,8,-4.871,1,0.0456,0.0805,0.547,0.0773,...,189150,"[4yFBbJUbc3Xn1KJD8C6IaG, 4dLZTad0O8DMQJTjUorTqk]","[Ricki Ayela, MNDLB5]",2,2PGYrP4zq48G1pau0NXTzB,2023-10-13,534x9sylHYDWODbk9RSJLp,EDM & POP,3.1525,EDM


In [48]:
overall_playlist_track_data_df.to_csv("data/playlists/"+KEYWORD1+"_playlist_tracks_data.csv", index=False, encoding='utf-8')

## 2. Check the playlist tracks' histogram of audio features

In [None]:
# Plot histogram comparing audio features for both playlist genres
# Write code here


>Q: What audio feature/s best defines the genre you picked? Does it make sense to use these as a feature for a classification model?

Answer here