<a href="https://colab.research.google.com/github/jmgang/SpoTwoFy-project-notebooks/blob/main/notebooks/1_SpoTwoFy_Playlist_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
!pip install spotipy -q

In [36]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import ast
import math
import getpass
import time
import os

from tqdm import tqdm

In [37]:
# Mount GDrive folders
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
# Set home directory
import os
home_dir = "/content/drive/MyDrive/Colab Notebooks/Sprint 3/"
os.chdir(home_dir)

In [39]:
# Make data/playlist/ directory
os.makedirs('data/playlists', exist_ok=True)

In [40]:
client_id = 'ecf55d3b31964b53af6841e39dfebcb2'

In [41]:
client_secret = '73eb4ae7d6774f0e8d67ec2da2d9e5a8'

In [42]:
client_credentials_manager = SpotifyClientCredentials(client_id=client_id,
                                                      client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [43]:
def get_playlist_ids_names(KEYWORD):
  playlist_ids = []
  playlist_names = []

  N = 100
  # Get playlist in batches of 50
  for n in np.arange(N//50):
      offset= 50*n
      print("Getting batch %d of search results for keyword: %s ..." % (n,KEYWORD), end='' )
      results = sp.search(q=KEYWORD, type='playlist' , market='PH', offset = offset, limit=50)
      playlist_ids.extend([p['href'].split('/')[5] for p in results['playlists']['items']])
      playlist_names.extend([p['name'] for p in results['playlists']['items']])
      print("  DONE!")
  return playlist_ids, playlist_names

In [44]:
def get_track_audio_features_data(track_ids):
    audio_features_data = []
    audio_features_keys = ['danceability','energy','key','loudness','mode',\
                            'speechiness','acousticness','instrumentalness','liveness',\
                            'valence','tempo','duration_ms']

    total_iterations = math.ceil(len(track_ids) / 100)

    for i in tqdm(range(0, len(track_ids), 100), total=total_iterations, desc='Fetching audio features data '):
        track_ids_chunk = track_ids[i:i + 100]
        track_audio_features = sp.audio_features(track_ids_chunk)

        for audio_feature in track_audio_features:
          if audio_feature is not None:
            audio_feature_data = dict()
            audio_feature_data['track_id'] = audio_feature['id'] if 'id' in audio_feature.keys() else 'none'
            for key in audio_features_keys:
              try:
                  audio_feature_data[key] = audio_feature[key]
              except:
                  audio_feature_data[key] = None
            audio_features_data.append(audio_feature_data)

        time.sleep(3)
    return audio_features_data

In [45]:
def get_playlist_tracks_data(playlist_info):
  playlist_track_data = []
  if 'tracks' not in playlist_info or playlist_info['tracks'] is None:
        print(f"No tracks data for playlist id: {playlist_info.get('playlist_id', 'Unknown')}")
        return []
  for track_data in playlist_info['tracks']:
      if 'track' not in track_data or track_data['track'] is None:
        continue
      relevant_track_data = { key: track_data['track'][key] for key in ['name','popularity','duration_ms'] }
      relevant_track_data['track_id'] = track_data['track']['id']
      relevant_track_data['artist_id'] = [artist['id'] for artist in track_data['track']['artists'] ]
      relevant_track_data['artist_name'] = [artist['name'] for artist in track_data['track']['artists'] ]
      relevant_track_data['num_artists'] = len([artist['id'] for artist in track_data['track']['artists']])
      # If single artist track, convert list to single-element
      relevant_track_data['artist_id'] = relevant_track_data['artist_id'][0] if len(relevant_track_data['artist_id'])==1 \
                                          else relevant_track_data['artist_id']
      relevant_track_data['artist_name'] = relevant_track_data['artist_name'][0] if len(relevant_track_data['artist_name'])==1 \
                                          else relevant_track_data['artist_name']
      relevant_track_data['album_id'] = track_data['track']['album']['uri'].split(":")[2]
      relevant_track_data['release_date'] = track_data['track']['album']['release_date']
      relevant_track_data['playlist_id'] = playlist_info['playlist_id']
      relevant_track_data['playlist_name'] = playlist_info['playlist_name']
      playlist_track_data.append(relevant_track_data)
  return playlist_track_data

In [46]:
# Helper function to get playlist data in dict format
def get_playlist_data(playlist_id):
    playlist_data = sp.playlist(playlist_id)
    if playlist_data is None:
        print(f"Failed to fetch data for playlist id: {playlist_id}")
        return None
    track_data = []
    relevant_playlist_data = {'playlist_id': playlist_id}
    relevant_playlist_data['playlist_name'] = playlist_data['name']
    relevant_playlist_data['playlist_total_tracks'] = playlist_data['tracks']['total']
    relevant_playlist_data['owner_id'] = playlist_data['owner']['id']
    relevant_playlist_data['owner_name'] = playlist_data['owner']['display_name']
    relevant_playlist_data['total_followers'] = playlist_data['followers']['total']
    tracks = playlist_data['tracks']
    track_data.extend(tracks['items'])

    # Tracks might contain additional items
    while tracks['next']:
        tracks = sp.next(tracks)
        track_data.extend(tracks['items'])
        time.sleep(0.5)

    relevant_playlist_data['tracks'] = track_data

    return relevant_playlist_data

### 1. Read playlist tracks of 2 genres

#### Genre 1: EDM

In [47]:
#set keyword
KEYWORD1='pinoy'

# Get playlists of searched keyword

In [48]:
playlist_ids, playlist_names = get_playlist_ids_names(KEYWORD1)

Getting batch 0 of search results for keyword: pinoy ...  DONE!
Getting batch 1 of search results for keyword: pinoy ...  DONE!


# Get playlist data with track information

In [49]:
playlist_data_list = []
playlist_track_information = []
progress_bar = tqdm(enumerate(playlist_ids), total=len(playlist_ids), desc='Fetching playlist data ')
for i, playlist_id in progress_bar:
    try:
        relevant_playlist_data = get_playlist_data(playlist_id)
        if relevant_playlist_data:
          playlist_data_list.append(relevant_playlist_data)
          playlist_track_information.extend(get_playlist_tracks_data(relevant_playlist_data))
        time.sleep(1)
    except Exception as e:
        # Print track id and error msg
        print(f'Error requesting data for playlist id {playlist_id}: {e}')
        continue

Fetching playlist data :  33%|███▎      | 33/100 [00:59<01:37,  1.45s/it]

Error requesting data for playlist id 2neVdvh8glAB6e1NT6x4Bw: 'NoneType' object has no attribute 'split'


Fetching playlist data :  88%|████████▊ | 88/100 [02:33<00:18,  1.52s/it]

Error requesting data for playlist id 0i1eTZZaaY7f2jQcEsyVJM: 'NoneType' object has no attribute 'split'


Fetching playlist data : 100%|██████████| 100/100 [02:56<00:00,  1.77s/it]


## Saving Playlist data

In [50]:
playlist_data_df = pd.DataFrame(playlist_data_list)
playlist_data_df.drop('tracks', inplace=True, axis=1)
playlist_data_df = playlist_data_df.sort_values('total_followers',ascending=False)
playlist_data_df  = playlist_data_df[playlist_data_df['playlist_name'].str.lower().str.contains(KEYWORD1.lower())]
playlist_data_df.head()

Unnamed: 0,playlist_id,playlist_name,playlist_total_tracks,owner_id,owner_name,total_followers
43,37i9dQZF1DX0iFfuXuP4Pm,Tatak Pinoy,51,spotify,Spotify,1853001
0,37i9dQZF1DXcfJb5A3855D,Pinoy Open Mic,75,spotify,Spotify,724961
14,37i9dQZF1DX6oLdwrroY56,Pinoy Indie Mix,50,spotify,Spotify,302300
4,37i9dQZF1DX7JQSRbPlmZT,Pinoy Covers,50,spotify,Spotify,298456
48,37i9dQZF1DWU0rV6Im72pA,Paskong Pinoy,117,spotify,Spotify,279786


In [51]:
filename = "data/playlists/project_playlists/"+KEYWORD1+"_playlist_data.csv"
playlist_data_df.to_csv(filename,encoding='utf=8',index=False)

## Saving Playlist tracks data

In [52]:
playlist_track_data_df = pd.DataFrame(playlist_track_information)
playlist_track_data_df = playlist_track_data_df.drop_duplicates(subset='track_id').reset_index(drop=True)
playlist_track_data_df  = playlist_track_data_df[playlist_track_data_df['playlist_name'].str.lower().str.contains(KEYWORD1.lower())]
playlist_track_data_df = playlist_track_data_df.reset_index(drop=True)
playlist_track_data_df

Unnamed: 0,name,popularity,duration_ms,track_id,artist_id,artist_name,num_artists,album_id,release_date,playlist_id,playlist_name
0,As It Was,95,167303,4Dvkj6JhhA12EX05fT7y2e,6KImCVD70vtIoJWnq6nGn3,Harry Styles,1,5r36AJ6VOJtp00oxSkBZ5h,2022-05-20,37i9dQZF1DXcfJb5A3855D,Pinoy Open Mic
1,Starboy,94,230453,7MXVkk9YMctZqd1Srtv4MB,"[1Xyo4u8uXC1ZmMpatF05PJ, 4tZwfgrHOc3mvqYlEYSvVi]","[The Weeknd, Daft Punk]",2,2ODvWsOgouMbaA5xf0RkJe,2016-11-25,37i9dQZF1DXcfJb5A3855D,Pinoy Open Mic
2,Thinking out Loud,67,281560,1Slwb6dOYkBlWal1PGtnNg,6eUKZXaKkcviH0Ku9w2n3V,Ed Sheeran,1,6NoBzYmh5gUusGPCfg0pct,2013,37i9dQZF1DXcfJb5A3855D,Pinoy Open Mic
3,Sweater Weather,93,240400,2QjOHCTQ1Jl3zawyYOpxh6,77SW9BnxLY8rJ0RciFqkHh,The Neighbourhood,1,4xkM0BwLM9H2IUcbYzpcBI,2013-04-19,37i9dQZF1DXcfJb5A3855D,Pinoy Open Mic
4,Another Love,92,244360,3JvKfv6T31zO0ini8iNItO,2txHhyCwHjUEpJjWrEyqyX,Tom Odell,1,0Gf1yE895FKK4YWVRuAeg8,2013-06-24,37i9dQZF1DXcfJb5A3855D,Pinoy Open Mic
...,...,...,...,...,...,...,...,...,...,...,...
5022,We Are In Love,30,214090,42pcW88G60C6wLNDlMMXGz,1m1WrvaorUQVETZMdRCKt9,Wred Esguerra,1,4OXkApenmz6bSIbouhFngx,2022-06-10,6rqwSnpjgL7CoTYi0uZj8V,Pinoy Tambayan Songs
5023,Kumusta Ka,0,181867,56sRPwcfFXDxtCwvCDgkEc,5yVM85m9yNcHO6o0vaaVxW,Daniel Padilla,1,3f2mb0nkdNKLDfDnvvTqAc,2013-01-01,6rqwSnpjgL7CoTYi0uZj8V,Pinoy Tambayan Songs
5024,Panalangin,8,207337,1d8Gt5Ruvis73pDIqsbl9H,5yVM85m9yNcHO6o0vaaVxW,Daniel Padilla,1,3f2mb0nkdNKLDfDnvvTqAc,2013-01-01,6rqwSnpjgL7CoTYi0uZj8V,Pinoy Tambayan Songs
5025,Director's Cut,51,290080,2LeO0FnzYLxaDYZqBK9FJN,4GBFKKuwmZUnAJt6nBal7A,Kamikazee,1,0tIuSD3ZKUjrTJ2g052fMl,2006-01-01,6rqwSnpjgL7CoTYi0uZj8V,Pinoy Tambayan Songs


In [53]:
filename = "data/playlists/project_playlists/"+KEYWORD1+"_playlist_tracks.csv"
playlist_track_data_df.to_csv(filename,encoding='utf=8',index=False)

# Getting Audio Features of Tracks

In [54]:
track_ids = playlist_track_data_df.track_id.unique().tolist()
len(track_ids)

5027

In [55]:
track_audio_features = get_track_audio_features_data(track_ids)
len(track_audio_features)

Fetching audio features data : 100%|██████████| 51/51 [02:46<00:00,  3.27s/it]


5027

In [56]:
track_audio_features_df = pd.DataFrame(track_audio_features)
track_audio_features_df

Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,4Dvkj6JhhA12EX05fT7y2e,0.520,0.731,6,-5.338,0,0.0557,0.3420,0.001010,0.3110,0.662,173.930,167303
1,7MXVkk9YMctZqd1Srtv4MB,0.679,0.587,7,-7.015,1,0.2760,0.1410,0.000006,0.1370,0.486,186.003,230453
2,1Slwb6dOYkBlWal1PGtnNg,0.781,0.445,2,-6.061,1,0.0295,0.4740,0.000000,0.1840,0.591,78.998,281560
3,2QjOHCTQ1Jl3zawyYOpxh6,0.612,0.807,10,-2.810,1,0.0336,0.0495,0.017700,0.1010,0.398,124.053,240400
4,3JvKfv6T31zO0ini8iNItO,0.445,0.537,4,-8.532,0,0.0400,0.6950,0.000017,0.0944,0.131,122.769,244360
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5022,42pcW88G60C6wLNDlMMXGz,0.605,0.776,5,-7.033,1,0.0714,0.0883,0.000019,0.1240,0.585,109.912,214091
5023,56sRPwcfFXDxtCwvCDgkEc,0.618,0.698,2,-6.925,1,0.0343,0.0380,0.000004,0.3840,0.628,125.027,181868
5024,1d8Gt5Ruvis73pDIqsbl9H,0.767,0.660,10,-5.940,1,0.0309,0.5580,0.000000,0.1190,0.961,119.928,207337
5025,2LeO0FnzYLxaDYZqBK9FJN,0.316,0.942,4,-3.496,1,0.1120,0.0206,0.000000,0.7970,0.518,164.343,290080


In [57]:
filename = "data/playlists/project_playlists/"+KEYWORD1+"_tracks_audio_features.csv"
track_audio_features_df.to_csv(filename,encoding='utf=8',index=False)

In [58]:
overlapping_columns = [col for col in track_audio_features_df.columns if col in playlist_track_data_df.columns and col != 'track_id']
track_audio_features_df.drop(columns=overlapping_columns, inplace=True)
overall_playlist_track_data_df = track_audio_features_df.merge(playlist_track_data_df, on='track_id')
overall_playlist_track_data_df.rename(columns={'name' : 'track_name'}, inplace=True)
overall_playlist_track_data_df

Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,track_name,popularity,duration_ms,artist_id,artist_name,num_artists,album_id,release_date,playlist_id,playlist_name
0,4Dvkj6JhhA12EX05fT7y2e,0.520,0.731,6,-5.338,0,0.0557,0.3420,0.001010,0.3110,...,As It Was,95,167303,6KImCVD70vtIoJWnq6nGn3,Harry Styles,1,5r36AJ6VOJtp00oxSkBZ5h,2022-05-20,37i9dQZF1DXcfJb5A3855D,Pinoy Open Mic
1,7MXVkk9YMctZqd1Srtv4MB,0.679,0.587,7,-7.015,1,0.2760,0.1410,0.000006,0.1370,...,Starboy,94,230453,"[1Xyo4u8uXC1ZmMpatF05PJ, 4tZwfgrHOc3mvqYlEYSvVi]","[The Weeknd, Daft Punk]",2,2ODvWsOgouMbaA5xf0RkJe,2016-11-25,37i9dQZF1DXcfJb5A3855D,Pinoy Open Mic
2,1Slwb6dOYkBlWal1PGtnNg,0.781,0.445,2,-6.061,1,0.0295,0.4740,0.000000,0.1840,...,Thinking out Loud,67,281560,6eUKZXaKkcviH0Ku9w2n3V,Ed Sheeran,1,6NoBzYmh5gUusGPCfg0pct,2013,37i9dQZF1DXcfJb5A3855D,Pinoy Open Mic
3,2QjOHCTQ1Jl3zawyYOpxh6,0.612,0.807,10,-2.810,1,0.0336,0.0495,0.017700,0.1010,...,Sweater Weather,93,240400,77SW9BnxLY8rJ0RciFqkHh,The Neighbourhood,1,4xkM0BwLM9H2IUcbYzpcBI,2013-04-19,37i9dQZF1DXcfJb5A3855D,Pinoy Open Mic
4,3JvKfv6T31zO0ini8iNItO,0.445,0.537,4,-8.532,0,0.0400,0.6950,0.000017,0.0944,...,Another Love,92,244360,2txHhyCwHjUEpJjWrEyqyX,Tom Odell,1,0Gf1yE895FKK4YWVRuAeg8,2013-06-24,37i9dQZF1DXcfJb5A3855D,Pinoy Open Mic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5022,42pcW88G60C6wLNDlMMXGz,0.605,0.776,5,-7.033,1,0.0714,0.0883,0.000019,0.1240,...,We Are In Love,30,214090,1m1WrvaorUQVETZMdRCKt9,Wred Esguerra,1,4OXkApenmz6bSIbouhFngx,2022-06-10,6rqwSnpjgL7CoTYi0uZj8V,Pinoy Tambayan Songs
5023,56sRPwcfFXDxtCwvCDgkEc,0.618,0.698,2,-6.925,1,0.0343,0.0380,0.000004,0.3840,...,Kumusta Ka,0,181867,5yVM85m9yNcHO6o0vaaVxW,Daniel Padilla,1,3f2mb0nkdNKLDfDnvvTqAc,2013-01-01,6rqwSnpjgL7CoTYi0uZj8V,Pinoy Tambayan Songs
5024,1d8Gt5Ruvis73pDIqsbl9H,0.767,0.660,10,-5.940,1,0.0309,0.5580,0.000000,0.1190,...,Panalangin,8,207337,5yVM85m9yNcHO6o0vaaVxW,Daniel Padilla,1,3f2mb0nkdNKLDfDnvvTqAc,2013-01-01,6rqwSnpjgL7CoTYi0uZj8V,Pinoy Tambayan Songs
5025,2LeO0FnzYLxaDYZqBK9FJN,0.316,0.942,4,-3.496,1,0.1120,0.0206,0.000000,0.7970,...,Director's Cut,51,290080,4GBFKKuwmZUnAJt6nBal7A,Kamikazee,1,0tIuSD3ZKUjrTJ2g052fMl,2006-01-01,6rqwSnpjgL7CoTYi0uZj8V,Pinoy Tambayan Songs


In [59]:

overall_playlist_track_data_df['duration_mins'] = overall_playlist_track_data_df['duration_ms'] / 60000
#tag genre with keyword
overall_playlist_track_data_df['genre'] = KEYWORD1
overall_playlist_track_data_df.head()

Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,duration_ms,artist_id,artist_name,num_artists,album_id,release_date,playlist_id,playlist_name,duration_mins,genre
0,4Dvkj6JhhA12EX05fT7y2e,0.52,0.731,6,-5.338,0,0.0557,0.342,0.00101,0.311,...,167303,6KImCVD70vtIoJWnq6nGn3,Harry Styles,1,5r36AJ6VOJtp00oxSkBZ5h,2022-05-20,37i9dQZF1DXcfJb5A3855D,Pinoy Open Mic,2.788383,pinoy
1,7MXVkk9YMctZqd1Srtv4MB,0.679,0.587,7,-7.015,1,0.276,0.141,6e-06,0.137,...,230453,"[1Xyo4u8uXC1ZmMpatF05PJ, 4tZwfgrHOc3mvqYlEYSvVi]","[The Weeknd, Daft Punk]",2,2ODvWsOgouMbaA5xf0RkJe,2016-11-25,37i9dQZF1DXcfJb5A3855D,Pinoy Open Mic,3.840883,pinoy
2,1Slwb6dOYkBlWal1PGtnNg,0.781,0.445,2,-6.061,1,0.0295,0.474,0.0,0.184,...,281560,6eUKZXaKkcviH0Ku9w2n3V,Ed Sheeran,1,6NoBzYmh5gUusGPCfg0pct,2013,37i9dQZF1DXcfJb5A3855D,Pinoy Open Mic,4.692667,pinoy
3,2QjOHCTQ1Jl3zawyYOpxh6,0.612,0.807,10,-2.81,1,0.0336,0.0495,0.0177,0.101,...,240400,77SW9BnxLY8rJ0RciFqkHh,The Neighbourhood,1,4xkM0BwLM9H2IUcbYzpcBI,2013-04-19,37i9dQZF1DXcfJb5A3855D,Pinoy Open Mic,4.006667,pinoy
4,3JvKfv6T31zO0ini8iNItO,0.445,0.537,4,-8.532,0,0.04,0.695,1.7e-05,0.0944,...,244360,2txHhyCwHjUEpJjWrEyqyX,Tom Odell,1,0Gf1yE895FKK4YWVRuAeg8,2013-06-24,37i9dQZF1DXcfJb5A3855D,Pinoy Open Mic,4.072667,pinoy


In [60]:
overall_playlist_track_data_df.to_csv("data/playlists/project_playlists/"+KEYWORD1+"_playlist_tracks_data.csv", index=False, encoding='utf-8')