# Part 1: Extract Data from Spotify API

In [None]:
# Install to Access Spotify API
# https://spotipy.readthedocs.io/en/2.22.1/
!pip install spotipy

In [60]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd

In [None]:
# API Authentication
client_credentials_manager = SpotifyClientCredentials(client_id='5bb2bca0a272463e93543db4840a003e', client_secret='5eba8c34df7646dfbd7188eab33e34b8')
# API Authorization: Permission to Access Data from This API
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
playlist_link = 'https://open.spotify.com/playlist/37i9dQZEVXbNG2KDcFcKOF'
playlist_URI = playlist_link.split("/")[-1]
data = sp.playlist_tracks(playlist_URI)
# The result is a JSON file containing information about track(song), album, artist
# The format is dictionary + list

In [99]:
data['items'][0] # The first song

{'added_at': '2023-06-09T10:12:46Z',
 'added_by': {'external_urls': {'spotify': 'https://open.spotify.com/user/'},
  'href': 'https://api.spotify.com/v1/users/',
  'id': '',
  'type': 'user',
  'uri': 'spotify:user:'},
 'is_local': False,
 'primary_color': None,
 'track': {'album': {'album_type': 'album',
   'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/0XeEobZplHxzM9QzFQWLiR'},
     'href': 'https://api.spotify.com/v1/artists/0XeEobZplHxzM9QzFQWLiR',
     'id': '0XeEobZplHxzM9QzFQWLiR',
     'name': 'Eslabon Armado',
     'type': 'artist',
     'uri': 'spotify:artist:0XeEobZplHxzM9QzFQWLiR'}],
   'available_markets': ['AD',
    'AE',
    'AG',
    'AL',
    'AM',
    'AO',
    'AR',
    'AT',
    'AU',
    'AZ',
    'BA',
    'BB',
    'BD',
    'BE',
    'BF',
    'BG',
    'BH',
    'BI',
    'BJ',
    'BN',
    'BO',
    'BR',
    'BS',
    'BT',
    'BW',
    'BY',
    'BZ',
    'CA',
    'CD',
    'CG',
    'CH',
    'CI',
    'CL',
    'CM',
    'CO'

# Part 2: Transform the Data to Proper Structure

<h3> 2.1 Explore the Data

In [None]:
len(data['items']) # Return 50 -> There are 50 tracks (songs).
# Album ID
data['items'][0]['track']['album']['id']
# Album Name
data['items'][0]['track']['album']['name']
# Release Date
data['items'][0]['track']['album']['release_date']
# Total Tracks
data['items'][0]['track']['album']['total_tracks']
# External URL
data['items'][0]['track']['album']['external_urls']['spotify']

<h3> 2.2 Restructure the Data

In [106]:
# Restructure Album Data
# Granularity: album
album_list = []
for row in data['items']:
    album_id = row['track']['album']['id']
    album_name = row['track']['album']['name']
    album_release_date = row['track']['album']['release_date']
    album_total_tracks = row['track']['album']['total_tracks']
    album_url = row['track']['album']['external_urls']['spotify']
    album_element = {'album_id': album_id, 
                     'album_name': album_name, 
                     'album_release_date': album_release_date, 
                     'album_total_tracks': album_total_tracks,
                     'album_url': album_url}
    # print(album_element)
    album_list.append(album_element)
album_list[0:5]

[{'album_id': '5aDEezKnOqyQo0qvTFhpkM',
  'album_name': 'DESVELADO',
  'album_release_date': '2023-04-28',
  'album_total_tracks': 16,
  'album_url': 'https://open.spotify.com/album/5aDEezKnOqyQo0qvTFhpkM'},
 {'album_id': '5dKPhEYBhP8j85HcxQfaw6',
  'album_name': 'Peso Pluma: Bzrp Music Sessions, Vol. 55',
  'album_release_date': '2023-06-01',
  'album_total_tracks': 1,
  'album_url': 'https://open.spotify.com/album/5dKPhEYBhP8j85HcxQfaw6'},
 {'album_id': '5gCcb5fsSb6w5K8SyJrgtB',
  'album_name': 'WHERE SHE GOES',
  'album_release_date': '2023-05-18',
  'album_total_tracks': 1,
  'album_url': 'https://open.spotify.com/album/5gCcb5fsSb6w5K8SyJrgtB'},
 {'album_id': '7aGzSSUD8S6IhPCsZSiuMT',
  'album_name': 'un x100to',
  'album_release_date': '2023-04-17',
  'album_total_tracks': 1,
  'album_url': 'https://open.spotify.com/album/7aGzSSUD8S6IhPCsZSiuMT'},
 {'album_id': '6aBVGuOUEuX18rHxyDWbti',
  'album_name': 'La Bebe (Remix)',
  'album_release_date': '2023-03-17',
  'album_total_tracks'

In [107]:
# Restructure Artist Data
# Granularity: Artist
artist_list = []
for row in data['items']:
    for key,value in row.items():
        if key == 'track':
            artists = []
            for artist in value ['artists']: 
                # Note: One song can have multiple artists
                # Tag artists is a list with each element as an sigle artist.
                artist_element = {'artist_id': artist['id'],
                                  'artist_name': artist['name'],
                                  'external_url': artist['href']}
                artist_list.append(artist_element)
artist_list[0:5]

[{'artist_id': '0XeEobZplHxzM9QzFQWLiR',
  'artist_name': 'Eslabon Armado',
  'external_url': 'https://api.spotify.com/v1/artists/0XeEobZplHxzM9QzFQWLiR'},
 {'artist_id': '12GqGscKJx3aE4t07u7eVZ',
  'artist_name': 'Peso Pluma',
  'external_url': 'https://api.spotify.com/v1/artists/12GqGscKJx3aE4t07u7eVZ'},
 {'artist_id': '716NhGYqD1jl2wI1Qkgq36',
  'artist_name': 'Bizarrap',
  'external_url': 'https://api.spotify.com/v1/artists/716NhGYqD1jl2wI1Qkgq36'},
 {'artist_id': '12GqGscKJx3aE4t07u7eVZ',
  'artist_name': 'Peso Pluma',
  'external_url': 'https://api.spotify.com/v1/artists/12GqGscKJx3aE4t07u7eVZ'},
 {'artist_id': '4q3ewBCX7sLwd24euuV69X',
  'artist_name': 'Bad Bunny',
  'external_url': 'https://api.spotify.com/v1/artists/4q3ewBCX7sLwd24euuV69X'}]

In [108]:
# Restructure Song Data
# Granularity: song
# Note: One song can have multiple artists
# artist_id_1st only contains the first artist of each song
# artists_list contains a list of all artists of each song
song_list = []
for row in data['items']:
    song_id = row['track']['id']
    song_name = row['track']['name']
    song_duration = row['track']['duration_ms']
    song_url = row['track']['external_urls']['spotify']
    song_popularity = row['track']['popularity']
    song_added_time = row['added_at']
    album_id = row['track']['album']['id']
    artist_id_1st = row['track']['artists'][0]['id']
    artists_list = [artist['id'] for artist in row['track']['artists']]
    song_element = {'song_id': song_id,
                    'song_name': song_name,
                    'song_duration': song_duration,
                    'song_url': song_url,
                    'song_popularity': song_popularity,
                    'song_added_time': song_added_time,
                    'album_id': album_id,
                    'artist_id_1st': artist_id_1st,
                    'artists_list': artists_list}
    song_list.append(song_element)
song_list[0:5]

[{'song_id': '3qQbCzHBycnDpGskqOWY0E',
  'song_name': 'Ella Baila Sola',
  'song_duration': 165671,
  'song_url': 'https://open.spotify.com/track/3qQbCzHBycnDpGskqOWY0E',
  'song_popularity': 93,
  'song_added_time': '2023-06-09T10:12:46Z',
  'album_id': '5aDEezKnOqyQo0qvTFhpkM',
  'artist_id_1st': '0XeEobZplHxzM9QzFQWLiR',
  'artists_list': ['0XeEobZplHxzM9QzFQWLiR', '12GqGscKJx3aE4t07u7eVZ']},
 {'song_id': '5AqiaZwhmC6dIbgWrD5SzV',
  'song_name': 'Peso Pluma: Bzrp Music Sessions, Vol. 55',
  'song_duration': 188361,
  'song_url': 'https://open.spotify.com/track/5AqiaZwhmC6dIbgWrD5SzV',
  'song_popularity': 93,
  'song_added_time': '2023-06-09T10:12:46Z',
  'album_id': '5dKPhEYBhP8j85HcxQfaw6',
  'artist_id_1st': '716NhGYqD1jl2wI1Qkgq36',
  'artists_list': ['716NhGYqD1jl2wI1Qkgq36', '12GqGscKJx3aE4t07u7eVZ']},
 {'song_id': '7ro0hRteUMfnOioTFI5TG1',
  'song_name': 'WHERE SHE GOES',
  'song_duration': 231704,
  'song_url': 'https://open.spotify.com/track/7ro0hRteUMfnOioTFI5TG1',
  'song

In [109]:
# Mapping Table for Song and Artist
# One song can have multiple artists, and one artist can have multiple songs.
# Granularity: song x artist
# If song 1 has 2 artists, a and b,
#    artist a has no other songs, 
#    artist b has another song 2 which has one other artist c
#    artist c has no other songs.
# There will be 4 records:
#    song 1, artist a
#    song 1, artist b
#    song 2, artist b
#    song 2, artist c
song_artist_mapping = []
for row in data['items']:
    song_id = row['track']['id']
    song_name = row['track']['name']
    # Creat a list of all artists for each song
    artist_ids = [artist['id'] for artist in row['track']['artists']]
    artist_names = [artist['name'] for artist in row['track']['artists']]
    # Creat a mapping record for each combination of artist and song
    for artist_id in artist_ids:
        for artist_name in artist_names:
            song_artist_mapping.append({'song_id': song_id,
                                        'song_name': song_name,
                                        'artist_id': artist_id,
                                        'artist_name': artist_name})
song_artist_mapping[0:5]

[{'song_id': '3qQbCzHBycnDpGskqOWY0E',
  'song_name': 'Ella Baila Sola',
  'artist_id': '0XeEobZplHxzM9QzFQWLiR',
  'artist_name': 'Eslabon Armado'},
 {'song_id': '3qQbCzHBycnDpGskqOWY0E',
  'song_name': 'Ella Baila Sola',
  'artist_id': '0XeEobZplHxzM9QzFQWLiR',
  'artist_name': 'Peso Pluma'},
 {'song_id': '3qQbCzHBycnDpGskqOWY0E',
  'song_name': 'Ella Baila Sola',
  'artist_id': '12GqGscKJx3aE4t07u7eVZ',
  'artist_name': 'Eslabon Armado'},
 {'song_id': '3qQbCzHBycnDpGskqOWY0E',
  'song_name': 'Ella Baila Sola',
  'artist_id': '12GqGscKJx3aE4t07u7eVZ',
  'artist_name': 'Peso Pluma'},
 {'song_id': '5AqiaZwhmC6dIbgWrD5SzV',
  'song_name': 'Peso Pluma: Bzrp Music Sessions, Vol. 55',
  'artist_id': '716NhGYqD1jl2wI1Qkgq36',
  'artist_name': 'Bizarrap'}]

<h3> 2.3 Transform the Dictonaries to Tables with Pandas

In [110]:
album_df = pd.DataFrame.from_dict(album_list)
album_df.head(5)

Unnamed: 0,album_id,album_name,album_release_date,album_total_tracks,album_url
0,5aDEezKnOqyQo0qvTFhpkM,DESVELADO,2023-04-28,16,https://open.spotify.com/album/5aDEezKnOqyQo0q...
1,5dKPhEYBhP8j85HcxQfaw6,"Peso Pluma: Bzrp Music Sessions, Vol. 55",2023-06-01,1,https://open.spotify.com/album/5dKPhEYBhP8j85H...
2,5gCcb5fsSb6w5K8SyJrgtB,WHERE SHE GOES,2023-05-18,1,https://open.spotify.com/album/5gCcb5fsSb6w5K8...
3,7aGzSSUD8S6IhPCsZSiuMT,un x100to,2023-04-17,1,https://open.spotify.com/album/7aGzSSUD8S6IhPC...
4,6aBVGuOUEuX18rHxyDWbti,La Bebe (Remix),2023-03-17,2,https://open.spotify.com/album/6aBVGuOUEuX18rH...


In [64]:
album_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   album_id            50 non-null     object
 1   album_name          50 non-null     object
 2   album_release_date  50 non-null     object
 3   album_total_tracks  50 non-null     int64 
 4   album_url           50 non-null     object
dtypes: int64(1), object(4)
memory usage: 2.1+ KB


In [75]:
artist_df = pd.DataFrame.from_dict(artist_list)
artist_df.head(5)

Unnamed: 0,artist_id,artist_name,external_url
0,0XeEobZplHxzM9QzFQWLiR,Eslabon Armado,https://api.spotify.com/v1/artists/0XeEobZplHx...
1,12GqGscKJx3aE4t07u7eVZ,Peso Pluma,https://api.spotify.com/v1/artists/12GqGscKJx3...
2,716NhGYqD1jl2wI1Qkgq36,Bizarrap,https://api.spotify.com/v1/artists/716NhGYqD1j...
3,12GqGscKJx3aE4t07u7eVZ,Peso Pluma,https://api.spotify.com/v1/artists/12GqGscKJx3...
4,4q3ewBCX7sLwd24euuV69X,Bad Bunny,https://api.spotify.com/v1/artists/4q3ewBCX7sL...


In [76]:
artist_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   artist_id     96 non-null     object
 1   artist_name   96 non-null     object
 2   external_url  96 non-null     object
dtypes: object(3)
memory usage: 2.4+ KB


In [86]:
song_df = pd.DataFrame.from_dict(song_list)
song_df.head(5)

Unnamed: 0,song_id,song_name,song_duration,song_url,song_popularity,song_added_time,album_id,artist_id_1st,artists_list
0,3qQbCzHBycnDpGskqOWY0E,Ella Baila Sola,165671,https://open.spotify.com/track/3qQbCzHBycnDpGs...,93,2023-06-09T10:12:46Z,5aDEezKnOqyQo0qvTFhpkM,0XeEobZplHxzM9QzFQWLiR,"[0XeEobZplHxzM9QzFQWLiR, 12GqGscKJx3aE4t07u7eVZ]"
1,5AqiaZwhmC6dIbgWrD5SzV,"Peso Pluma: Bzrp Music Sessions, Vol. 55",188361,https://open.spotify.com/track/5AqiaZwhmC6dIbg...,93,2023-06-09T10:12:46Z,5dKPhEYBhP8j85HcxQfaw6,716NhGYqD1jl2wI1Qkgq36,"[716NhGYqD1jl2wI1Qkgq36, 12GqGscKJx3aE4t07u7eVZ]"
2,7ro0hRteUMfnOioTFI5TG1,WHERE SHE GOES,231704,https://open.spotify.com/track/7ro0hRteUMfnOio...,97,2023-06-09T10:12:46Z,5gCcb5fsSb6w5K8SyJrgtB,4q3ewBCX7sLwd24euuV69X,[4q3ewBCX7sLwd24euuV69X]
3,6pD0ufEQq0xdHSsRbg9LBK,un x100to,194563,https://open.spotify.com/track/6pD0ufEQq0xdHSs...,100,2023-06-09T10:12:46Z,7aGzSSUD8S6IhPCsZSiuMT,6XkjpgcEsYab502Vr1bBeW,"[6XkjpgcEsYab502Vr1bBeW, 4q3ewBCX7sLwd24euuV69X]"
4,2UW7JaomAMuX9pZrjVpHAU,La Bebe - Remix,234352,https://open.spotify.com/track/2UW7JaomAMuX9pZ...,99,2023-06-09T10:12:46Z,6aBVGuOUEuX18rHxyDWbti,1NNRWkhwmcXRimFYSBpB1y,"[1NNRWkhwmcXRimFYSBpB1y, 12GqGscKJx3aE4t07u7eVZ]"


In [87]:
song_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   song_id          50 non-null     object
 1   song_name        50 non-null     object
 2   song_duration    50 non-null     int64 
 3   song_url         50 non-null     object
 4   song_popularity  50 non-null     int64 
 5   song_added_time  50 non-null     object
 6   album_id         50 non-null     object
 7   artist_id_1st    50 non-null     object
 8   artists_list     50 non-null     object
dtypes: int64(2), object(7)
memory usage: 3.6+ KB


<h3> 2.4 Remove Duplicates for Primary Keys

In [79]:
# Since id's are the primary keys, they can't have duplicates.
# Thus, we only keep the first recod of each repeated album_id.
album_df = album_df.drop_duplicates(subset=['album_id'])
album_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48 entries, 0 to 49
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   album_id            48 non-null     object
 1   album_name          48 non-null     object
 2   album_release_date  48 non-null     object
 3   album_total_tracks  48 non-null     int64 
 4   album_url           48 non-null     object
dtypes: int64(1), object(4)
memory usage: 2.2+ KB


In [80]:
artist_df = artist_df.drop_duplicates(subset=['artist_id'])
artist_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73 entries, 0 to 95
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   artist_id     73 non-null     object
 1   artist_name   73 non-null     object
 2   external_url  73 non-null     object
dtypes: object(3)
memory usage: 2.3+ KB


In [89]:
# Since the granularity of the original data is song,
# there shouldn't be any changes after deduplicating.
song_df = song_df.drop_duplicates(subset=['song_id'])
song_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 0 to 49
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   song_id          50 non-null     object
 1   song_name        50 non-null     object
 2   song_duration    50 non-null     int64 
 3   song_url         50 non-null     object
 4   song_popularity  50 non-null     int64 
 5   song_added_time  50 non-null     object
 6   album_id         50 non-null     object
 7   artist_id_1st    50 non-null     object
 8   artists_list     50 non-null     object
dtypes: int64(2), object(7)
memory usage: 3.9+ KB


<h3> 2.5 Transform Data Types and Formats

In [93]:
album_df['album_release_date'] = pd.to_datetime(album_df['album_release_date'])
album_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48 entries, 0 to 49
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   album_id            48 non-null     object        
 1   album_name          48 non-null     object        
 2   album_release_date  48 non-null     datetime64[ns]
 3   album_total_tracks  48 non-null     int64         
 4   album_url           48 non-null     object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 2.2+ KB


In [94]:
song_df['song_added_time'] = pd.to_datetime(song_df['song_added_time'])
song_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 0 to 49
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   song_id          50 non-null     object             
 1   song_name        50 non-null     object             
 2   song_duration    50 non-null     int64              
 3   song_url         50 non-null     object             
 4   song_popularity  50 non-null     int64              
 5   song_added_time  50 non-null     datetime64[ns, UTC]
 6   album_id         50 non-null     object             
 7   artist_id_1st    50 non-null     object             
 8   artists_list     50 non-null     object             
dtypes: datetime64[ns, UTC](1), int64(2), object(6)
memory usage: 3.9+ KB
