<a href="https://colab.research.google.com/github/lauragabrysiak/mitx_applied_data_science/blob/main/spotipy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Mount Google Drive into the Colab filesystem at /content/gdrive/.
# NOTE(review): the dataset loaded later in this notebook is read from
# /content/sample_data, not from the mounted drive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [8]:
# Installing Spotify Web API spotipy
!pip install spotipy



In [9]:
import os
import time
import warnings                                 # Used to ignore the warning given as output of the code
from collections import defaultdict             # dict subclass that supplies a default value for missing keys
from getpass import getpass

import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

warnings.filterwarnings('ignore')

In [10]:
# Load the dataset; malformed CSV lines are skipped instead of raising
temp = pd.read_csv(
    '/content/sample_data/df_final.csv',
    on_bad_lines='skip',
)

In [11]:
# Preview the first rows to check that the columns parsed as expected
temp.head()

Unnamed: 0,user_id,song_id,play_count,song_title,song_release,song_artist,song_year
0,6958,447,1,Daisy And Prudence,Distillation,Erin McKeown,2000
1,6958,512,1,The Ballad of Michael Valentine,Sawdust,The Killers,2004
2,6958,549,1,I Stand Corrected (Album),Vampire Weekend,Vampire Weekend,2007
3,6958,703,1,They Might Follow You,Tiny Vipers,Tiny Vipers,2007
4,6958,719,1,Monkey Man,You Know I'm No Good,Amy Winehouse,2007


In [12]:
# Example of a song_title value — the text that is sent to the Spotify search below
temp.loc[1, 'song_title']

'The Ballad of Michael Valentine'

### Spotify Web API

Reference (fields of the track object returned by the API): https://developer.spotify.com/documentation/web-api/reference/get-track

In [13]:
# (rows, columns) of the loaded dataset
temp.shape

(98011, 7)

In [14]:
def get_spotify_metadata(temp_df, client_id, client_secret, delay=0.5):
    """Enrich every row of ``temp_df`` with Spotify metadata for its song title.

    Each distinct title is searched once on Spotify (track search, first hit
    only). The hit's track, album and primary-artist fields are merged into
    every input row carrying that title. Rows whose title is missing, or whose
    search returns no hit, are left out of the result.

    Note: the search query contains the title only, so the first hit may belong
    to a different artist than the one recorded in the input row.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Input rows; must contain a ``song_title`` column.
    client_id, client_secret : str
        Spotify application credentials (client-credentials flow).
    delay : float, default 0.5
        Seconds to pause before each new title lookup, to avoid hitting rate
        limits. Titles already looked up are served from a cache without pausing.

    Returns
    -------
    pandas.DataFrame
        The input columns plus ``title``, ``popularity``, ``is_local``,
        ``explicit``, ``duration_ms``, ``album``, ``release_date``,
        ``album_type``, ``album_available_markets``, ``artist``,
        ``artist_popularity``, ``artist_followers`` and ``artist_genres``.
        When a field is missing from the API payload, the row keeps the fields
        read so far, ``title`` is reset to the searched title and an ``error``
        column describes the missing field.
    """
    # Set up the Spotipy client
    client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

    artist_cache = {}   # artist id  -> artist payload
    song_cache = {}     # song title -> song_info dict, or None when nothing was found

    def get_artist(artist_id):
        # One request per distinct artist, however many of their songs appear
        if artist_id not in artist_cache:
            artist_cache[artist_id] = sp.artist(artist_id)
        return artist_cache[artist_id]

    def get_song_info(song_title):
        # Search for the song by title and keep only the first hit
        query = f'track:{song_title}'
        results = sp.search(q=query, type='track', limit=1)

        if not results['tracks']['items']:
            return None
        track = results['tracks']['items'][0]

        song_info = {}
        try:
            song_info['title'] = track['name']
            song_info['popularity'] = track['popularity']
            song_info['is_local'] = track['is_local']
            song_info['explicit'] = track['explicit']
            song_info['duration_ms'] = track['duration_ms']

            # Album info
            album = track['album']
            song_info['album'] = album['name']
            song_info['release_date'] = album['release_date']
            song_info['album_type'] = album['album_type']
            song_info['album_available_markets'] = album['available_markets']

            # Artist info: the artist record is fetched once and reused for all
            # three fields instead of issuing one request per field
            primary_artist = track['artists'][0]
            song_info['artist'] = primary_artist['name']
            artist = get_artist(primary_artist['id'])
            song_info['artist_popularity'] = artist['popularity']
            song_info['artist_followers'] = artist['followers']['total']
            song_info['artist_genres'] = artist['genres']

        except (KeyError, IndexError) as e:
            # Missing field (or empty artist list): keep what was read so far
            # and record the problem on the row
            song_info['title'] = song_title
            song_info['error'] = f'Missing information: {str(e)}'

        return song_info

    # Collect one merged record per matched row
    metadata_list = []

    for _, row in temp_df.iterrows():
        song_title = row['song_title']

        # A missing title would otherwise be searched as the literal text "nan"
        if pd.isna(song_title):
            continue

        if song_title not in song_cache:
            # Pause only before real API traffic to avoid hitting rate limits
            if delay:
                time.sleep(delay)
            song_cache[song_title] = get_song_info(song_title)

        song_info = song_cache[song_title]
        if song_info:
            metadata_list.append({**row, **song_info})

    # Create a DataFrame from the collected metadata
    return pd.DataFrame(metadata_list)

In [None]:
# Spotify credentials are taken from the environment (the variable names that
# spotipy itself recognises) or prompted for interactively, so that no secret
# is stored in the notebook. The key pair that was previously committed in this
# cell should be revoked in the Spotify developer dashboard.
client_id = os.environ.get('SPOTIPY_CLIENT_ID') or getpass('Spotify client ID: ')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET') or getpass('Spotify client secret: ')

# Only the first MAX_ROWS rows are enriched: the lookup issues API requests and
# pauses between them, so the full dataset would take far longer.
MAX_ROWS = 15000

result_df = get_spotify_metadata(temp.head(MAX_ROWS), client_id, client_secret)

In [None]:
# (rows, columns) of the enriched frame; rows without a Spotify match are not included
result_df.shape

In [None]:
# Persist the enriched dataset as CSV (relative path, so it is written to the
# current working directory); index=False leaves the row index out of the file
result_df.to_csv('spotipy_df_temp_final.csv', index=False)