<a href="https://colab.research.google.com/github/lauragabrysiak/mitx_applied_data_science/blob/main/spotipy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mounting the drive
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [3]:
# Installing Spotify Web API spotipy
!pip install spotipy

Collecting spotipy
  Downloading spotipy-2.23.0-py3-none-any.whl (29 kB)
Collecting redis>=3.5.3 (from spotipy)
  Downloading redis-5.0.1-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.3/250.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: redis, spotipy
Successfully installed redis-5.0.1 spotipy-2.23.0


In [4]:
import warnings                                 # Used to ignore the warning given as output of the code
warnings.filterwarnings('ignore')
from collections import defaultdict             # A dictionary output that does not raise a key error

import numpy as np                              # Basic libraries of python for numeric and dataframe computations
import pandas as pd

import matplotlib.pyplot as plt                 # data visualization
import seaborn as sns                           # data visualization advanced

from sklearn.metrics import mean_squared_error  # A performance metrics in sklearn

In [5]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy import SpotifyException

In [6]:
# Importing the datasets
temp = pd.read_csv('/content/sample_data/df_final.csv'
                  , on_bad_lines='skip')

In [None]:
temp.head()

Unnamed: 0,user_id,song_id,play_count,song_title,song_release,song_artist,song_year
0,6958,447,1,Daisy And Prudence,Distillation,Erin McKeown,2000
1,6958,512,1,The Ballad of Michael Valentine,Sawdust,The Killers,2004
2,6958,549,1,I Stand Corrected (Album),Vampire Weekend,Vampire Weekend,2007
3,6958,703,1,They Might Follow You,Tiny Vipers,Tiny Vipers,2007
4,6958,719,1,Monkey Man,You Know I'm No Good,Amy Winehouse,2007


In [8]:
temp['song_title'][1]        # example of song_id (to be used for spotipy)

'The Ballad of Michael Valentine'

https://developer.spotify.com/documentation/web-api/reference/get-track

In [11]:
import time

def get_spotify_metadata(temp_df, client_id, client_secret):

    # Set up the Spotipy client
    client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

    def get_song_info(title):
        # Search for the song by title
        results = sp.search(q=f'track:{title}', type='track', limit=1)

        # Check if any tracks were found
        if results['tracks']['items']:
            track = results['tracks']['items'][0]

            # Initialize empty dictionary for song_info
            song_info = {}

            # Add try-except blocks for error handling
            try:
                song_info['title'] = track['name']
                song_info['popularity'] = track['popularity']
                song_info['is_local'] = track['is_local']
                song_info['explicit'] = track['explicit']
                song_info['genres'] = sp.audio_features(track['id'])[0]['genres']
                song_info['duration_ms'] = track['duration_ms']

                # Album info
                song_info['album'] = track['album']['name']
                song_info['release_date'] = track['album']['release_date']
                song_info['album_type'] = track['album']['album_type']
                song_info['total_tracks'] = track['album']['total_tracks']
                song_info['album_popularity'] = track['album']['popularity']
                song_info['album_available_markets'] = track['album']['available_markets']
                song_info['album_restrictions'] = track['album']['restrictions']

                # Artist info
                song_info['artist'] = track['artists'][0]['name']
                song_info['artist_popularity'] = sp.artist(track['artists'][0]['id'])['popularity']
                song_info['artist_followers'] = sp.artist(track['artists'][0]['id'])['followers']['total']
                song_info['artist_genres'] = sp.artist(track['artists'][0]['id'])['genres']

            except KeyError as e:
                # Handle KeyError (missing information)
                song_info['title'] = title
                song_info['error'] = f'Missing information: {str(e)}'

            return song_info

        else:
            return None

    # Iterate through song titles in the DataFrame
    metadata_list = []  # initiate function

    for title in temp_df['song_title']:  # Look for song_title column

        time.sleep(0.5) # Add a delay to avoid hitting rate limits

        song_info = get_song_info(title)

        if song_info:
            metadata_list.append(song_info)

    # Create a DataFrame from the collected metadata
    metadata_df = pd.DataFrame(metadata_list)

    # Merge the original DataFrame with the new metadata DataFrame
    merged_df = pd.concat([temp_df, metadata_df], axis=1)

    return merged_df

In [15]:
client_id = '930c85172af549c7bed7661f025edf11'
client_secret = 'b33c88126c014981b3fc1d7c426d3a1c'
#redirect_uri = 'http://localhost:8888/callback'

result_df = get_spotify_metadata(temp.sample(3)
                                 , client_id
                                 , client_secret)

In [17]:
result_df

Unnamed: 0,user_id,song_id,play_count,song_title,song_release,song_artist,song_year,title,popularity,is_local,explicit,error
66822,71661.0,9139.0,1.0,Half Of My Heart,Battle Studies,John Mayer,0.0,,,,,
75679,623.0,5879.0,1.0,Rabbit Heart (Raise It Up),Rabbit Heart EP,Florence + The Machine,2009.0,,,,,
74875,67302.0,5398.0,1.0,Angie,Jump Back - The Best Of The Rolling Stones_ '7...,The Rolling Stones,1973.0,,,,,
0,,,,,,,,Half Of My Heart,67.0,False,False,Missing information: 'genres'
1,,,,,,,,Rabbit Heart (Raise It Up),49.0,False,False,Missing information: 'genres'
2,,,,,,,,Angie,75.0,False,False,Missing information: 'genres'


In [None]:
result_df.to_csv('spotipy_df_temp.csv', index=False)

In [None]:
## Normalize Text variables
import re
# from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer

# Function to perform text normalization
def normalize_text(text):

    text = text.lower()                                                           # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)                                           # Remove punctuation
    # stop_words = set(stopwords.words('english'))                                # Remove stop words (using NLTK library)
    # text = ' '.join([word for word in text.split() if word not in stop_words])
    # stemmer = PorterStemmer()                                                   # Stemming (using Porter Stemmer from NLTK)
    # text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text

In [None]:
# Apply the normalization function to the 'TextColumn'
temp['song_title'] = temp['song_title'].apply(normalize_text)
temp['song_release'] = temp['song_release'].apply(normalize_text)
temp['song_artist'] = temp['song_artist'].apply(normalize_text)

In [None]:
song_df = temp
#temp = []