## Importing the libraries

In [2]:
import os
import pandas as pd
import numpy as np
import json
import spotipy
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials
import re
from tqdm import tqdm
import multiprocessing as mp
import time
import random
import datetime

In [3]:
import os
from dotenv import load_dotenv

# Pull the variables defined in the .env file (one directory up) into this process
load_dotenv(dotenv_path="../spotify_extraction.env")

True

In [4]:
# Build the credentials manager from the client ID and secret held in the environment
auth_manager = SpotifyClientCredentials(
    client_id=os.environ.get('SPOTIPY_CLIENT_ID'),
    client_secret=os.environ.get('SPOTIPY_CLIENT_SECRET'),
)

# Spotify API client used by the cells below
sp = spotipy.client.Spotify(auth_manager=auth_manager)

In [5]:
import pandas as pd

# Location of the input CSV; point this at your own file if it lives elsewhere
csv_path = 'V1.csv'

# Read the whole file into the working DataFrame
df = pd.read_csv(filepath_or_buffer=csv_path)

In [8]:
# De-duplicated track and artist URIs, in order of first appearance in df
t_uri, a_uri = (df[column].unique() for column in ("track_uri", "artist_uri"))

# Feature extraction

Using the Spotify API for Feature Extraction and Saving Results to a CSV File and Errors to a Log File

I started with `sp.track`, which fetches one item per request, but that was slow and kept running into API rate limits. I switched to `sp.tracks` and `sp.artists`, which accept a list of up to 50 URIs and handle it in a single request, so the extraction took far less time.

In [9]:
# Function to find the last logged URI in the CSV
def find_last_logged_uri(file_path):
    """Return the value in the first column of the CSV's last row.

    The file is read with ``header=None`` because the feature CSV in this
    notebook is appended to without a header row. With the default header
    inference pandas would consume the first record as column names, so a
    file holding exactly one record would look empty and the resume point
    would be lost.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to inspect.

    Returns
    -------
    object or None
        The first-column value of the last row (a URI string for the files
        written by this notebook), or ``None`` when the file is missing,
        empty, or cannot be parsed. The reason is printed in that case.
    """
    try:
        # Only the first column is needed, so skip parsing the others.
        logged = pd.read_csv(file_path, header=None, usecols=[0])
    except Exception as e:
        # Best effort by design: any read problem means "nothing logged yet".
        print(f"Error reading the file: {e}")
        return None
    if logged.empty:
        return None
    return logged.iloc[-1, 0]

# Resume point: default to the beginning, then move to one position past the
# last URI already present in the CSV (if that URI is one of ours)
start_index = 0
last_uri = find_last_logged_uri('data/artist_features.csv')
if last_uri and last_uri in a_uri:
    index_array = np.where(a_uri == last_uri)[0]
    start_index = index_array[0] + 1 if index_array.size > 0 else 0

In [10]:
# Single request for the first (up to) 50 artists from the resume point
artist_features = sp.artists(artists=a_uri[start_index:start_index + 50])

In [9]:
# Constants
REQUESTS_PER_MINUTE = 180
SECONDS_PER_MINUTE = 60
DELAY = SECONDS_PER_MINUTE / REQUESTS_PER_MINUTE  # Seconds to sleep before each request to fit the rate limit
BATCH_SIZE = 50  # Number of artist URIs sent per sp.artists call
ERROR_LOG_PATH = "artist_features.txt"
TIMESTAMP_FORMAT = "%d.%b %Y %H:%M:%S"

e = 0  # Running count of logged errors (failed batches and artists with no data)

# Context managers replace the manual open()/close() pairs so the CSV and the
# log are closed even if the cell is interrupted part-way through.
with open('data/artist_features.csv', 'a') as f:
    for i in tqdm(range(start_index, len(a_uri), BATCH_SIZE)):
        # Slicing clamps at the end of the array, so the final batch may be shorter.
        batch_uris = a_uri[i:i + BATCH_SIZE]
        try:
            time.sleep(DELAY)
            artist_features = sp.artists(batch_uris)

            rows = []
            for uri, artist in zip(batch_uris, artist_features['artists']):
                if artist is None:
                    # NOTE(review): the API appears to return null for IDs it cannot
                    # resolve - confirm. Skipping just this artist keeps the rest of
                    # the batch instead of aborting it with a TypeError.
                    e += 1
                    with open(ERROR_LOG_PATH, "a") as r:
                        r.write(datetime.datetime.now().strftime(TIMESTAMP_FORMAT)
                                + ": " + "No artist data returned for " + str(uri) + '\n')
                    continue
                artist_genres = artist["genres"]
                if artist_genres:
                    # Genres are space-separated, so spaces inside a genre become underscores.
                    genres = " ".join(genre.replace(' ', '_') for genre in artist_genres)
                else:
                    genres = "unknown"
                rows.append((uri, artist["popularity"], genres))

            if rows:
                # One write per batch: a failure while reading the response can no
                # longer leave half a batch in the CSV. Flushing keeps the file
                # current so a later run can resume from the last written URI.
                f.write(pd.DataFrame(rows).to_csv(header=False, index=False))
                f.flush()
        except Exception as error:
            # Best effort: log the failure, back off briefly, and move on to the next batch.
            e += 1
            with open(ERROR_LOG_PATH, "a") as r:
                r.write(datetime.datetime.now().strftime(TIMESTAMP_FORMAT) + ": " + str(error) + '\n')
            time.sleep(3)

# Logging the final count of errors
with open(ERROR_LOG_PATH, "a") as r:
    r.write(datetime.datetime.now().strftime(TIMESTAMP_FORMAT) + " _________________________ " + "Total Number Of Errors : " + str(e) + " _________________________ " + '\n')

  0%|          | 0/3641 [00:00<?, ?it/s]

# Ignore

In [15]:
# Constants
REQUESTS_PER_MINUTE = 180
SECONDS_PER_MINUTE = 60
DELAY = SECONDS_PER_MINUTE / REQUESTS_PER_MINUTE  # Seconds to sleep before each request to fit the rate limit
BATCH_SIZE = 50  # Number of artist URIs sent per sp.artists call
ERROR_LOG_PATH = "artist_features.txt"
TIMESTAMP_FORMAT = "%d.%b %Y %H:%M:%S"

e = 0  # Running count of logged errors (failed batches and artists with no data)

# This variant always starts at index 0 and appends to the CSV, so running it
# on top of an existing file writes rows that are already there.
# Context managers replace the manual open()/close() pairs so the CSV and the
# log are closed even if the cell is interrupted part-way through.
with open('data/artist_features.csv', 'a') as f:
    for i in tqdm(range(0, len(a_uri), BATCH_SIZE)):
        # Slicing clamps at the end of the array, so the final batch may be shorter.
        batch_uris = a_uri[i:i + BATCH_SIZE]
        try:
            # Delay each request to ensure we do not exceed the rate limit of 180 requests per minute
            time.sleep(DELAY)
            artist_features = sp.artists(batch_uris)

            rows = []
            # Pairing URIs with results (instead of a fixed range(50)) avoids running
            # past the end of a_uri on the last, shorter batch.
            for uri, artist in zip(batch_uris, artist_features['artists']):
                if artist is None:
                    # NOTE(review): the API appears to return null for IDs it cannot
                    # resolve - confirm. Skipping just this artist keeps the rest of
                    # the batch instead of aborting it with a TypeError.
                    e += 1
                    with open(ERROR_LOG_PATH, "a") as r:
                        r.write(datetime.datetime.now().strftime(TIMESTAMP_FORMAT)
                                + ": " + "No artist data returned for " + str(uri) + '\n')
                    continue
                artist_genres = artist["genres"]
                if artist_genres:
                    # Genres are space-separated, so spaces inside a genre become underscores.
                    genres = " ".join(genre.replace(' ', '_') for genre in artist_genres)
                else:
                    genres = "unknown"
                rows.append((uri, artist["popularity"], genres))

            if rows:
                # One write per batch so a failure while reading the response
                # cannot leave half a batch in the CSV.
                f.write(pd.DataFrame(rows).to_csv(header=False, index=False))
                f.flush()
        except Exception as error:
            # Best effort: log the failure, back off briefly, and move on to the next batch.
            e += 1
            with open(ERROR_LOG_PATH, "a") as r:
                r.write(datetime.datetime.now().strftime(TIMESTAMP_FORMAT) + ": " + str(error) + '\n')
            time.sleep(3)

# Logging the final count of errors
with open(ERROR_LOG_PATH, "a") as r:
    r.write(datetime.datetime.now().strftime(TIMESTAMP_FORMAT) + " _________________________ " + "Total Number Of Errors : " + str(e) + " _________________________ " + '\n')

 39%|███▊      | 2281/5918 [21:00<31:16,  1.94it/s]  