## Importing the libraries

In [1]:
import os
import pandas as pd
import numpy as np
import json
import spotipy
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials
import re
from tqdm import tqdm
import multiprocessing as mp
import time
import random
import datetime

In [2]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv("../spotify_secrets.env")
# Build the credentials manager from the two environment variables and create
# the Spotify client `sp` that the later cells use for API calls (e.g. `sp.tracks`).
auth_manager = SpotifyClientCredentials(client_id=os.getenv('SPOTIPY_CLIENT_ID'),
                                        client_secret=os.getenv('SPOTIPY_CLIENT_SECRET'))
sp = spotipy.client.Spotify(auth_manager=auth_manager)

In [3]:
# Path to the input CSV; replace it with the path to your actual CSV file
csv_path = 'V1.csv'

# Load the CSV data into a DataFrame
df = pd.read_csv(csv_path)

In [4]:
# Inspect which columns the loaded CSV provides
df.columns

Index(['track_uri', 'artist_uri', 'album_uri'], dtype='object')

In [5]:
# Distinct track URIs; the extraction cells below iterate over this array in batches
t_uri=df["track_uri"].unique()

# Feature extraction

Using the Spotify API for Feature Extraction and Saving Results to a CSV File and Errors to a Log File

I first used `sp.track`, which fetches a single track per request, but I realised that it would take a very long time and that I would run into a lot of API rate limits, so I switched to `sp.tracks` and `sp.artists` instead. They accept a list of up to 50 URIs and handle it in a single request, so the extraction took a lot less time.

In [6]:
# Function to find the last logged URI in the CSV
def find_last_logged_uri(file_path):
    """Return the first field of the last non-empty row of a header-less CSV.

    The features file is appended to by several cells that do not all write
    the same number of columns. Parsing such a ragged file into a DataFrame
    fails, which previously made this function return ``None`` and reset the
    resume point to 0. The file is therefore streamed with the ``csv`` module,
    which accepts rows of differing width and keeps only one row in memory.

    Parameters
    ----------
    file_path : str or path-like
        CSV file (UTF-8, no header row) whose first column holds the URI.

    Returns
    -------
    str or None
        The first field of the last non-empty row, or ``None`` when the file
        cannot be read or contains no non-empty rows. Read errors are printed,
        never raised.
    """
    import csv  # local import keeps this cell self-contained

    last_row = None
    try:
        # newline='' is what the csv module expects for correct quoted-newline handling
        with open(file_path, newline='', encoding='utf-8') as csv_file:
            for row in csv.reader(csv_file):
                if row:  # csv.reader yields [] for blank lines; ignore them
                    last_row = row
    except Exception as e:  # best-effort by design: report and fall back to None
        print(f"Error reading the file: {e}")
        return None

    if last_row is None:  # empty file: nothing has been logged yet
        return None
    return last_row[0]  # Adjust if URI is in a different column

In [7]:
# Peek at the last URI already written to the features CSV
last_uri = find_last_logged_uri('data/track_features.csv')
last_uri

'3BQY1xO8xwWaIXYFQ9dLQY'

In [8]:
# Resume point: one position past the last URI already present in the CSV
# (stays 0 when nothing has been logged or the URI is not in t_uri)
last_uri = find_last_logged_uri('data/track_features.csv')
start_index = 0
if last_uri:
    # Positions in t_uri that equal the last logged URI (empty when absent)
    index_array = np.flatnonzero(t_uri == last_uri)
    if index_array.size:
        start_index = index_array[0] + 1
start_index

217088

In [9]:
# Sanity check: the URI just before the resume point should equal `last_uri`
# (when start_index is 0 this indexes -1 and shows the last element of t_uri instead)
t_uri[start_index-1]

'3BQY1xO8xwWaIXYFQ9dLQY'

In [11]:
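# Uncomment to reset (truncate) data/track_features.csv before a fresh run.
# WARNING: opening the file in 'w' mode erases every row collected so far.
#with open('data/track_features.csv', 'w') as file:
#    pass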

In [10]:
# Constants
REQUESTS_PER_MINUTE = 180
SECONDS_PER_MINUTE = 60
DELAY = SECONDS_PER_MINUTE / REQUESTS_PER_MINUTE  # Delay to fit the rate limit
BATCH_SIZE = 50  # Fetching 50 tracks at a time


def _track_to_row(track, uri):
    """Flatten one track object from the ``tracks`` response into a CSV row dict."""
    # A missing album or release date yields an empty year instead of an exception
    release_date = (track.get('album') or {}).get('release_date') or ''
    return {
        'track_uri': uri,
        'release_year': release_date.split('-')[0],
        'artists': ', '.join(artist['name'] for artist in track['artists']),
        'explicit': int(track['explicit']),
        'track_name': track['name'],
        'track_popularity': track['popularity'],
    }


# Function to fetch and process tracks
def fetch_and_process_tracks(track_uris, output_path='data/track_features.csv',
                             log_path='error_log.txt', client=None):
    """Fetch track metadata in batches and append it to a CSV file.

    For every batch of ``BATCH_SIZE`` URIs the function sleeps ``DELAY``
    seconds, calls ``client.tracks(batch)`` and appends one header-less row
    per returned track with the columns ``track_uri, release_year, artists,
    explicit, track_name, track_popularity``. Entries returned as ``None``
    are skipped.

    Parameters
    ----------
    track_uris : sequence of str
        Track URIs/IDs to look up; sliced into batches of ``BATCH_SIZE``.
    output_path : str, optional
        CSV file to append to.
    log_path : str, optional
        Text file that receives one line per failed batch or failed track.
    client : object, optional
        Object exposing ``tracks(list_of_uris)``; defaults to the notebook's
        module-level ``sp`` client.

    Notes
    -----
    Errors never propagate: a failing request is logged together with the
    batch offset (so it can be retried later) and followed by a 3 second
    pause; a single malformed track is logged and skipped without discarding
    the rest of its batch.
    """
    api = client if client is not None else sp
    stamp_format = '%d.%b %Y %H:%M:%S'

    # newline='' is what pandas asks for when to_csv receives an open text
    # handle; without it Windows writes an extra blank line after every row.
    with open(output_path, 'a', encoding='utf-8', newline='') as f, \
            open(log_path, 'a', encoding='utf-8') as log_file:
        for i in tqdm(range(0, len(track_uris), BATCH_SIZE)):
            batch_uris = track_uris[i:i + BATCH_SIZE]
            try:
                time.sleep(DELAY)  # Delay each request
                track_features = api.tracks(batch_uris)
                track_data_list = []

                for track, uri in zip(track_features['tracks'], batch_uris):
                    if not track:  # Check if track is not None
                        continue
                    try:
                        track_data_list.append(_track_to_row(track, uri))
                    except (KeyError, TypeError, AttributeError) as error:
                        # One bad track must not cost the other tracks of the batch
                        log_file.write(
                            f"{datetime.datetime.now().strftime(stamp_format)}: "
                            f"track {uri}: {error!r}\n"
                        )

                # Create DataFrame from list and write to CSV
                if track_data_list:
                    pd.DataFrame(track_data_list).to_csv(f, header=False, index=False)
                    f.flush()  # keep the file current so a resume sees this batch
            except Exception as error:
                log_file.write(
                    f"{datetime.datetime.now().strftime(stamp_format)}: "
                    f"batch starting at index {i}: {str(error)}\n"
                )
                time.sleep(3)  # Delay after an exception

# NOTE(review): this processes every URI in t_uri from the beginning and appends
# to the CSV; the `start_index` computed earlier is not used here — confirm
# whether the run should resume with t_uri[start_index:] instead.
fetch_and_process_tracks(t_uri)

  0%|          | 0/45246 [00:00<?, ?it/s]

In [13]:
# Resume the download at `start_index`, appending one header-less
# `uri,release_date,popularity` row per track to the features CSV.
# NOTE: these 3-column rows differ from the 6-column rows that
# fetch_and_process_tracks appends to the same file.
e = 0  # number of batches that raised an exception

# `with` closes the CSV even if the loop is interrupted; newline='' leaves the
# line endings to pandas (avoids blank rows on Windows).
with open('data/track_features.csv', 'a', encoding='utf-8', newline='') as f:
    for i in tqdm(range(start_index, len(t_uri), 50)):
        batch_uris = t_uri[i:i+50]  # the final batch may hold fewer than 50 URIs
        try:
            time.sleep(DELAY)  # Ensure we respect the rate limit
            track_features = sp.tracks(batch_uris)
            # zip stops at the shorter sequence, so a short final batch cannot
            # index out of range; entries returned as None are skipped instead
            # of aborting the remainder of the batch.
            rows = [
                (uri, track['album']['release_date'], track["popularity"])
                for uri, track in zip(batch_uris, track_features['tracks'])
                if track
            ]
            if rows:
                pd.DataFrame(rows).to_csv(f, header=False, index=False)
                f.flush()  # keep the file current for the resume logic
        except Exception as error:
            e += 1
            with open("track_features.txt", "a") as r:
                r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S") + ": " + str(error) + '\n')
            time.sleep(3)

# Logging the final count of errors
with open("track_features.txt", "a") as r:
    r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S") + " _________________________ " + "Total Number Of Errors : " + str(e) + " _________________________ " + '\n')

  0%|          | 0/42998 [00:00<?, ?it/s]

In [None]:
import pandas as pd
import time
import datetime
from tqdm import tqdm

# Constants
REQUESTS_PER_MINUTE = 180
SECONDS_PER_MINUTE = 60
DELAY = SECONDS_PER_MINUTE / REQUESTS_PER_MINUTE  # Calculate delay to fit the rate limit

# Download every track from the start of t_uri, appending one header-less
# `uri,artists,release_date,popularity` row per track to the features CSV.
e = 0  # number of batches that raised an exception

# UTF-8 because artist names may contain non-ASCII characters; newline='' leaves
# line endings to pandas; `with` closes the file even if the loop is interrupted.
with open('data/track_features.csv', 'a', encoding='utf-8', newline='') as f:
    for i in tqdm(range(0, len(t_uri), 50)):
        batch_uris = t_uri[i:i+50]  # the final batch may hold fewer than 50 URIs
        try:
            # Delay each request to ensure we do not exceed the rate limit of 180 requests per minute
            time.sleep(DELAY)

            track_features = sp.tracks(batch_uris)
            # zip stops at the shorter sequence, so a short final batch cannot
            # index out of range; entries returned as None are skipped instead
            # of aborting the remainder of the batch.
            rows = [
                (
                    uri,
                    # Same comma-separated artist list as fetch_and_process_tracks
                    ', '.join(artist['name'] for artist in track['artists']),
                    track['album']['release_date'],
                    track["popularity"],
                )
                for uri, track in zip(batch_uris, track_features['tracks'])
                if track
            ]
            if rows:
                pd.DataFrame(rows).to_csv(f, header=False, index=False)
                f.flush()  # keep the file current for the resume logic
        except Exception as error:
            e += 1
            with open("track_features.txt", "a") as r:
                r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S") + ": " + str(error) + '\n')
            time.sleep(3)

# Logging the final count of errors
with open("track_features.txt", "a") as r:
    r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S") + " _________________________ " + "Total Number Of Errors : " + str(e) + " _________________________ " + '\n')

 10%|▉         | 4318/45246 [52:08<8:55:02,  1.27it/s] 

In [None]:
# Download every track from the start of t_uri, appending one header-less
# `uri,release_date,popularity` row per track to the features CSV.
# No inter-request delay is applied in this variant.
e = 0  # number of batches that raised an exception

# `with` closes the CSV even if the loop is interrupted; newline='' leaves the
# line endings to pandas (avoids blank rows on Windows).
with open('data/track_features.csv', 'a', encoding='utf-8', newline='') as f:
    for i in tqdm(range(0, len(t_uri), 50)):
        batch_uris = t_uri[i:i+50]  # the final batch may hold fewer than 50 URIs
        try:
            track_features = sp.tracks(batch_uris)
            # zip stops at the shorter sequence, so a short final batch cannot
            # index out of range; entries returned as None are skipped instead
            # of aborting the remainder of the batch.
            rows = [
                (uri, track['album']['release_date'], track["popularity"])
                for uri, track in zip(batch_uris, track_features['tracks'])
                if track
            ]
            if rows:
                pd.DataFrame(rows).to_csv(f, header=False, index=False)
                f.flush()  # keep the file current for the resume logic
        except Exception as error:
            e += 1
            with open("track_features.txt", "a") as r:
                r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(error)+'\n')
            time.sleep(3)

# Logging the final count of errors
with open("track_features.txt", "a") as r:
    r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+" _________________________ "+"Total Number Of Errors : "+str(e)+" _________________________ "+'\n')