In [None]:
import requests
import random
import string
import base64
import os
import csv
import time

# Spotify API credentials.
# They are read from the environment so that the client secret never lives in
# the notebook itself: export SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET before
# starting the kernel. When a variable is unset the value is an empty string and
# the token request will be rejected by the API.
# NOTE(review): a key pair that has ever been committed in plain text should be
# treated as leaked and rotated in the Spotify developer dashboard.
client_id = os.environ.get('SPOTIFY_CLIENT_ID', '')
client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET', '')

# File path to store artist data
file_path = "spotify_artists.csv"

# Step 1: Obtain an Access Token
def get_access_token(client_id, client_secret):
    """Request a bearer token from Spotify using the client-credentials grant.

    The ID/secret pair is base64-encoded as ``"<client_id>:<client_secret>"``
    and sent in an HTTP Basic ``Authorization`` header to the token endpoint.

    Args:
        client_id: Spotify application client ID.
        client_secret: Spotify application client secret.

    Returns:
        The ``access_token`` value from the JSON response body.

    Raises:
        Exception: If the endpoint answers with any status other than 200; the
            message carries the status code and the response text.
    """
    encoded_pair = base64.b64encode(f"{client_id}:{client_secret}".encode()).decode()

    reply = requests.post(
        'https://accounts.spotify.com/api/token',
        headers={'Authorization': f'Basic {encoded_pair}'},
        data={'grant_type': 'client_credentials'},
    )

    # Guard clause: anything other than 200 means no token was issued.
    if reply.status_code != 200:
        raise Exception(f"Failed to get access token: {reply.status_code} - {reply.text}")

    print("Successfully obtained access token.")
    return reply.json()['access_token']

# Step 2: Search Spotify for random artist IDs
def get_random_artist_ids(access_token, limit=50, timeout=30):
    """Search Spotify for artists matching a random three-letter query.

    Args:
        access_token: Bearer token sent in the ``Authorization`` header.
        limit: Value of the search ``limit`` parameter (results per request).
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises, so a stalled connection cannot hang the crawl forever.

    Returns:
        A list with the ``id`` of every artist item in the search response, or
        an empty list when the API answers with a status other than 200/429.

    Raises:
        requests.exceptions.RequestException: On network errors or timeouts.
    """
    url = "https://api.spotify.com/v1/search"
    headers = {'Authorization': f'Bearer {access_token}'}

    # Retry in a loop rather than by recursion: a long run of HTTP 429 answers
    # must not be able to exhaust the interpreter's recursion limit.
    while True:
        params = {
            # A fresh random query is drawn for every attempt.
            'q': ''.join(random.choice(string.ascii_lowercase) for _ in range(3)),
            'type': 'artist',
            'limit': limit
        }
        response = requests.get(url, headers=headers, params=params, timeout=timeout)

        if response.status_code == 429:  # Rate limit
            # Wait for the period the server asks for (5 s if it gives none).
            retry_after = int(response.headers.get("Retry-After", 5))
            print(f"Rate limit hit. Retrying in {retry_after} seconds.")
            time.sleep(retry_after)
            continue

        if response.status_code == 200:
            artist_items = response.json()['artists']['items']
            artist_ids = [artist['id'] for artist in artist_items]
            print(f"Retrieved {len(artist_ids)} artist IDs.")
            return artist_ids

        print(f"Failed to search for artist IDs: {response.status_code} - {response.text}")
        return []

# Step 3: Get Artist Details for a List of IDs
def get_artists_details(access_token, artist_ids, timeout=30):
    """Fetch the artist objects for a list of artist IDs in one request.

    Args:
        access_token: Bearer token sent in the ``Authorization`` header.
        artist_ids: Artist IDs; they are comma-joined into the ``ids``
            query parameter.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises, so a stalled connection cannot hang the crawl forever.

    Returns:
        The non-empty entries of the response's ``artists`` list. An empty list
        is returned when ``artist_ids`` is empty (no request is sent) or when
        the API answers with a status other than 200/429.

    Raises:
        requests.exceptions.RequestException: On network errors or timeouts.
    """
    artist_ids = list(artist_ids)
    if not artist_ids:
        # Nothing to look up; skip a request that could only fail.
        return []

    url = "https://api.spotify.com/v1/artists"
    headers = {'Authorization': f'Bearer {access_token}'}
    params = {'ids': ','.join(artist_ids)}

    # Retry in a loop rather than by recursion so that a long run of HTTP 429
    # answers cannot exhaust the interpreter's recursion limit.
    while True:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)

        if response.status_code == 429:  # Rate limit
            retry_after = int(response.headers.get("Retry-After", 5))
            print(f"Rate limit hit. Retrying in {retry_after} seconds.")
            time.sleep(retry_after)
            continue

        if response.status_code == 200:
            # Drop null/empty entries (presumably IDs the API could not
            # resolve - TODO confirm) so callers can index every element.
            return [artist for artist in response.json().get('artists', []) if artist]

        print(f"Failed to retrieve artist details: {response.status_code} - {response.text}")
        return []

# Step 4: Write artist details to CSV
def write_artists_to_csv(artists):
    """Append one CSV row per artist to the module-level ``file_path``.

    A header row is written first when the file did not exist before the call.
    Each row holds the artist's id, name, popularity, follower total, genres
    joined with ``", "`` and Spotify URL.

    Args:
        artists: Iterable of artist dictionaries as returned by the API.
    """
    header_needed = not os.path.isfile(file_path)

    def _as_row(entry):
        # Genres are optional in the payload; every other field is required.
        genre_text = ", ".join(entry.get('genres', []))
        return [
            entry['id'],
            entry['name'],
            entry['popularity'],
            entry['followers']['total'],
            genre_text,
            entry['external_urls']['spotify'],
        ]

    with open(file_path, mode='a', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        if header_needed:
            out.writerow(['id', 'name', 'popularity', 'followers', 'genres', 'url'])  # CSV headers
        # The generator keeps rows flowing to disk one at a time.
        out.writerows(_as_row(entry) for entry in artists)

    print(f"Wrote {len(artists)} artists to CSV.")

# Step 5: Main function to collect details for 100,000 artists
def collect_artists(access_token, target_count=100000, max_stalled_searches=20):
    """Crawl random searches until ``target_count`` artists have been written.

    Artist IDs are gathered with ``get_random_artist_ids``, resolved in batches
    of up to 50 with ``get_artists_details`` and appended to the CSV with
    ``write_artists_to_csv``.

    IDs that were already sent for details are remembered for the whole run, so
    an artist that shows up in several searches is written only once. The crawl
    also gives up instead of spinning forever when searches stop producing new
    IDs (for example because every request is failing).

    Args:
        access_token: Bearer token passed through to the API helpers.
        target_count: Number of artists to collect before stopping. The final
            count may overshoot by less than one batch.
        max_stalled_searches: How many consecutive searches may add no new ID
            before the current gathering round is abandoned.
    """
    batch_size = 50
    seen_ids = set()     # IDs already sent for details during this run
    pending_ids = set()  # Fresh IDs waiting for the next details request
    total_collected = 0

    while total_collected < target_count:
        # Step 5.1: Gather unseen artist IDs until there is a full batch.
        stalled = 0
        while len(pending_ids) < batch_size:
            # Always ask for a full page: more candidates per call means a
            # better chance of finding IDs that have not been seen yet.
            found = get_random_artist_ids(access_token, limit=batch_size)
            size_before = len(pending_ids)
            pending_ids.update(set(found) - seen_ids)
            if len(pending_ids) > size_before:
                stalled = 0
                continue
            stalled += 1
            if stalled >= max_stalled_searches:
                break

        if not pending_ids:
            print("No new artist IDs could be retrieved; stopping early.")
            break

        # Step 5.2: Get artist details in batches of up to 50
        batch_ids = list(pending_ids)[:batch_size]
        artist_details = get_artists_details(access_token, batch_ids)
        write_artists_to_csv(artist_details)

        # Update the count and retire the processed IDs for the rest of the run.
        total_collected += len(artist_details)
        seen_ids.update(batch_ids)
        pending_ids.difference_update(batch_ids)

        print(f"Total artists collected so far: {total_collected}")

# Run the collection process
if __name__ == "__main__":
    # Authenticate once with the module-level credentials, then reuse the
    # resulting token for the whole crawl.
    access_token = get_access_token(client_id, client_secret)
    # Keep crawling until 100,000 artist rows have been appended to the CSV.
    collect_artists(access_token, target_count=100000)


Successfully obtained access token.
Retrieved 50 artist IDs.
Wrote 50 artists to CSV.
Total artists collected so far: 50
Retrieved 50 artist IDs.
Wrote 50 artists to CSV.
Total artists collected so far: 100
Retrieved 18 artist IDs.
Retrieved 32 artist IDs.
Wrote 50 artists to CSV.
Total artists collected so far: 150
Retrieved 34 artist IDs.
Retrieved 16 artist IDs.
Wrote 50 artists to CSV.
Total artists collected so far: 200
Retrieved 25 artist IDs.
Retrieved 25 artist IDs.
Wrote 50 artists to CSV.
Total artists collected so far: 250
Retrieved 50 artist IDs.
Wrote 50 artists to CSV.
Total artists collected so far: 300
Retrieved 50 artist IDs.
Wrote 50 artists to CSV.
Total artists collected so far: 350
Retrieved 38 artist IDs.
Retrieved 12 artist IDs.
Wrote 50 artists to CSV.
Total artists collected so far: 400
Retrieved 50 artist IDs.
Wrote 50 artists to CSV.
Total artists collected so far: 450
Retrieved 50 artist IDs.
Wrote 50 artists to CSV.
Total artists collected so far: 500
Retri

In [None]:
import requests
import random
import string
import base64
import os
import csv
import time

# Spotify API credentials.
# They are read from the environment so that the client secret never lives in
# the notebook itself: export SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET before
# starting the kernel. When a variable is unset the value is an empty string and
# the token request will be rejected by the API.
# NOTE(review): a key pair that has ever been committed in plain text should be
# treated as leaked and rotated in the Spotify developer dashboard.
client_id = os.environ.get('SPOTIFY_CLIENT_ID', '')
client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET', '')

# File paths
artist_file_path = "spotify_artists.csv"
track_file_path = "spotify_top_tracks.csv"

# Step 1: Obtain an Access Token
def get_access_token(client_id, client_secret):
    """Obtain an app-level access token through the client-credentials flow.

    Args:
        client_id: Spotify application client ID.
        client_secret: Spotify application client secret.

    Returns:
        The ``access_token`` field of the token endpoint's JSON response.

    Raises:
        Exception: When the token endpoint responds with a status other than
            200; the message includes the status code and the response text.
    """
    # HTTP Basic credentials: base64("<client_id>:<client_secret>").
    raw_pair = f"{client_id}:{client_secret}".encode()
    basic_value = base64.b64encode(raw_pair).decode()

    auth_headers = {'Authorization': f'Basic {basic_value}'}
    form = {'grant_type': 'client_credentials'}
    reply = requests.post('https://accounts.spotify.com/api/token', headers=auth_headers, data=form)

    succeeded = reply.status_code == 200
    if not succeeded:
        raise Exception(f"Failed to get access token: {reply.status_code} - {reply.text}")

    print("Successfully obtained access token.")
    return reply.json()['access_token']

# Step 2: Read artist IDs from the CSV file
def read_artist_ids(file_path):
    """Load the distinct artist IDs from the ``id`` column of a CSV file.

    IDs are returned in first-seen order. Repeated IDs are collapsed so that
    each artist is requested from the API only once, and rows with an empty
    ``id`` are skipped.

    Args:
        file_path: Path of a CSV file whose header row contains an ``id`` column.

    Returns:
        A list of unique, non-empty artist ID strings.

    Raises:
        KeyError: If the file has data rows but no ``id`` column.
        OSError: If the file cannot be opened.
    """
    unique_ids = {}
    # newline='' lets the csv module do its own newline handling, which is
    # needed for quoted fields that contain line breaks.
    with open(file_path, mode='r', newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            artist_id = row['id']
            if artist_id:
                # A dict keeps insertion order and ignores repeated keys.
                unique_ids.setdefault(artist_id, None)
    artist_ids = list(unique_ids)
    print(f"Loaded {len(artist_ids)} artist IDs from file.")
    return artist_ids

# Step 3: Get Top Tracks for a Given Artist ID
def get_top_tracks(access_token, artist_id, country='US', limit=5, timeout=30):
    """Fetch an artist's top tracks and keep at most ``limit`` of them.

    Args:
        access_token: Bearer token sent in the ``Authorization`` header.
        artist_id: ID of the artist whose top tracks are requested.
        country: Value of the ``market`` query parameter.
        limit: Maximum number of tracks to keep from the response.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises, so a stalled connection cannot hang the run forever.

    Returns:
        The first ``limit`` entries of the response's ``tracks`` list, or an
        empty list when the API answers with a status other than 200/429.

    Raises:
        requests.exceptions.RequestException: On network errors or timeouts.
    """
    url = f"https://api.spotify.com/v1/artists/{artist_id}/top-tracks"
    headers = {'Authorization': f'Bearer {access_token}'}
    params = {'market': country}  # The country parameter affects popularity ranking

    # Retry in a loop rather than by recursion so that a long run of HTTP 429
    # answers cannot exhaust the interpreter's recursion limit.
    while True:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)

        if response.status_code == 429:  # Rate limit
            # Wait for the period the server asks for (5 s if it gives none).
            retry_after = int(response.headers.get("Retry-After", 5))
            print(f"Rate limit hit. Retrying in {retry_after} seconds.")
            time.sleep(retry_after)
            continue

        if response.status_code == 200:
            return response.json()['tracks'][:limit]  # Limit to top `n` tracks

        print(f"Failed to retrieve top tracks for artist ID {artist_id}: {response.status_code} - {response.text}")
        return []

# Step 4: Get Audio Features for Multiple Tracks
def get_audio_features_batch(access_token, track_ids, request_delay=30, timeout=30):
    """Fetch audio features for any number of tracks, 100 IDs per request.

    Args:
        access_token: Bearer token sent in the ``Authorization`` header.
        track_ids: Track IDs to look up, in the order results are wanted.
        request_delay: Seconds to pause before every request (throttle).
        timeout: Seconds to wait for each HTTP response before ``requests``
            raises, so a stalled connection cannot hang the run forever.

    Returns:
        A list aligned position-by-position with ``track_ids``. Entries are the
        objects from the response's ``audio_features`` list; positions that
        belong to a failed request are filled with ``None``. An empty list is
        returned when ``track_ids`` is empty or when every request failed.

    Raises:
        requests.exceptions.RequestException: On network errors or timeouts.
    """
    url = "https://api.spotify.com/v1/audio-features"
    headers = {'Authorization': f'Bearer {access_token}'}
    # NOTE(review): 100 is the documented per-request ID cap for this endpoint;
    # confirm against the current API reference.
    max_ids_per_request = 100

    track_ids = list(track_ids)
    if not track_ids:
        # Nothing to look up; skip the delay and a request that could only fail.
        return []

    collected = []
    any_success = False

    for start in range(0, len(track_ids), max_ids_per_request):
        chunk = track_ids[start:start + max_ids_per_request]
        params = {'ids': ','.join(chunk)}

        # Retry in a loop rather than by recursion so that a long run of HTTP
        # 429 answers cannot exhaust the interpreter's recursion limit.
        while True:
            time.sleep(request_delay)
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
            if response.status_code != 429:  # Anything but a rate limit ends the retries
                break
            retry_after = int(response.headers.get("Retry-After", 5))
            print(f"Rate limit hit for audio features batch. Retrying in {retry_after} seconds.")
            time.sleep(retry_after)

        if response.status_code == 200:
            features = list(response.json()['audio_features'])
            any_success = True
        else:
            print(f"Failed to retrieve audio features batch: {response.status_code} - {response.text}")
            features = []

        # Pad short or failed answers so later chunks stay aligned with track_ids.
        features.extend([None] * (len(chunk) - len(features)))
        collected.extend(features)

    return collected if any_success else []

# Step 5: Write track details with audio features to CSV
def write_tracks_to_csv(tracks):
    """Append one CSV row per track to the module-level ``track_file_path``.

    A header row is written first when the file did not exist before the call.
    The seven descriptive columns must be present in every track dictionary;
    the eleven audio-feature columns are optional and left empty when missing.

    Args:
        tracks: List of track dictionaries keyed by the CSV column names.
    """
    # Columns read with ``track[key]`` (a missing key raises KeyError).
    required_columns = (
        'artist_id', 'track_id', 'track_name', 'popularity', 'album_name',
        'release_date', 'external_url',
    )
    # Columns read with ``track.get(key)`` (a missing key yields an empty cell).
    feature_columns = (
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    )

    header_needed = not os.path.isfile(track_file_path)

    with open(track_file_path, mode='a', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        if header_needed:
            out.writerow(list(required_columns + feature_columns))
        for entry in tracks:
            fixed_cells = [entry[column] for column in required_columns]
            feature_cells = [entry.get(column) for column in feature_columns]
            out.writerow(fixed_cells + feature_cells)

    print(f"Wrote {len(tracks)} tracks to CSV.")

# Step 6: Main Function to Collect Top Tracks with Batched Audio Features
def _enrich_and_write_tracks(access_token, batch):
    """Merge audio features into ``batch`` in place, append it to the CSV and return its size."""
    track_ids = [track['track_id'] for track in batch]
    audio_features = get_audio_features_batch(access_token, track_ids)

    # Features are matched to tracks by position; zip stops at the shorter
    # list, so a short or empty answer simply leaves the remaining rows bare.
    for track, features in zip(batch, audio_features):
        if features:  # Ensure audio features data is valid
            track.update(features)

    write_tracks_to_csv(batch)
    return len(batch)


def collect_top_tracks_with_features(access_token, artist_ids, track_limit=5):
    """Collect each artist's top tracks, add audio features and write them to CSV.

    Tracks are buffered and flushed in batches of exactly 100, so no single
    audio-features lookup is asked for more IDs than that; whatever is left
    after the last artist is flushed as a final, smaller batch.

    Args:
        access_token: Bearer token passed through to the API helpers.
        artist_ids: Iterable of artist IDs to process, in order.
        track_limit: Maximum number of top tracks to keep per artist.
    """
    # NOTE(review): 100 is the documented ID cap of the audio-features
    # endpoint; confirm against the current API reference.
    max_batch_size = 100
    total_tracks_collected = 0
    pending_tracks = []

    for artist_id in artist_ids:
        # Get the top tracks for the artist
        top_tracks = get_top_tracks(access_token, artist_id, limit=track_limit)

        # Structure the data without audio features for now
        for track in top_tracks:
            pending_tracks.append({
                'artist_id': artist_id,
                'track_id': track['id'],
                'track_name': track['name'],
                'popularity': track['popularity'],
                'album_name': track['album']['name'],
                'release_date': track['album']['release_date'],
                'external_url': track['external_urls']['spotify']
            })

        # Flush full batches only; tracks beyond the cap wait for the next one.
        while len(pending_tracks) >= max_batch_size:
            batch = pending_tracks[:max_batch_size]
            pending_tracks = pending_tracks[max_batch_size:]
            total_tracks_collected += _enrich_and_write_tracks(access_token, batch)

            # Display progress
            print(f"Total tracks with audio features collected so far: {total_tracks_collected}")

    # Process any remaining tracks in the last batch
    if pending_tracks:
        total_tracks_collected += _enrich_and_write_tracks(access_token, pending_tracks)
        print(f"Final batch written. Total tracks collected: {total_tracks_collected}")

# Run the collection process
if __name__ == "__main__":
    # Step 1: Get access token
    # (authenticates once with the module-level credentials; the same token is
    # reused for every request made below)
    access_token = get_access_token(client_id, client_secret)
    
    # Step 2: Load artist IDs
    # (read from the ``id`` column of the CSV named by artist_file_path)
    artist_ids = read_artist_ids(artist_file_path)
    
    # Step 3: Collect top tracks with batched audio features
    # (keeps up to 5 top tracks per artist; rows are appended to the CSV named
    # by track_file_path)
    collect_top_tracks_with_features(access_token, artist_ids, track_limit=5)


Successfully obtained access token.
Loaded 118000 artist IDs from file.
Rate limit hit. Retrying in 32394 seconds.
