In [None]:
import aiohttp
import asyncio
import random
import string
import pandas as pd
import base64
import time
import os

# Spotify API credentials.
# Prefer supplying these through the SPOTIFY_CLIENT_ID / SPOTIFY_CLIENT_SECRET
# environment variables so that secrets are never saved inside the notebook.
# The empty-string fallbacks keep the old behaviour and can still be replaced
# by hand for a quick local run.
client_id = os.environ.get('SPOTIFY_CLIENT_ID', '')
client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET', '')

# File path to new or existing Excel file with downloaded data.
# If the file already exists, newly collected rows are appended to its contents.
file_path = r""

# Step 1: Exchange the client ID / client secret pair for a bearer token
async def get_access_token(client_id, client_secret):
    """Request an access token using the client-credentials grant.

    The ID and secret are joined with a colon, base64-encoded and sent as an
    HTTP Basic ``Authorization`` header to the Spotify accounts token endpoint.

    Args:
        client_id: Application client ID.
        client_secret: Application client secret.

    Returns:
        The ``access_token`` field of the JSON reply.

    Raises:
        Exception: If the endpoint answers with any status other than 200.
            The exception args carry a message, the status code and the
            response body text.
    """
    raw_pair = f"{client_id}:{client_secret}".encode()
    basic_credentials = base64.b64encode(raw_pair).decode()

    async with aiohttp.ClientSession() as http:
        pending_reply = http.post(
            'https://accounts.spotify.com/api/token',
            headers={'Authorization': f'Basic {basic_credentials}'},
            data={'grant_type': 'client_credentials'},
        )
        async with pending_reply as reply:
            # Guard clause: anything but 200 is reported together with the body.
            if reply.status != 200:
                raise Exception("Failed to get access token", reply.status, await reply.text())
            payload = await reply.json()
            return payload['access_token']

# Main asynchronous function to collect tracks
async def main():
    """Collect random Spotify tracks and append them to the Excel file.

    Obtains an access token, then repeatedly searches for random short
    queries, enriches every returned track with its audio features and the
    metadata of its first artist, and writes the rows to ``file_path`` in
    batches.

    Every HTTP request goes through one shared semaphore so that at most
    ``max_concurrent_requests`` requests are in flight at a time. Requests
    that fail with a status other than 429 are reported and skipped instead
    of being retried forever.
    """
    access_token = await get_access_token(client_id, client_secret)

    default_wait_time = 5          # seconds to wait on a 429 without a usable Retry-After
    max_concurrent_requests = 10   # Limit concurrency across ALL API calls
    request_slots = asyncio.Semaphore(max_concurrent_requests)

    def get_random_query():
        """Return a random lowercase string of 3 to 5 letters."""
        return ''.join(random.choice(string.ascii_lowercase) for _ in range(random.randint(3, 5)))

    # Handle rate limits
    async def handle_rate_limit(response, current_wait_time):
        """Sleep when ``response`` is a 429 and return the time waited.

        The ``Retry-After`` header is used when it holds an integer; otherwise
        ``current_wait_time`` is used as the delay. For any other status no
        sleep happens and ``current_wait_time`` is returned unchanged.
        """
        if response.status != 429:
            return current_wait_time
        try:
            wait_time = int(response.headers.get('Retry-After'))
        except (TypeError, ValueError):
            # Header missing or not an integer number of seconds.
            wait_time = current_wait_time
        # print(f"Rate limit exceeded. Retrying after {wait_time} seconds.")
        await asyncio.sleep(wait_time)
        return wait_time

    async def fetch_json(session, url, access_token, failure_message, params=None,
                         wait_time=default_wait_time):
        """GET ``url`` and return the decoded JSON body, or None on failure.

        A 429 is retried after ``handle_rate_limit`` has slept. Any other
        non-200 status is printed (``failure_message`` plus the status code)
        and None is returned, so callers never spin on a permanent error.
        The semaphore slot is held while sleeping so that a rate-limited
        worker does not hand its slot to another request immediately.
        """
        headers = {'Authorization': f'Bearer {access_token}'}
        while True:
            async with request_slots:
                async with session.get(url, headers=headers, params=params) as response:
                    if response.status == 200:
                        return await response.json()
                # The connection is released here; status and headers stay readable.
                if response.status != 429:
                    print(f"{failure_message}. Status code: {response.status}")
                    return None
                wait_time = await handle_rate_limit(response, wait_time)

    # Search for random tracks
    async def search_random_tracks(session, access_token, query, current_wait_time, limit=10):
        """Search tracks matching ``query``.

        Returns a ``(tracks, wait_time)`` tuple. On success ``wait_time`` is
        reset to the default; on failure the track list is empty and
        ``current_wait_time`` is passed back unchanged.
        """
        payload = await fetch_json(
            session,
            "https://api.spotify.com/v1/search",
            access_token,
            f"Search failed for query '{query}'",
            params={'q': query, 'type': 'track', 'limit': limit},
            wait_time=current_wait_time,
        )
        if payload is None:
            return [], current_wait_time
        return payload['tracks']['items'], default_wait_time  # Reset wait time

    # Get audio features with error handling
    async def get_audio_features(session, track_id, access_token):
        """Return the audio-features JSON for ``track_id``, or None on failure."""
        return await fetch_json(
            session,
            f"https://api.spotify.com/v1/audio-features/{track_id}",
            access_token,
            f"Failed to retrieve audio features for track ID {track_id}",
        )

    # Get artist metadata with error handling
    async def get_artist_metadata(session, artist_id, access_token):
        """Return selected artist fields as a dict, or None on failure."""
        artist_data = await fetch_json(
            session,
            f"https://api.spotify.com/v1/artists/{artist_id}",
            access_token,
            f"Failed to retrieve artist metadata for artist ID {artist_id}",
        )
        if artist_data is None:
            return None
        artist_image_url = artist_data['images'][0]['url'] if artist_data['images'] else None
        return {
            'genres': artist_data.get('genres', []),
            'artist_popularity': artist_data.get('popularity'),
            'artist_followers': artist_data['followers']['total'],
            'artist_external_url': artist_data['external_urls']['spotify'],
            'artist_image_url': artist_image_url,
        }

    # Combine track metadata and audio features
    async def get_combined_data(session, track, access_token):
        """Build one output row for ``track``.

        Returns None when the audio features cannot be fetched. Missing artist
        metadata only blanks the artist columns. Each lookup is attempted
        once: the fetchers already retry on 429, and retrying a permanent
        failure would never terminate.
        """
        track_id = track['id']

        audio_features = await get_audio_features(session, track_id, access_token)
        if not audio_features:
            return None

        artist_id = track['artists'][0]['id']
        artist_metadata = await get_artist_metadata(session, artist_id, access_token)

        album_image_url = track['album']['images'][0]['url'] if track['album']['images'] else None

        track_info = {
            'track_id': track_id,
            'artists': ', '.join([artist['name'] for artist in track['artists']]),
            'album_name': track['album']['name'],
            'release_date': track['album']['release_date'],
            'album_image_url': album_image_url,
            'track_name': track['name'],
            'popularity': track['popularity'],
            'duration_ms': track['duration_ms'],
            'explicit': track['explicit'],
            'available_markets': ', '.join(track.get('available_markets', [])),
            'track_external_url': track['external_urls']['spotify'],
            'track_genre': ', '.join(artist_metadata['genres']) if artist_metadata else None,
            'artist_popularity': artist_metadata['artist_popularity'] if artist_metadata else None,
            'artist_followers': artist_metadata['artist_followers'] if artist_metadata else None,
            'artist_image_url': artist_metadata['artist_image_url'] if artist_metadata else None,
            'artist_external_url': artist_metadata['artist_external_url'] if artist_metadata else None,
        }
        # Keys present in both dicts take the audio-features value.
        return {**track_info, **audio_features}

    def save_batch(rows):
        """Append ``rows`` to the Excel file at ``file_path``; return the row count."""
        data_to_write = pd.DataFrame(rows)
        if os.path.exists(file_path):
            existing_data = pd.read_excel(file_path)
            data_to_write = pd.concat([existing_data, data_to_write], ignore_index=True)
        data_to_write.to_excel(file_path, index=False)
        return len(rows)

    # Collect random tracks asynchronously
    async def collect_random_tracks(access_token, num_tracks=10000):
        """Collect rows until at least ``num_tracks`` have been saved.

        Rows are written once 1000 have accumulated or as soon as the target
        is reached. Collection stops early after several consecutive rounds
        that add no rows, which would otherwise loop forever (for example
        when the token has expired or an endpoint is forbidden).
        """
        batch_data = []
        start_time = time.time()
        total_collected = 0
        batch_size = 30            # tracks requested per search query
        queries_per_round = 10
        write_threshold = 1000     # Write in batches of 1000 for efficiency
        max_stalled_rounds = 5
        stalled_rounds = 0
        current_wait_time = default_wait_time

        async with aiohttp.ClientSession() as session:
            while total_collected < num_tracks:
                queries = [get_random_query() for _ in range(queries_per_round)]
                print(f"Generated queries: {queries}")

                results = await asyncio.gather(*(
                    search_random_tracks(session, access_token, query, current_wait_time, limit=batch_size)
                    for query in queries
                ))

                random_tracks = []
                for tracks, wait_time in results:
                    current_wait_time = wait_time
                    print(f"Collected {len(tracks)} tracks for query.")
                    random_tracks.extend(track for track in tracks if track)

                new_rows = []
                if random_tracks:
                    print(f"Found {len(random_tracks)} random tracks to process.")
                    combined_data_list = await asyncio.gather(*(
                        get_combined_data(session, track, access_token) for track in random_tracks
                    ))
                    new_rows = [data for data in combined_data_list if data]
                    batch_data.extend(new_rows)

                if new_rows:
                    stalled_rounds = 0
                else:
                    stalled_rounds += 1
                    if stalled_rounds >= max_stalled_rounds:
                        print(f"No new tracks collected in {stalled_rounds} consecutive rounds; stopping early.")
                        break

                # Count the unsaved rows towards the target so that a small
                # num_tracks does not have to wait for a full batch.
                target_reached = total_collected + len(batch_data) >= num_tracks
                if batch_data and (len(batch_data) >= write_threshold or target_reached):
                    total_collected += save_batch(batch_data)
                    batch_data = []  # Clear batch
                    print(f"{total_collected} tracks collected and saved to Excel.")

        # Rows can only be left over after an early stop; do not lose them.
        if batch_data:
            total_collected += save_batch(batch_data)
            print(f"{total_collected} tracks collected and saved to Excel.")

        end_time = time.time()
        print(f"Collection finished. Total tracks collected: {total_collected}. Time taken: {end_time - start_time:.2f} seconds.")

    # Run the collection
    await collect_random_tracks(access_token)

# Entry point for Jupyter or similar environments.
# NOTE: a bare top-level `await` is only accepted by hosts that already run an
# event loop and allow it (e.g. an IPython/Jupyter cell). Run as a plain script,
# this statement is a syntax error and `asyncio.run(main())` would be needed.
if __name__ == "__main__":
    await main()  # Use this in a notebook or interactive environment

Generated queries: ['blzx', 'rfscr', 'pju', 'rlc', 'tfr', 'uowr', 'khet', 'eoh', 'fqgbe', 'nbi']
Collected 0 tracks for query.
Collected 0 tracks for query.
Collected 30 tracks for query.
Collected 30 tracks for query.
Collected 30 tracks for query.
Collected 0 tracks for query.
Collected 30 tracks for query.
Collected 30 tracks for query.
Collected 0 tracks for query.
Collected 30 tracks for query.
Found 180 random tracks to process.
