In [4]:
import pandas as pd
import requests
import spotipy
from dotenv import load_dotenv
import os
from spotipy.oauth2 import SpotifyOAuth
import json
import time
import datetime as dt
from requests.exceptions import ReadTimeout, HTTPError
import base64

# Load environment variables
load_dotenv()

CLIENT_ID = os.getenv('client_id')
CLIENT_SECRET = os.getenv('client_secret')

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    redirect_uri='http://localhost:8888/callback',
    scope="user-library-read"
))

global NUM_CALLS
NUM_CALLS = 0

def api_call_wrapper(func, *args, **kwargs):
    """
    Wrapper for Spotify API calls to handle rate limiting and retries.
    """
    max_retries=3
    delay=1
    backoff_factor=2,
    retries = 0
    current_delay = delay
    while retries < max_retries:
        try:
            # Make the API call
            response = func(*args, **kwargs)
            global NUM_CALLS
            NUM_CALLS += 1
            return response
        
        except HTTPError as e:
            # Handle rate-limiting or server errors (status code 429 or 5xx)
            if e.response.status_code == 429:
                retry_after = int(e.response.headers.get("Retry-After", 1))  # Spotify may provide Retry-After
                print(f"Rate limit exceeded. Retrying after {retry_after} seconds...")
                time.sleep(retry_after)
            else:
                print(f"HTTP error: {e.response.status_code}. Retrying in {current_delay} seconds...")
                time.sleep(current_delay)
                retries += 1
                current_delay *= backoff_factor  # Exponential backoff
            
        except ReadTimeout:
            # Handle timeout errors
            print(f"Request timed out. Retrying in {current_delay} seconds...")
            time.sleep(current_delay)
            retries += 1
            current_delay *= backoff_factor
        
        except Exception as e:
            # Handle unexpected errors
            print(f"An error occurred: {e}. Retrying in {current_delay} seconds...")
            time.sleep(current_delay)
            retries += 1
            current_delay *= backoff_factor
    
    print(f"Max retries reached for {func.__name__}.")
    return None  # Return None if the API call fails

def search_artists_by_genre(genre, limit=50):
    """
    Search for artists by genre.
    
    :param genre: The genre to search for.
    :param limit: Max number of artist IDs to return.
    """
    artist_ids = set()
    results = api_call_wrapper(sp.search, q=f'genre:{genre}', type='artist', limit=50)
    
    while results and len(artist_ids) < limit:
        for artist in results['artists']['items']:
            artist_ids.add(artist['id'])
        
        # Check if there's another page and update search results
        if results['artists']['next']:
            results = api_call_wrapper(sp.next, results['artists'])
        else:
            results = None

    return list(artist_ids)


# Function to get all albums for an artist (including pagination)
def get_artist_albums(artist_id, headers):
    albums_url = f'https://api.spotify.com/v1/artists/{artist_id}/albums'
    all_albums = []
    params = {
        'limit': 50,  # Max limit per request
        'offset': 0,
        'include_groups': 'album'  # Only fetch albums (exclude singles, compilations, etc.)
    }
    
    while True:
        response = requests.get(albums_url, headers=headers, params=params)
        # print(response)
        time.sleep(.4)
        if response.status_code == 200:
            data = response.json()
            all_albums.extend(data['items'])  # Append fetched albums to the list
            break
        elif response.status_code == 429:
            # Handle rate-limiting
            retry_after = int(response.headers.get("Retry-After", 1))
            print(f"Rate limited. Retrying after {retry_after} seconds...")
            # time.sleep(retry_after)
            return "rate limited"
        else:
            # Handle other error cases
            print(f"Failed to fetch albums for artist {artist_id}. Status code: {response.status_code}")
            print("Response:", response.text)
            break

    return all_albums

# Function to download all albums for a list of artists
def download_all_albums_for_artists(artist_list, headers):
    """
    Download all albums for a list of artists.

    :param artist_list: List of artist names or artist IDs.
    :return: Pandas DataFrame with album details.
    """
    all_albums_data = []
    print('before artist loop')
    
    for artist_id in artist_list:
        print(f"Fetching albums for artist ID: {artist_id}")
        albums = get_artist_albums(artist_id, headers)  # Fetch albums for the artist
        if albums:  # If albums were found, process them
            for album in albums:
                all_albums_data.append({
                    'artist_name': album['artists'][0]['name'],
                    'artist_id': album['artists'][0]['id'],
                    'album_id': album['id'],
                    'album_name': album['name'],
                    'release_date': album['release_date'],
                    'total_tracks': album['total_tracks'],
                    'album_type': album['album_type']
                })
        elif albums == "rate_limited":
            print("Rate limited")
            break
    
    # Convert the results to a Pandas DataFrame
    df_albums = pd.DataFrame(all_albums_data)
    
    # Optional: Handle empty data
    if df_albums.empty:
        print("No albums found for the given artists.")
    else:
        print(f"Found {len(df_albums)} albums.")

    return df_albums


# Spotify token URL
def get_auth_token():
    auth_url = 'https://accounts.spotify.com/api/token'

    # Encode the client ID and client secret into base64
    auth_header = base64.b64encode(f"{CLIENT_ID}:{CLIENT_SECRET}".encode()).decode()

    # Define the headers and body for the authentication request
    auth_headers = {
        "Authorization": f"Basic {auth_header}",
        "Content-Type": "application/x-www-form-urlencoded"
    }

    auth_data = {
        "grant_type": "client_credentials"
    }

    # Get the access token
    auth_response = requests.post(auth_url, headers=auth_headers, data=auth_data)

    # Check if authentication was successful
    if auth_response.status_code != 200:
        print("Failed to authenticate. Status code:", auth_response.status_code)
        print("Response:", auth_response.text)
    else:
        auth_response_data = auth_response.json()
        access_token = auth_response_data['access_token']

    # Define the headers for subsequent API requests
    headers = {
        "Authorization": f"Bearer {access_token}"
    }
    return headers


def get_related_artists(artist_id, header):
    related_artist_url = f'https://api.spotify.com/v1/artists/{artist_id}/related-artists'

    while True:
        # Adding a delay to avoid hitting rate limits
        time.sleep(.4)
        
        # Make request to the related artists endpoint
        related_response = requests.get(related_artist_url, headers=header)
        
        # Check if the request was successful
        if related_response.status_code == 200:
            related_data = related_response.json()  # Get the JSON response
            break
        
        elif related_response.status_code == 429:
            # Handle rate-limiting
            retry_after = int(related_response.headers.get("Retry-After", 1))
            print(f"Rate limited. Retrying after {retry_after} seconds...")
            # time.sleep(retry_after)
            return None
        else:
            # Handle other errors
            print("Failed to fetch related artists. Status code:", related_response.status_code)
            print("Response:", related_response.text)
            return pd.DataFrame()  # Return an empty DataFrame if the request fails

    # Function to format each artist's data
    def format_artist_data(artist):
        return {
            "name": artist.get("name"),
            "artist_id": artist.get("id"),
            "genres": ", ".join(artist.get("genres", [])),
            "followers": artist.get("followers", {}).get("total", 0),
            "popularity": artist.get("popularity"),
            "spotify_url": artist.get("external_urls", {}).get("spotify"),
        }
    
    # Extract the list of artists from the related data
    formatted_artists = [format_artist_data(artist) for artist in related_data.get('artists', [])]
    
    # Create a DataFrame from the list of formatted artist data
    related_artist_df = pd.DataFrame(formatted_artists)
    
    return related_artist_df

def get_tracks_for_album(album_id):
    """
    Fetch all tracks for a given album ID.
    
    :param album_id: The Spotify ID of the album.
    :return: List of tracks with their details.
    """
    tracks = []
    results = sp.album_tracks(album_id)
    
    while results:
        tracks.extend(results['items'])
        
        # If there's a next page of results
        if results['next']:
            results = sp.next(results)
        else:
            results = None
    
    return tracks

def get_track_features(track_ids):
    """
    Fetch audio features for a list of track IDs.
    
    :param track_ids: List of track IDs.
    :return: List of track features.
    """
    track_features = []
    
    # Spotify allows fetching audio features for up to 100 tracks in one request
    for i in range(0, len(track_ids), 100):
        batch = track_ids[i:i+100]  # Fetch in batches of 100
        features = sp.audio_features(batch)
        track_features.extend(features)
    
    return track_features

def bulk_download_tracks_and_features(album_list):
    """
    Bulk download tracks and their features for a list of albums.
    
    :param album_list: List of album IDs.
    :return: DataFrame with track and feature information.
    """
    all_tracks_data = []
    all_track_ids = []
    
    # Fetch all tracks for each album
    for album in album_list:
        print(f"Fetching tracks for album: {album}")
        tracks = get_tracks_for_album(album)
        
        for track in tracks:
            all_track_ids.append(track['id'])
            all_tracks_data.append({
                'album_id': album,
                'track_id': track['id'],
                'track_name': track['name'],
                'track_number': track['track_number'],
                'artist_name': track['artists'][0]['name'],
                'artist_id': track['artists'][0]['id'],
                'duration_ms': track['duration_ms'],
                'explicit': track['explicit'],
            })
    
    # Fetch track features for the collected track IDs
    print("Fetching track features...")
    track_features = get_track_features(all_track_ids)
    
    # Merge track data with features
    for i, track in enumerate(all_tracks_data):
        feature = track_features[i]
        if feature:  # Sometimes audio_features can return None for a track
            track.update({
                'danceability': feature['danceability'],
                'energy': feature['energy'],
                'tempo': feature['tempo'],
                'valence': feature['valence'],
                'acousticness': feature['acousticness'],
                'instrumentalness': feature['instrumentalness'],
                'liveness': feature['liveness'],
                'speechiness': feature['speechiness'],
                'key': feature['key'],
                'mode': feature['mode'],
                'loudness': feature['loudness'],
                'time_signature': feature['time_signature'],
            })
    
    # Convert to Pandas DataFrame
    df_tracks = pd.DataFrame(all_tracks_data)
    return df_tracks


Current path for getting tracks is get genres, get artists by genres, get related artists of current artists, get artists top tracks, get audio features for each individual track.

Keep current path for getting artists, and batch audio features of the top tracks to reduce api calls

Get genres, get top artists, get related artists, get artists albums, get songs from albums,

Create Album Df, contains album name, artist, release date, included songs,

In [36]:

start_time = time.time()
genre_seeds = sp.recommendation_genre_seeds()
NUM_CALLS += 1
print(NUM_CALLS, time.time() - start_time)

all_artist_ids = set()

for genre in genre_seeds['genres']:
    all_artist_ids.update(search_artists_by_genre(genre, limit=50))
print(NUM_CALLS, time.time() - start_time)
print(len(list(all_artist_ids)))



1 0.1177375316619873
239 81.43151473999023
4305


In [37]:
headers = get_auth_token()
artist_df = pd.DataFrame()
for artist in list(all_artist_ids):
    if artist not in related_artist_df["artist_id"].unique():
        related_artist_df = get_related_artists(artist, headers)
        artist_df = pd.concat([artist_df,related_artist_df])
print(NUM_CALLS, time.time() - start_time)
print(len(list(all_artist_ids)))
artist_df = artist_df.reindex()
artist_df.drop_duplicates(inplace=True)

KeyboardInterrupt: 

In [40]:
artist_df.reset_index(inplace=True)
artist_df.drop_duplicates(inplace=True)

In [41]:
artist_df.to_csv(r"C:\Projects\CIS598_Project\fall-2024-shack02\data\artists.csv")

In [42]:
headers = get_auth_token()
artist_id = list(all_artist_ids)[0]
album_df = download_all_albums_for_artists(artist_df["artist_id"].values, headers)
album_df

before artist loop
Fetching albums for artist ID: 2ynylCO9SRPTKjgNEH0Y2a
Fetching albums for artist ID: 2x48WoJGRLCpCWHEKXMZoB
Fetching albums for artist ID: 3nS4tSuT4VwGiZH6BtlJfC
Fetching albums for artist ID: 7FAAkDlPg6pg0860CIDzmu
Fetching albums for artist ID: 6q4AmzK3GzCuEzkurnYuEQ
Fetching albums for artist ID: 71N7xIvnaOFGetZL5nhWsl
Fetching albums for artist ID: 6e0QWfEFmK6AguLy02mlqi
Fetching albums for artist ID: 0q6u5HyVK4zwGuzEtqjHqa
Fetching albums for artist ID: 5ZPr0RHsR3DrAhtsYMsfHR
Fetching albums for artist ID: 4AfTOzBubFP6STibJPTxwt
Fetching albums for artist ID: 7tjMOkm52H3Qiz3ty2tbNw
Fetching albums for artist ID: 2Tl7uZui4u9a3nXUM9VMei
Fetching albums for artist ID: 4wf6GGNBqaU79839E6yjfn
Fetching albums for artist ID: 16QCJENzcdhwka9bTKYMVB
Fetching albums for artist ID: 1fRv9jiRIN7zAOSpOfRP73
Fetching albums for artist ID: 4qrHkx5cgWIslciLXUMrYw
Fetching albums for artist ID: 6qtECqesbU29iftyeWmldK
Fetching albums for artist ID: 0znuUIjvP0LXEslfaq0Nor
Fetching 

In [1]:
album_df.head()

NameError: name 'album_df' is not defined

In [25]:
album_df.to_csv(r"C:\Projects\CIS598_Project\fall-2024-shack02\data\albums.csv")

In [5]:
album_df = pd.read_csv(r"C:\Projects\fall-2024-shack02\data\albums.csv")
artists_df = pd.read_csv(r"C:\Projects\fall-2024-shack02\data\artists.csv")

In [3]:
len(album_df)

103549

In [7]:
unique_artist_df = artists_df[artists_df["artist_id"].is_unique()]

TypeError: 'bool' object is not callable

In [8]:
unique_artist_df = artists_df.drop_duplicates(subset="artist_id")
len(unique_artist_df)
# unique_artist_df.drop(["Unnamed: 0","level_0","index"], axis=1, inplace=True)
# unique_artist_df.to_csv(r"C:\Projects\fall-2024-shack02\data\artists.csv")

13099

In [4]:
# album_df.drop(['index'],axis=1,inplace=True)
artists_with_albums = album_df["artist_id"].unique()
len(artists_with_albums)

14002

In [10]:
artists = unique_artist_df['artist_id'].unique()
artists_need_albums = [artist for artist in artists if not artist in artists_with_albums]


In [11]:
print(len(artists_need_albums),len(artists))

2564 13099


In [46]:
new_albums_df_list = []

In [3]:
def segment_list(input_list, segment_size=1000):
    return [input_list[i:i + segment_size] for i in range(0, len(input_list), segment_size)]

In [47]:
headers = get_auth_token()

segmented_new_artist_list = segment_list(artists_need_albums, segment_size = 1000)


for i,artist_segment in enumerate(segmented_new_artist_list):
    print(f"Segment {i}")
    new_albums_df_list.append(download_all_albums_for_artists(artist_segment, headers))
    print(f"Segment done")
    time.sleep(60)


Segment 0
before artist loop
Fetching albums for artist ID: 7Hy3BcTukq6HpDzVFrnJ8L
Fetching albums for artist ID: 1A5kGvmKIVtX7NhcbtTZJY
Fetching albums for artist ID: 0Ymz1ExCtpFOHsNDa5JeyL
Fetching albums for artist ID: 14IQ7niDNXIIrOSjr32E7O
Fetching albums for artist ID: 6tLHGlt7L7raSf6vr96hWi
Fetching albums for artist ID: 5VeyQ48fHEJP9CVFc4YJ5X
Fetching albums for artist ID: 422QvLrp4rn4VaTnb810uJ
Fetching albums for artist ID: 4Cp8MAVITZ6a8qjTVvhV28
Fetching albums for artist ID: 4ffqFO4lp2UOv2cnG9ka0J
Fetching albums for artist ID: 51NwQz5MA32kigUGYjdA3z
Fetching albums for artist ID: 4OHSon9N7JAfxkjlH8nKDb
Fetching albums for artist ID: 4PTQtiKISN5iGNpbRVv02B
Fetching albums for artist ID: 4x0yIdlIelBumJlBFgT2jt
Fetching albums for artist ID: 088wXCnKN6e54SL0zuVXR3
Fetching albums for artist ID: 75vuZwnDSzjDlz6OOnkfjJ
Fetching albums for artist ID: 5cSF6ZUFVfUIjOUGKl67f5
Fetching albums for artist ID: 20rP9JAY0JpEb4EDNNivoy
Fetching albums for artist ID: 4BU9SHOTANVwAifirXPu5u

SSLError: HTTPSConnectionPool(host='api.spotify.com', port=443): Max retries exceeded with url: /v1/artists/7LOHocDfDRSAznhI11LhJ5/albums?limit=50&offset=0&include_groups=album (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)')))

In [48]:
print(len(new_albums_df_list))

2


In [49]:
new_albums_df = pd.DataFrame()
for df in new_albums_df_list:
    new_albums_df = pd.concat([new_albums_df, df])

In [50]:
new_albums_df
all_albums_df = pd.concat([album_df,new_albums_df])

In [51]:
all_albums_df
all_albums_df_dropped = all_albums_df.drop_duplicates("album_id")

In [52]:
print(len(all_albums_df),len(all_albums_df_dropped))
# all_albums_df_dropped.to_csv(r"C:\Projects\fall-2024-shack02\data\albums.csv")

104287 103549


In [13]:
artists_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,name,artist_id,genres,followers,popularity,spotify_url
0,0,0,Angel Du$t,2ynylCO9SRPTKjgNEH0Y2a,dreamo,63023,41,https://open.spotify.com/artist/2ynylCO9SRPTKj...
1,1,1,Trapped Under Ice,2x48WoJGRLCpCWHEKXMZoB,"chaotic hardcore, hardcore, melodic hardcore, ...",75031,42,https://open.spotify.com/artist/2x48WoJGRLCpCW...
2,2,2,Incendiary,3nS4tSuT4VwGiZH6BtlJfC,"long island punk, metallic hardcore, nyhc",81448,40,https://open.spotify.com/artist/3nS4tSuT4VwGiZ...
3,3,3,DRAIN,7FAAkDlPg6pg0860CIDzmu,california hardcore,91964,45,https://open.spotify.com/artist/7FAAkDlPg6pg08...
4,4,4,Drug Church,6q4AmzK3GzCuEzkurnYuEQ,"dreamo, emo, indie punk, modern melodic hardcore",81260,45,https://open.spotify.com/artist/6q4AmzK3GzCuEz...
...,...,...,...,...,...,...,...,...
13094,13094,22012,Monika Santucci,5E1FAGKopxyWqEREQblCDA,,4808,38,https://open.spotify.com/artist/5E1FAGKopxyWqE...
13095,13095,22013,Luma,29siAJ78u7y79BYOyh0lbp,melodic dubstep,11287,43,https://open.spotify.com/artist/29siAJ78u7y79B...
13096,13096,22019,MEDZ,61c8McUZCtrU9WOjvFkyiL,melodic dubstep,4623,35,https://open.spotify.com/artist/61c8McUZCtrU9W...
13097,13097,22021,lama,01M9LokQdmZvlAuwBLsYYH,melodic dubstep,3053,34,https://open.spotify.com/artist/01M9LokQdmZvlA...


In [12]:
album_df[album_df['artist_name'] == 'Korn']

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,artist_name,artist_id,album_id,album_name,release_date,total_tracks,album_type
23687,23687,23687.0,4064.0,,Korn,3RNrq3jvMZxD9ZyoOZbQOD,5gizwaBQyfZYTEb8zdL7Z6,Untitled (Deluxe),2023-07-28,15,album
23688,23688,23688.0,4065.0,,Korn,3RNrq3jvMZxD9ZyoOZbQOD,4fN6xfIrXPX8SMgcFU4A6h,Requiem Mass (Deluxe Edition),2023-02-03,14,album
23689,23689,23689.0,4066.0,,Korn,3RNrq3jvMZxD9ZyoOZbQOD,7J0BUlxogdpZAtuZnhomb0,Requiem,2022-02-04,9,album
23690,23690,23690.0,4067.0,,Korn,3RNrq3jvMZxD9ZyoOZbQOD,6mWsWVsfWpoZ2d6uxm1ND1,The Nothing,2019-09-13,13,album
23691,23691,23691.0,4068.0,,Korn,3RNrq3jvMZxD9ZyoOZbQOD,0UGqqYIWXAD1FgrDI1zOjh,The Serenity of Suffering,2016-10-21,11,album
23692,23692,23692.0,4069.0,,Korn,3RNrq3jvMZxD9ZyoOZbQOD,29QP2RwZwRY9mAYWmYcH3T,The Paradigm Shift (World Tour Edition),2014-07-15,20,album
23693,23693,23693.0,4070.0,,Korn,3RNrq3jvMZxD9ZyoOZbQOD,6TA5d51wosfWuZXyOKWzpG,The Paradigm Shift (Deluxe),2013-10-08,13,album
23694,23694,23694.0,4071.0,,Korn,3RNrq3jvMZxD9ZyoOZbQOD,3ARqpnmjMB5O8uihXOOxbW,The Path of Totality (Special Edition),2011-11-30,13,album
23695,23695,23695.0,4072.0,,Korn,3RNrq3jvMZxD9ZyoOZbQOD,4a6B5VTaWSw0xRQsEtNX39,The Path of Totality,2011-11-21,11,album
23696,23696,23696.0,4073.0,,Korn,3RNrq3jvMZxD9ZyoOZbQOD,2WuADVcxasM1VvDfnwiACn,Korn III: Remember Who You Are (Special Edition),2010-07-09,14,album


In [11]:
artists_df[artists_df["name"] == 'Korn']

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,name,artist_id,genres,followers,popularity,spotify_url
3374,3374,3862,Korn,3RNrq3jvMZxD9ZyoOZbQOD,"alternative metal, funk metal, hard rock, nu m...",7904909,77,https://open.spotify.com/artist/3RNrq3jvMZxD9Z...


In [11]:
track_df = pd.read_csv(r"C:\Projects\fall-2024-shack02\data\tracks.csv")
track_df

Unnamed: 0.1,Unnamed: 0,album_id,track_id,track_name,track_number,artist_name,artist_id,duration_ms,explicit,danceability,...,tempo,valence,acousticness,instrumentalness,liveness,speechiness,key,mode,loudness,time_signature
0,0,6wtdoqQ2vTSTduaV8p4PAr,11HTHEj6oQl03hfZZ0B0am,Midnight,1,Abby Gundersen,5BlKoQLYxv24MSV5AD6i6q,208853,False,0.118,...,85.610,0.0367,0.88100,0.977000,0.1740,0.0363,2,1,-21.380,4
1,1,6wtdoqQ2vTSTduaV8p4PAr,03ypwrws1Bug6qmVUAU1vH,Surfacing,2,Abby Gundersen,5BlKoQLYxv24MSV5AD6i6q,223373,False,0.294,...,149.084,0.0742,0.89400,0.802000,0.1050,0.0306,0,1,-15.456,3
2,2,6wtdoqQ2vTSTduaV8p4PAr,7gBVdXPQ3ykeS2qR2wiihw,Stratus,3,Abby Gundersen,5BlKoQLYxv24MSV5AD6i6q,166106,False,0.178,...,116.712,0.0323,0.88400,0.925000,0.3270,0.0448,4,1,-16.246,4
3,3,6wtdoqQ2vTSTduaV8p4PAr,0i62LV1IrZosi5zCqjOpVB,Nostalgia,4,Abby Gundersen,5BlKoQLYxv24MSV5AD6i6q,164053,False,0.366,...,122.616,0.0757,0.97000,0.875000,0.1250,0.0347,9,1,-19.672,3
4,4,6wtdoqQ2vTSTduaV8p4PAr,6aRKgMEQ0DSj8DCvpFgZu9,Within a Dream,5,Abby Gundersen,5BlKoQLYxv24MSV5AD6i6q,291413,False,0.109,...,169.871,0.0399,0.58300,0.928000,0.0629,0.0378,0,1,-14.785,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7165,310,0YKWM0fWJPqohGAIj9nTEw,07hbg2yybbca8OIAginoLr,Ortam İnsanı,6,Emre Altuğ,6CtMmrX12kYA3O4kINuWjZ,241666,False,0.648,...,137.967,0.7430,0.00104,0.000118,0.0639,0.0554,1,0,-5.292,4
7166,311,0YKWM0fWJPqohGAIj9nTEw,4yfA4V6u00NLybBgh0LP9V,Seni Kaybettim,7,Emre Altuğ,6CtMmrX12kYA3O4kINuWjZ,240066,False,0.637,...,93.025,0.4070,0.10900,0.000000,0.1920,0.0576,10,0,-5.146,4
7167,312,0YKWM0fWJPqohGAIj9nTEw,1l83WQOvw7UpJmLlTJwZPE,Şans,8,Emre Altuğ,6CtMmrX12kYA3O4kINuWjZ,282680,False,0.678,...,109.978,0.1830,0.00186,0.000063,0.2170,0.0259,4,0,-5.786,4
7168,313,0YKWM0fWJPqohGAIj9nTEw,3BEIaentLkGWVhVLohXbdt,Senin Tenin Senin Kokun,9,Emre Altuğ,6CtMmrX12kYA3O4kINuWjZ,218613,False,0.500,...,73.963,0.3570,0.12800,0.000000,0.1050,0.0658,9,0,-6.223,4


In [6]:
track_dfs = []

album_need_tracks = album_df[~album_df['album_id'].isin(track_df["album_id"].unique())]["album_id"].unique()

segmented_new_albums_list = segment_list(album_need_tracks, segment_size = 25)

for segment in segmented_new_albums_list:
    if track_dfs:
        pd.concat(track_dfs).to_csv(r"C:\Projects\fall-2024-shack02\data\temp_track_data_backup.csv", index=False)
    track_dfs.append(bulk_download_tracks_and_features(segment))

Fetching tracks for album: 0gTwY9wH9hyKXYxdCEaLhP
Fetching tracks for album: 2N3Pm65Zp4WknIEPKJRWCM
Fetching tracks for album: 7EU4Ppo6QR2P9Vx3glzlFa
Fetching tracks for album: 7onDxcFHFji1YXWU8DlcGE
Fetching tracks for album: 30tPF1YX6wz07LNFU32Ik5
Fetching tracks for album: 3vxX97dwLUbw3WS9st8emA
Fetching tracks for album: 72gWw2Ubd9Poo5elkUyG89
Fetching tracks for album: 4K0feemWV3i8auUxFGePe9
Fetching tracks for album: 249okdv05EkQ8Fbx4B0VsK
Fetching tracks for album: 130HNxtaeEyRJv5VkuLhEf
Fetching tracks for album: 0w4WVxXn2AuILWNMLAI5pK
Fetching tracks for album: 1ukbgFN2l3lOiSEtOI3yqh
Fetching tracks for album: 0GHfsH18Cc2WcabvTLUxQR
Fetching tracks for album: 1jyRDlNDDsXvbHfZZCTeP1
Fetching tracks for album: 0ECGq4YURyghUpg8fUpsTJ
Fetching tracks for album: 3ivcaQOCFlBNaJEzaCeMEq
Fetching tracks for album: 1k4xgc8b7QqVyzKQZT1PTF
Fetching tracks for album: 14nFD6LTEFBJ5oyb7xIp9p
Fetching tracks for album: 4gJRHo2wPo7QVlCKTIW13g
Fetching tracks for album: 3D2zUrdExKmYQRIzlZY2FD


In [1]:
track_dfs = []

In [7]:
combined_dfs = pd.read_csv(r"C:\Projects\fall-2024-shack02\data\temp_track_data_backup.csv")
for df in track_dfs:
    combined_dfs = pd.concat([combined_dfs,df])

In [9]:
len(combined_dfs['artist_name'].unique())

1913

In [13]:
# combined_dfs.drop("Unnamed: 0", inplace=True, axis=1)
combined_dfs = pd.concat([combined_dfs,track_df])
combined_dfs.to_csv(r"C:\Projects\fall-2024-shack02\data\tracks.csv")

In [14]:
graph_data = pd.DataFrame()

In [15]:
track_df = pd.read_csv(r"C:\Projects\fall-2024-shack02\data\tracks.csv")

In [18]:
track_df.columns
track_df.drop("Unnamed: 0.1", inplace=True, axis =1)

In [19]:
album_df = album_df.drop(['Unnamed: 0.3','Unnamed: 0.2','Unnamed: 0.1','Unnamed: 0', 'index'], axis = 1)

In [20]:
album_df.columns

Index(['artist_name', 'artist_id', 'album_id', 'album_name', 'release_date',
       'total_tracks', 'album_type'],
      dtype='object')

In [21]:
# artists_df.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis=1, inplace=True)
artists_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,name,artist_id,genres,followers,popularity,spotify_url
0,0,0,Angel Du$t,2ynylCO9SRPTKjgNEH0Y2a,dreamo,63023,41,https://open.spotify.com/artist/2ynylCO9SRPTKj...
1,1,1,Trapped Under Ice,2x48WoJGRLCpCWHEKXMZoB,"chaotic hardcore, hardcore, melodic hardcore, ...",75031,42,https://open.spotify.com/artist/2x48WoJGRLCpCW...
2,2,2,Incendiary,3nS4tSuT4VwGiZH6BtlJfC,"long island punk, metallic hardcore, nyhc",81448,40,https://open.spotify.com/artist/3nS4tSuT4VwGiZ...
3,3,3,DRAIN,7FAAkDlPg6pg0860CIDzmu,california hardcore,91964,45,https://open.spotify.com/artist/7FAAkDlPg6pg08...
4,4,4,Drug Church,6q4AmzK3GzCuEzkurnYuEQ,"dreamo, emo, indie punk, modern melodic hardcore",81260,45,https://open.spotify.com/artist/6q4AmzK3GzCuEz...
...,...,...,...,...,...,...,...,...
13094,13094,22012,Monika Santucci,5E1FAGKopxyWqEREQblCDA,,4808,38,https://open.spotify.com/artist/5E1FAGKopxyWqE...
13095,13095,22013,Luma,29siAJ78u7y79BYOyh0lbp,melodic dubstep,11287,43,https://open.spotify.com/artist/29siAJ78u7y79B...
13096,13096,22019,MEDZ,61c8McUZCtrU9WOjvFkyiL,melodic dubstep,4623,35,https://open.spotify.com/artist/61c8McUZCtrU9W...
13097,13097,22021,lama,01M9LokQdmZvlAuwBLsYYH,melodic dubstep,3053,34,https://open.spotify.com/artist/01M9LokQdmZvlA...


In [22]:
cols_to_use = artists_df.columns.difference(album_df.columns).to_list()
cols_to_use.append('artist_id')
print(cols_to_use)
album_and_artists_df = album_df.merge(artists_df[cols_to_use],how='inner',on='artist_id')
album_and_artists_df.columns

['Unnamed: 0', 'Unnamed: 0.1', 'followers', 'genres', 'name', 'popularity', 'spotify_url', 'artist_id']


Index(['artist_name', 'artist_id', 'album_id', 'album_name', 'release_date',
       'total_tracks', 'album_type', 'Unnamed: 0', 'Unnamed: 0.1', 'followers',
       'genres', 'name', 'popularity', 'spotify_url'],
      dtype='object')

In [23]:
cols_to_use = track_df.columns.difference(album_and_artists_df.columns).to_list()
cols_to_use.append('album_id')
print(cols_to_use)
graph_data = album_and_artists_df.merge(track_df[cols_to_use],how='inner',on='album_id')
graph_data.columns

['acousticness', 'danceability', 'duration_ms', 'energy', 'explicit', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'track_id', 'track_name', 'track_number', 'valence', 'album_id']


Index(['artist_name', 'artist_id', 'album_id', 'album_name', 'release_date',
       'total_tracks', 'album_type', 'Unnamed: 0', 'Unnamed: 0.1', 'followers',
       'genres', 'name', 'popularity', 'spotify_url', 'acousticness',
       'danceability', 'duration_ms', 'energy', 'explicit', 'instrumentalness',
       'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo',
       'time_signature', 'track_id', 'track_name', 'track_number', 'valence'],
      dtype='object')

In [25]:
# graph_data.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis=1, inplace=True)
graph_data.columns

Index(['artist_name', 'artist_id', 'album_id', 'album_name', 'release_date',
       'total_tracks', 'album_type', 'followers', 'genres', 'name',
       'popularity', 'spotify_url', 'acousticness', 'danceability',
       'duration_ms', 'energy', 'explicit', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'speechiness', 'tempo',
       'time_signature', 'track_id', 'track_name', 'track_number', 'valence'],
      dtype='object')

In [26]:
from sklearn.preprocessing import MinMaxScaler


In [27]:
cleaned_graph_data = graph_data.copy()
track_features =  ["danceability","energy","tempo","valence","acousticness","instrumentalness","liveness","speechiness","loudness"]
scaler = MinMaxScaler()
for feature in track_features:
    cleaned_graph_data[feature] = scaler.fit_transform(cleaned_graph_data[[feature]])
    print(cleaned_graph_data[feature].min(), cleaned_graph_data[feature].max())

0.0 1.0
0.0 0.9999999999999999
0.0 1.0
0.0 1.0
0.0 1.0
0.0 0.9999999999999999
0.0 1.0
0.0 1.0
0.0 0.9999999999999999


In [51]:
cleaned_graph_data.drop('featured_artists', inplace =True, axis =1 )

In [54]:
import random

all_artists = cleaned_graph_data['artist_name'].unique()
def create_fake_artist_features(artist_name):
    other_artists = [artist for artist in all_artists if artist != artist_name]
    return random.choice(other_artists)


cleaned_graph_data['featured_artists'] = cleaned_graph_data['artist_name'].apply(create_fake_artist_features)

In [55]:
cleaned_graph_data['featured_artists'].value_counts()


featured_artists
The Spinners                  207
The Contours                  207
New Philharmonia Orchestra    205
Fred Hammond                  203
LU                            201
                             ... 
2 minutos                     136
Johann Sebastian Bach         135
The Maddox Brothers & Rose    133
Mickemesk                     133
JUN SKY WALKER(S)             127
Name: count, Length: 440, dtype: int64

In [50]:
cleaned_graph_data[cleaned_graph_data['featured_artists'].apply(len) > 0]["featured_artists"]

577           [Robert Plant]
737         [Ray LaMontagne]
1354         [Kenny Chesney]
1355         [Willie Nelson]
1578       [Hillary Lindsey]
                ...         
71813    [Marshall McDonald]
71814         [Paul Cardall]
71815      [Al Van Der Beek]
71816         [Giles Reaves]
71818          [Jon Schmidt]
Name: featured_artists, Length: 632, dtype: object

In [18]:
cleaned_graph_data

Unnamed: 0,artist_name,artist_id,album_id,album_name,release_date,total_tracks,album_type,followers,genres,name,...,liveness,loudness,mode,speechiness,tempo,time_signature,track_id,track_name,track_number,valence
0,Peter Bradley Adams,0CdbG1eHVjqjkQsGoH2u1V,5swCwkecIl1ZafeyxJYbYk,A Face Like Mine,2017-04-21,9,album,97000,acoustic pop,Peter Bradley Adams,...,0.096880,0.609362,1,0.032054,0.622972,4,5cSQvlBEuIfcF9KBXGOHKC,Good Man,1,0.547862
1,Peter Bradley Adams,0CdbG1eHVjqjkQsGoH2u1V,5swCwkecIl1ZafeyxJYbYk,A Face Like Mine,2017-04-21,9,album,97000,acoustic pop,Peter Bradley Adams,...,0.092775,0.651063,1,0.028320,0.415807,4,53iFMaNUXn6Oj8hUIKSO5a,My Arms Were Always Around You,2,0.429735
2,Peter Bradley Adams,0CdbG1eHVjqjkQsGoH2u1V,5swCwkecIl1ZafeyxJYbYk,A Face Like Mine,2017-04-21,9,album,97000,acoustic pop,Peter Bradley Adams,...,0.081999,0.743814,1,0.030394,0.640788,4,7G2nClcTDSn4VtNlSSBQXi,Lorraine,3,0.447047
3,Peter Bradley Adams,0CdbG1eHVjqjkQsGoH2u1V,5swCwkecIl1ZafeyxJYbYk,A Face Like Mine,2017-04-21,9,album,97000,acoustic pop,Peter Bradley Adams,...,0.059626,0.592907,1,0.031846,0.345340,4,1z3vIIKSZ83hX6vTKo4536,A Face Like Mine,4,0.204684
4,Peter Bradley Adams,0CdbG1eHVjqjkQsGoH2u1V,5swCwkecIl1ZafeyxJYbYk,A Face Like Mine,2017-04-21,9,album,97000,acoustic pop,Peter Bradley Adams,...,0.085591,0.601259,0,0.036618,0.445618,4,0px8acghPxrLMXXGaS6Mwe,Who Else Could I Be,5,0.573320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5481,Emre Altuğ,6CtMmrX12kYA3O4kINuWjZ,0YKWM0fWJPqohGAIj9nTEw,Kişiye Özel,2007-05-30,10,album,360605,turkish pop,Emre Altuğ,...,0.045464,0.867762,0,0.057469,0.627071,4,07hbg2yybbca8OIAginoLr,Ortam İnsanı,6,0.756619
5482,Emre Altuğ,6CtMmrX12kYA3O4kINuWjZ,0YKWM0fWJPqohGAIj9nTEw,Kişiye Özel,2007-05-30,10,album,360605,turkish pop,Emre Altuğ,...,0.176929,0.872312,0,0.059751,0.422806,4,4yfA4V6u00NLybBgh0LP9V,Seni Kaybettim,7,0.414460
5483,Emre Altuğ,6CtMmrX12kYA3O4kINuWjZ,0YKWM0fWJPqohGAIj9nTEw,Kişiye Özel,2007-05-30,10,album,360605,turkish pop,Emre Altuğ,...,0.202586,0.852366,0,0.026867,0.499859,4,1l83WQOvw7UpJmLlTJwZPE,Şans,8,0.186354
5484,Emre Altuğ,6CtMmrX12kYA3O4kINuWjZ,0YKWM0fWJPqohGAIj9nTEw,Kişiye Özel,2007-05-30,10,album,360605,turkish pop,Emre Altuğ,...,0.087644,0.838746,0,0.068257,0.336168,4,3BEIaentLkGWVhVLohXbdt,Senin Tenin Senin Kokun,9,0.363544


In [56]:
cleaned_graph_data.to_csv(r"C:\Projects\fall-2024-shack02\data\graph_data.csv")
cleaned_graph_data.to_csv(r"C:\Users\monst\.Neo4jDesktop\relate-data\dbmss\dbms-523e12c2-88d8-42c2-b0bc-b8c2acdb0b80\import\graph_data.csv")

In [2]:
graph_data = pd.read_csv(r"C:\Projects\fall-2024-shack02\data\graph_data.csv")

In [16]:
import re

def clean_column_names(df):
    """
    Clean column names by removing or replacing special characters that may cause errors in Neo4j.
    
    :param df: The pandas DataFrame whose columns need to be cleaned.
    :return: The DataFrame with cleaned column names.
    """
    # Define the characters to remove or replace
    def clean_name(col_name):
        # Replace spaces with underscores
        col_name = col_name.replace(" ", "_")
        
        # Remove problematic characters for Neo4j
        col_name = re.sub(r'[().\[\]{},:;/\\]', '', col_name)
        
        return col_name
    
    # Apply cleaning to all column names
    df.columns = [clean_name(col) for col in df.columns]
    
    return df

def clean_column_values(df):
    """
    Clean all string column values by removing or replacing special characters that may cause errors in Neo4j.
    
    :param df: The pandas DataFrame whose column values need to be cleaned.
    :return: The DataFrame with cleaned column values.
    """
    # Define the characters to remove or replace
    def clean_value(value):
        if isinstance(value, str):
            # Remove problematic characters for Neo4j in values
            value = re.sub(r'[().\[\]{}:;/\\]', '', value)
        return value
    
    # Apply cleaning to all values in the DataFrame
    df = df.applymap(clean_value)
    
    return df


In [36]:
clean_graph_data = clean_column_names(graph_data)
clean_graph_data = clean_column_values(clean_graph_data)
clean_graph_data.drop("Unnamed_0", axis=1,inplace=True)
clean_graph_data

  df = df.applymap(clean_value)


Unnamed: 0,artist_name,artist_id,album_id,album_name,release_date,total_tracks,album_type,name,genres,followers,...,tempo,valence,acousticness,instrumentalness,liveness,speechiness,key,mode,loudness,time_signature
0,Korn,3RNrq3jvMZxD9ZyoOZbQOD,5gizwaBQyfZYTEb8zdL7Z6,Untitled Deluxe,2023-07-28,15,album,Korn,"alternative metal, funk metal, hard rock, nu m...",7904909,...,79.970,0.187,0.878000,0.925000,0.0890,0.0255,11,0,-13.185,3
1,Korn,3RNrq3jvMZxD9ZyoOZbQOD,5gizwaBQyfZYTEb8zdL7Z6,Untitled Deluxe,2023-07-28,15,album,Korn,"alternative metal, funk metal, hard rock, nu m...",7904909,...,125.038,0.479,0.003610,0.201000,0.6510,0.0557,4,0,-5.203,4
2,Korn,3RNrq3jvMZxD9ZyoOZbQOD,5gizwaBQyfZYTEb8zdL7Z6,Untitled Deluxe,2023-07-28,15,album,Korn,"alternative metal, funk metal, hard rock, nu m...",7904909,...,172.196,0.527,0.000531,0.892000,0.4330,0.0704,11,0,-4.535,4
3,Korn,3RNrq3jvMZxD9ZyoOZbQOD,5gizwaBQyfZYTEb8zdL7Z6,Untitled Deluxe,2023-07-28,15,album,Korn,"alternative metal, funk metal, hard rock, nu m...",7904909,...,102.930,0.536,0.062700,0.017800,0.2390,0.0548,3,1,-4.925,4
4,Korn,3RNrq3jvMZxD9ZyoOZbQOD,5gizwaBQyfZYTEb8zdL7Z6,Untitled Deluxe,2023-07-28,15,album,Korn,"alternative metal, funk metal, hard rock, nu m...",7904909,...,99.994,0.151,0.000354,0.819000,0.0902,0.0945,9,1,-3.901,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,Korn,3RNrq3jvMZxD9ZyoOZbQOD,7D3XFJlfZIkmGWqZXm2X8z,Korn,1994-10-11,12,album,Korn,"alternative metal, funk metal, hard rock, nu m...",7904909,...,79.659,0.532,0.000982,0.017200,0.1130,0.1060,5,1,-6.246,4
274,Korn,3RNrq3jvMZxD9ZyoOZbQOD,7D3XFJlfZIkmGWqZXm2X8z,Korn,1994-10-11,12,album,Korn,"alternative metal, funk metal, hard rock, nu m...",7904909,...,98.027,0.164,0.003020,0.002310,0.2340,0.1500,1,0,-7.384,4
275,Korn,3RNrq3jvMZxD9ZyoOZbQOD,7D3XFJlfZIkmGWqZXm2X8z,Korn,1994-10-11,12,album,Korn,"alternative metal, funk metal, hard rock, nu m...",7904909,...,125.006,0.480,0.003030,0.000603,0.4790,0.1240,1,0,-6.381,4
276,Korn,3RNrq3jvMZxD9ZyoOZbQOD,7D3XFJlfZIkmGWqZXm2X8z,Korn,1994-10-11,12,album,Korn,"alternative metal, funk metal, hard rock, nu m...",7904909,...,170.756,0.664,0.002230,0.041700,0.2630,0.0952,9,1,-8.493,4


In [37]:
clean_graph_data.to_csv(r"C:\Projects\fall-2024-shack02\data\clean_graph_data.csv")