In [9]:
import os
import pandas as pd
import time
import datetime
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials


# Spotify API credentials come from the environment so they never have to be
# written into the notebook. Each value is None when its variable is unset.
SPOTIPY_CLIENT_ID, SPOTIPY_CLIENT_SECRET = (
    os.environ.get(var_name)
    for var_name in ('SPOTIPY_CLIENT_ID', 'SPOTIPY_CLIENT_SECRET')
)

In [2]:
# Confirm the credentials were picked up from the environment WITHOUT echoing
# their values: cell output is stored inside the notebook file, so printing a
# secret leaks it to anyone who can read or clone the .ipynb.
print(f"SPOTIPY_CLIENT_ID set: {bool(SPOTIPY_CLIENT_ID)}")
print(f"SPOTIPY_CLIENT_SECRET set: {bool(SPOTIPY_CLIENT_SECRET)}")

[redacted: client ID removed from saved output]
[redacted: client secret removed from saved output; rotate this credential]


In [3]:
# Load the cleaned, combined song list and preview the first rows.
# The path is assembled with os.path.join so the notebook also runs on
# macOS/Linux, where a backslash is not a directory separator.
df_combined_clean = pd.read_csv(os.path.join('..', 'data', '3_combined_clean.csv'))
df_combined_clean.head()


Unnamed: 0,title,artist,hot
0,Die With A Smile,Lady Gaga & Bruno Mars,1
1,A Bar Song (Tipsy),Shaboozey,1
2,Birds Of A Feather,Billie Eilish,1
3,Lose Control,Teddy Swims,1
4,APT.,ROSE & Bruno Mars,1


In [None]:

# Smoke-test the API client: fetch the top tracks for one artist URI and print
# each track's name together with its cover-art URL.
lz_uri = 'spotify:artist:36QJpDe2go2KgaRleHCDTp'

spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=SPOTIPY_CLIENT_ID,
        client_secret=SPOTIPY_CLIENT_SECRET,
    )
)

results = spotify.artist_top_tracks(lz_uri)

for track in results['tracks'][:10]:
    # Guard against an empty image list (the batch-processing cell further
    # down applies the same check); indexing [0] blindly would raise IndexError.
    images = track['album']['images']
    cover_url = images[0]['url'] if images else '(no cover art)'
    print('track    : ' + track['name'])
    print('cover art: ' + cover_url)
    print()

In [None]:
# Name of the first entry under the response's "tracks" key.
first_track = results["tracks"][0]
print(first_track["name"])

In [7]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Artist URI and API client, followed by a track search whose query is
# restricted by both artist and track title.
lz_uri = 'spotify:artist:36QJpDe2go2KgaRleHCDTp'

spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=SPOTIPY_CLIENT_ID,
        client_secret=SPOTIPY_CLIENT_SECRET,
    ),
)

results = spotify.search(
    q="artist:Lana Del Rey track:Born to die",
    type='track',
    limit=10,
)

In [7]:
# Dump the raw response held in `results` to inspect its structure.
print(results)


NameError: name 'results' is not defined

In [None]:
# Pull the ID of the first listed artist off the first search hit.
first_hit = results['tracks']['items'][0]
artist_id = first_hit['artists'][0]['id']
print(artist_id)

In [None]:
# Show the list of track items contained in the search response.
print(results["tracks"]["items"])

In [None]:
# Look up the artist record for `artist_id` and print the raw response.
artist_results = spotify.artist(artist_id)
print(artist_results)


In [None]:
# Read the album ID off the first search hit, then fetch that album's record
# and print both.
first_item = results['tracks']['items'][0]
album_id = first_item['album']['id']
print(album_id)

album_results = spotify.album(album_id)
print(album_results)



In [None]:
# Enrich the first MAX_SONGS rows of df_combined_clean with the Spotify
# popularity score (and collect the artist's genres) using one track search
# plus one artist lookup per song. `time` is imported in the notebook's first cell.
MAX_SONGS = 100     # number of leading rows to look up
PAUSE_EVERY = 50    # take a break after this many songs
PAUSE_SECONDS = 30  # length of each break, in seconds

# One entry per processed row, in row order; None marks "not found / failed".
popularities = []
genres = []
not_found = []

# Only the rows that are written back below are looked up, so the result
# lists always match the assignment target in length.
rows_to_process = df_combined_clean.head(MAX_SONGS)

for processed, (index, row) in enumerate(rows_to_process.iterrows(), start=1):
    try:
        # Search for the track
        results = spotify.search(q=f"artist:{row['artist']} track:{row['title']}", type='track', limit=1)
        items = results['tracks']['items']

        if items:
            track = items[0]

            # The genres live on the artist record, not on the track.
            artist_id = track['artists'][0]['id']
            artist_info = spotify.artist(artist_id)

            # Append only after both API calls succeeded, so a failure in the
            # artist lookup cannot leave the two lists with different lengths.
            popularities.append(track['popularity'])
            genres.append(artist_info['genres'])
        else:
            popularities.append(None)
            genres.append(None)
            not_found.append(f"{row['title']} by {row['artist']}")

    except Exception as e:
        # Best effort: record the failure and keep going with the next song.
        print(f"Error processing {row['title']} by {row['artist']}: {str(e)}")
        popularities.append(None)
        genres.append(None)
        not_found.append(f"{row['title']} by {row['artist']}")

    # Rate limiting: counted by songs handled (not by index label) and kept
    # outside the try block so it also applies after a failed lookup.
    # No break is needed once the last song is done.
    if processed % PAUSE_EVERY == 0 and processed < len(rows_to_process):
        print(f"Processed {processed} songs. Taking a {PAUSE_SECONDS} second break...")
        time.sleep(PAUSE_SECONDS)

# Add the new column for exactly the rows that were processed.
df_combined_clean.loc[rows_to_process.index, 'popularity'] = popularities
# The per-song genre lists stay in `genres`; they are not written to the frame here.


In [None]:
# Report how many lookups came back empty and preview the updated frame.
missing_count = len(not_found)
print(f"\nSongs not found: {missing_count}")
print("\nFirst few rows of updated dataset:")
preview = df_combined_clean.head()
print(preview)

In [None]:
# Batch-enrich the rows of df_combined_clean with Spotify track metadata and
# append the matches to a CSV after every batch, so an interrupted run can be
# resumed instead of restarted.

# Process settings
BATCH_SIZE = 50   # rows looked up between two saves
SLEEP_TIME = 30   # seconds to pause between batches

# Get the resume point from the existing backup file, if there is one.
# NOTE(review): the backup only contains rows for which a match was saved, so
# its length undercounts the source rows already attempted whenever a lookup
# missed or failed; a resumed run may therefore repeat some rows. Confirm
# whether duplicates in the backup matter downstream.
backup_file = os.path.join('..', 'data', '4_spotify_songs.csv')
if os.path.exists(backup_file):
    processed_df = pd.read_csv(backup_file)
    start_index = len(processed_df)
    print(f'Resuming from index {start_index}')
else:
    start_index = 0
    print('Starting new processing')

total_rows = len(df_combined_clean)
total_batches = (total_rows + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division
# Running count of rows in the backup file. It is defined before the loop so
# the final summary also works when there is nothing left to process.
total_saved = start_index

# Process in batches
for start_idx in range(start_index, total_rows, BATCH_SIZE):
    end_idx = min(start_idx + BATCH_SIZE, total_rows)
    batch = df_combined_clean.iloc[start_idx:end_idx]
    tracks_data = []

    print(f'\nProcessing batch {start_idx//BATCH_SIZE + 1} of {total_batches}')
    print(f'Processing rows {start_idx} to {end_idx}')

    for _, row in batch.iterrows():
        # Read the columns outside the try block: a missing column is a
        # problem with the whole frame and should stop the run, not be
        # reported once per row.
        title, artist = row['title'], row['artist']
        try:
            # search track
            results = spotify.search(q=f'track:{title} artist:{artist}', type='track', limit=1)
            items = results['tracks']['items']
            if not items:
                continue  # no match on Spotify: nothing to save for this row

            track = items[0]
            album = track['album']
            lead_artist = track['artists'][0]

            # get genres (kept in a local name so the `genres` list built by
            # the popularity cell is not overwritten)
            artist_genres = spotify.artist(lead_artist['id'])['genres']

            tracks_data.append({
                'original_title': title,
                'original_artist': artist,
                'spotify_title': track['name'],
                'spotify_artist': lead_artist['name'],
                'album': album['name'],
                'release_date': album['release_date'],
                'popularity': track['popularity'],
                'duration_ms': track['duration_ms'],
                'explicit': track['explicit'],
                'album_cover': album['images'][0]['url'] if album['images'] else None,
                'genres': artist_genres if artist_genres else None
            })

        except Exception as e:
            # Best effort: report the failure and move on to the next row.
            print(f'Error processing track {title}: {str(e)}')

    # Save batch results
    if tracks_data:
        temp_df = pd.DataFrame(tracks_data)
        # Write the header only when the file is being created.
        temp_df.to_csv(backup_file,
                       mode='a',
                       header=not os.path.exists(backup_file),
                       index=False)
        total_saved += len(tracks_data)

        # Report on the last record that was actually saved, so the title and
        # the genres printed below always belong to the same track.
        last_saved = tracks_data[-1]
        # The notebook imports the `datetime` module, hence datetime.datetime.
        timestamp = datetime.datetime.now().strftime('%H:%M:%S')
        print(f'Batch completed at {timestamp}')
        print(f'Processed {len(tracks_data)} tracks in this batch')
        print(f'Total tracks processed: {total_saved}')
        print(f'Last track processed: {last_saved["original_title"]}')
        print(f'Last track genres: {last_saved["genres"] or "No genres found"}')

    # Pause between batches even when a batch saved nothing: the API was still
    # called once per row.
    if end_idx < total_rows:  # Don't sleep after the last batch
        print(f'Sleeping for {SLEEP_TIME} seconds...')
        print('-' * 50)
        time.sleep(SLEEP_TIME)

print('\nAll batches processed!')
print(f'Final total tracks processed: {total_saved}')

In [5]:
# Persist the updated frame. os.path.join keeps the relative path valid on
# every OS; a hard-coded backslash is a directory separator on Windows only.
df_combined_clean.to_csv(os.path.join('..', 'data', '4_extended_features.csv'), index=False)

In [10]:
# Load the Spotify track table and preview it. The path is built with
# os.path.join so it resolves on macOS/Linux as well as on Windows.
df_spotify_songs = pd.read_csv(os.path.join('..', 'data', '4_spotify_million_tracks.csv'))

df_spotify_songs.head()

Unnamed: 0,original_title,original_artist,spotify_title,spotify_artist,album,release_date,popularity,duration_ms,explicit,album_cover,genres
0,Je Sais Que La Terre Est Plate,Raphaël,Je sais que la Terre est plate,Raphaël,Je Sais Que La Terre Est Plate,2008-03-14,14,150040,False,https://i.scdn.co/image/ab67616d0000b2739e6b95...,"['chanson', 'french pop', 'french rock', 'nouv..."
1,On Efface,Julie Zenatti,On efface,Julie Zenatti,Comme vous...,2004-03-21,1,253000,False,https://i.scdn.co/image/ab67616d0000b27398d445...,"['chanson', 'french pop']"
2,Howells Delight,The Baltimore Consort,Howells Delight,Anonymous,The Best of the Baltimore Consort,2011-02-01,3,240400,False,https://i.scdn.co/image/ab67616d0000b27353a906...,['medieval']
3,Martha Served,I Hate Sally,Martha Served,I Hate Sally,Don't Worry Lady,2007-06-12,1,138760,True,https://i.scdn.co/image/ab67616d0000b273e6d949...,"['canadian metal', 'canadian post-hardcore', '..."
4,Zip-A-Dee-Doo-Dah,Orlando Pops Orchestra,"Zip-a-Dee-Doo-Dah (From ""Song of the South"")",Orlando Pops Orchestra,"Most Amazing Movie, Musical & TV Themes, Vol.6",2022-10-07,0,199986,False,https://i.scdn.co/image/ab67616d0000b27349ea4d...,['pops orchestra']


In [17]:
# Distinct values of the genres column, bound to a name so later cells can
# reuse them (the dangling `all_genres = ` was a syntax error).
# NOTE(review): judging by the saved output, each value is a stringified list
# such as "['chanson', 'french pop']", so this is the set of distinct genre
# *combinations*; parse the strings (e.g. ast.literal_eval) if a flat list of
# individual genres is what is wanted.
all_genres = df_spotify_songs.genres.unique()
all_genres

array(["['chanson', 'french pop', 'french rock', 'nouvelle chanson francaise']",
       "['chanson', 'french pop']", "['medieval']", ...,
       "['country rap', 'horrorcore']", "['jazz fusion', 'jazz guitar']",
       "['progressive sludge', 'sludge metal']"],
      shape=(1852,), dtype=object)