In [1]:
import ast
import json
import os

import dotenv
import pandas as pd
import requests

import lyricscraper
# Load variables from a .env file so the os.getenv() lookups further down can see them
dotenv.load_dotenv()

True

In [2]:
# Spotify client ID and secret, read from the environment (None when unset)
spotify_id, spotify_secret = map(os.environ.get, ('SpotifyID', "SpotifySecret"))

In [3]:
# Endpoint, headers and form fields for the Spotify client-credentials token request
url = "https://accounts.spotify.com/api/token"
headers = dict([("Content-Type", "application/x-www-form-urlencoded")])
data = dict(
    grant_type="client_credentials",
    client_id=spotify_id,
    client_secret=spotify_secret,
)

In [4]:
# Request access token (works for one hour).
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() reports a rejected request here rather than as a KeyError
# when 'access_token' is read from the response body.
r = requests.post(url, headers=headers, data=data, timeout=30)
r.raise_for_status()
r

<Response [200]>

In [5]:
# Decode the token response and pull out the token and its type
access_data = json.loads(r.text)
access_token, token_type = (access_data[field] for field in ('access_token', 'token_type'))

In [6]:
# Load in the Genius API keys (each is None when its variable is unset)
genius_id = os.getenv("GeniusID")
genius_secret = os.getenv("GeniusSecret")
genius_token = os.getenv("GeniusToken")

# Get user-agent: httpbin echoes back the User-Agent header this client sends.
# The timeout stops a stalled connection from blocking the notebook, and
# raise_for_status() fails here instead of on a missing 'user-agent' key.
r = requests.get('https://httpbin.org/user-agent', timeout=30)
r.raise_for_status()
useragent = r.json()['user-agent']
# NOTE: this rebinds `headers`, replacing the Spotify token-request headers.
headers = {'User-agent': useragent}

In [7]:
# The list-valued columns are stored in the CSV as the text of a Python list
# (e.g. "['Into You', 'Glad You Came']"). Without parsing, each cell is one long
# string, and iterating over it later yields single characters instead of songs.
LIST_COLUMNS = ["Playlist_Songs", "Playlist_Artists", "Playlist_Song_IDs"]


def parse_list_cell(cell):
    """Convert a stringified list read from the CSV back into a list.

    A blank cell becomes an empty list; malformed text raises from
    ast.literal_eval so bad rows are noticed at load time.
    """
    return ast.literal_eval(cell) if cell else []


playlist_data_full = pd.read_csv(
    'playlist_data.csv',
    converters={column: parse_list_cell for column in LIST_COLUMNS},
)
playlist_data_full.head()

Unnamed: 0,Playlist_ID,Playlist_Name,Playlist_Songs,Playlist_Artists,Playlist_Song_IDs
0,6mtYuOxzl58vSGnEDtZ9uB,Pop Hits 2000s – 2025,"['Into You', 'Glad You Came', 'Dark Horse', 'W...","['Ariana Grande', 'The Wanted', 'Katy Perry', ...","['2meEiZKWkiN28gITzFwQo5', '1OXfWI3FQMdsKKC6lk..."
1,34NbomaTu7YuOYnky8nLXL,Pop Hits 2025 (Top 50),"['Die With A Smile', 'APT.', 'Espresso', ""we c...","['Lady Gaga', 'ROSÉ', 'Sabrina Carpenter', 'Ar...","['2plbrEY59IikOBgBGLjaoe', '5vNRhkKd0yEAg8suGB..."
2,4Jb4PDWREzNnbZcOHPcZPy,COUNTRY HITS 2025 🔥 New Country Songs + Top Hits,"['I Had Some Help (Feat. Morgan Wallen)', ""Aus...","['Post Malone', 'Dasha', 'mgk', 'Dylan Marlowe...","['5IZXB5IKAD2qlvTPJYDCFB', '2uqYupMHANxnwgeiXT..."
3,1Cgey68pUlQGsCPI2wJuxr,Best of 2025 🔥 Most Popular Hits 2025 Hits,"['APT.', 'Anxiety', 'Die With A Smile', 'Messy...","['ROSÉ', 'Doechii', 'Lady Gaga', 'Lola Young',...","['5vNRhkKd0yEAg8suGBpjeY', '1musbempyJAw5gfSKZ..."
4,2L2HwKRvUgBv1YetudaRI3,Pop 2000-2010 Bangers,"['Whatcha Say', 'Airplanes (feat. Hayley Willi...","['Jason Derulo', 'B.o.B', 'Bruno Mars', 'Tinch...","['7xkQdy0cy5ymoWT7nedvLz', '1QnvpPFP4Q3FHbDchq..."


In [9]:
# Shared lookup tables, filled in while the playlists are processed
failed_song_counts = dict()  # (song_title, artist) -> number of failed URL lookups
lyrics_cache = dict()  # genius_url -> lyrics
genius_url_cache = dict()  # (song_title, artist) -> genius_url

In [10]:
import time
import lyricscraper as ls
import importlib
import concurrent.futures

# Pick up any edits made to the lyricscraper module since it was imported
importlib.reload(ls)

# Work on a copy so the frame loaded from disk stays untouched; the lyric lists
# (one per playlist) are accumulated separately.
playlist_data = playlist_data_full.copy()
playlist_lyrics = list()

def fetch_genius_url(song_title, artist, genius_token):
    """Look up the Genius URL for a song, with caching and failure tracking.

    Results are stored in ``genius_url_cache`` under a lower-cased
    ``(song_title, artist)`` key, so a song that is already cached is not
    looked up again (a ``None`` result is cached too). Every call that ends
    without a URL increments ``failed_song_counts`` for that key, whether the
    ``None`` came from a fresh lookup or from the cache, so the counts show
    how many times a song was missed rather than always stopping at one.

    Args:
        song_title: Song title; a falsy value is treated as "".
        artist: Artist name; a falsy value is treated as "".
        genius_token: Token passed through to ``ls.get_genius_url``.

    Returns:
        The value returned by ``ls.get_genius_url`` for this song; ``None``
        is treated as "no URL found".
    """
    song_title = song_title or ""
    artist = artist or ""
    song_key = (song_title.lower(), artist.lower())  # Normalize case

    if song_key in genius_url_cache:
        url = genius_url_cache[song_key]  # Use cached URL (may be a cached miss)
    else:
        url = ls.get_genius_url(song_title, artist, genius_token)
        genius_url_cache[song_key] = url  # Store in cache

    # Count the miss on every request, not just the first uncached one.
    # NOTE: this runs in worker threads and the increment is not lock-protected.
    if url is None:
        failed_song_counts[song_key] = failed_song_counts.get(song_key, 0) + 1

    return url

def fetch_lyrics(genius_url):
    """Return the lyrics for a Genius URL, scraping only on a cache miss.

    A falsy ``genius_url`` yields the "Lyrics not found" placeholder without
    touching the cache. Otherwise the result of ``ls.scrape_lyrics`` is stored
    in ``lyrics_cache`` (the placeholder is stored when the scraper returns
    nothing) and the cached value is returned.
    """
    if not genius_url:
        return "Lyrics not found"

    if genius_url not in lyrics_cache:
        scraped = ls.scrape_lyrics(genius_url)
        lyrics_cache[genius_url] = scraped or "Lyrics not found"

    return lyrics_cache[genius_url]

# Bound the retries so a playlist that keeps failing cannot stall the run forever.
MAX_ATTEMPTS = 3
RETRY_DELAY_SECONDS = 5

for playlist_name, songs, artists in zip(
    playlist_data['Playlist_Name'],
    playlist_data['Playlist_Songs'],
    playlist_data['Playlist_Artists'],
):
    print(f"Grabbing Genius URLs for playlist: {playlist_name}")

    lyrics_list = None  # stays None until one attempt completes
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            # Use ThreadPoolExecutor to parallelize Genius URL fetching
            with concurrent.futures.ThreadPoolExecutor() as executor:
                genius_urls = list(executor.map(lambda sa: fetch_genius_url(*sa, genius_token), zip(songs, artists)))

            # Use ThreadPoolExecutor to parallelize lyrics scraping
            with concurrent.futures.ThreadPoolExecutor() as executor:
                lyrics_list = list(executor.map(fetch_lyrics, genius_urls))

            break  # Everything succeeded, stop retrying

        except Exception as e:
            if attempt < MAX_ATTEMPTS:
                print(f"Error processing playlist '{playlist_name}': {e}. Retrying...")
                time.sleep(RETRY_DELAY_SECONDS)
            else:
                print(f"Error processing playlist '{playlist_name}': {e}. Giving up after {MAX_ATTEMPTS} attempts.")

    if lyrics_list is None:
        # Keep one entry per playlist (and one placeholder per song) so the
        # collected lists still line up with the dataframe rows.
        lyrics_list = ["Lyrics not found"] * len(songs)

    playlist_lyrics.append(lyrics_list)  # or extend(lyrics_list) if you want a flat list

# Attach the collected lyric lists as a new column on a fresh copy of the frame
playlist_data = playlist_data.assign(Playlist_Lyrics=playlist_lyrics)

# Report the ten songs whose Genius lookup failed most often
if failed_song_counts:
    print("\n🔹 **Most Frequently Missing Songs** 🔹")
    # Negated count sorts highest-first; ties keep their insertion order
    sorted_missing_songs = sorted(failed_song_counts.items(), key=lambda entry: -entry[1])
    top_missing = sorted_missing_songs[:10]
    for (song, artist), count in top_missing:
        print(f"{song.title()} by {artist.title()} - Missing {count} times")


Grabbing Genius URLs for playlist: Pop Hits 2000s – 2025
No valid match found for: what i want by lil meta
No valid match found for: devil by solina
No valid match found for: rhude by lindasson
No valid match found for: sara by kenzo cregan


KeyboardInterrupt: 

In [12]:
# Persist the playlists together with their collected lyrics
lyrics_csv_path = "Playlist_data_with_lyrics.csv"
playlist_data.to_csv(lyrics_csv_path)

In [13]:
# Total number of lyric entries across every playlist
total_songs = sum(map(len, playlist_data["Playlist_Lyrics"]))
total_songs

153093

In [14]:
# Count the placeholder entries, i.e. songs for which no lyrics were obtained
lyrics_not_found_count = 0
for lyrics_list in playlist_data["Playlist_Lyrics"]:
    lyrics_not_found_count += lyrics_list.count("Lyrics not found")
lyrics_not_found_count

39030

In [16]:
# Share of all songs whose lyrics could not be retrieved.
# Guarded so that an empty lyrics column gives NaN instead of ZeroDivisionError.
lyrics_not_found_count / total_songs if total_songs else float("nan")

0.2549430738178754

In [None]:
# Number of distinct (song_title, artist) keys stored in the URL cache so far
len(genius_url_cache)