# 0. Set up

In [5]:
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import requests
import re
from tqdm import tqdm
import lyricsgenius
import time
import multiprocessing
import concurrent.futures

In [28]:

def _iter_paged_items(client, page):
    """Yield every item of a Spotify paging object, following its ``next`` links."""
    while page:
        yield from page['items']
        # Only ask for another page when the current one advertises a ``next`` URL.
        page = client.next(page) if page.get('next') else None


def get_artists_songs(artist, client=None):
    """
    Lists every album track for the best Spotify match of an artist name.

    Parameters:
    - artist (str): Artist name to search for; also stored verbatim in each result.
    - client (optional): Object exposing spotipy-style ``search``, ``artist_albums``,
      ``album_tracks`` and ``next`` methods. Defaults to the module-level ``sp``.

    Returns:
    - list[tuple]: ``(artist, album_name, track_name)`` triples, or an empty list
      when the search returns no artist.
    """
    spotify = sp if client is None else client

    result = spotify.search(q=artist, type='artist', limit=1)
    matches = result['artists']['items']
    if not matches:
        # No artist matched the query; report "no songs" instead of raising IndexError.
        return []
    artist_id = matches[0]['id']

    song_triples = []
    # Both endpoints return one page at a time, so walk every page; otherwise
    # larger discographies are silently truncated.
    albums_page = spotify.artist_albums(artist_id, album_type='album')
    for album in _iter_paged_items(spotify, albums_page):
        tracks_page = spotify.album_tracks(album['id'])
        song_triples.extend(
            (artist, album['name'], track['name'])
            for track in _iter_paged_items(spotify, tracks_page)
        )

    return song_triples

def get_song_lyrics(song_name, artist_name=None, api_key="21c939dbe1fae1e1009de62aa3b171be", timeout=10):
    """
    Fetches the lyrics of a song using the Musixmatch API.

    Parameters:
    - song_name (str): The name of the song.
    - artist_name (str, optional): The name of the artist to refine the search.
    - api_key (str): Your Musixmatch API key.
    - timeout (float): Seconds to wait on each HTTP request before giving up.

    Returns:
    - str: The lyrics with newlines replaced by spaces and lower-cased, or the
      message "Song not found." / "Lyrics not available.".

    Network errors (including timeouts) and non-JSON responses raised by
    `requests` are not caught here and propagate to the caller.
    """
    # NOTE(review): the default API key is committed in source; prefer passing
    # `api_key` explicitly (e.g. loaded from the environment) and rotating this value.
    base_url = "https://api.musixmatch.com/ws/1.1/"

    # Step 1: Search for the track
    search_url = f"{base_url}track.search"
    params = {
        "q_track": song_name,
        "q_artist": artist_name or "",
        "apikey": api_key,
        "s_track_rating": "desc",
        "page_size": 1
    }

    # Without a timeout a stalled connection would block indefinitely.
    response = requests.get(search_url, params=params, timeout=timeout)
    data = response.json()

    try:
        track_id = data["message"]["body"]["track_list"][0]["track"]["track_id"]
    except (KeyError, IndexError, TypeError):
        # Missing keys, an empty track list, or a non-dict body all mean "no match".
        return "Song not found."

    # Step 2: Get the lyrics
    lyrics_url = f"{base_url}track.lyrics.get"
    params = {"track_id": track_id, "apikey": api_key}

    response = requests.get(lyrics_url, params=params, timeout=timeout)
    data = response.json()

    try:
        lyrics = data["message"]["body"]["lyrics"]["lyrics_body"]
    except (KeyError, IndexError, TypeError):
        return "Lyrics not available."
    if not isinstance(lyrics, str):
        # A null/absent body cannot be normalised as text.
        return "Lyrics not available."

    return lyrics.replace('\n', ' ').lower()
    
def get_song_lyrics_genius(song_name, artist_name):
    """
    Fetches the lyrics of a song through the module-level `genius` client.

    Parameters:
    - song_name (str): The name of the song.
    - artist_name (str): The name of the artist.

    Returns:
    - str: The `lyrics` attribute of the matched song, "Lyrics not found." when
      the search yields nothing, or "An error occurred" when the lookup raises.
    """
    try:
        song = genius.search_song(song_name, artist_name)
    except Exception:
        # Best-effort lookup: report the failure as a sentinel string, but let
        # KeyboardInterrupt/SystemExit propagate (a bare `except:` would hide them).
        return "An error occurred"

    if song:
        return song.lyrics
    return "Lyrics not found."
    
def clean_lyrics(lyrics):
    """
    Normalises raw lyrics text into a single lower-cased line.

    Parameters:
    - lyrics (str): Raw lyrics text.

    Returns:
    - str: The text with any preamble before the first '[' removed, bracketed
      tags such as "[Chorus]" removed, whitespace runs collapsed to single
      spaces, surrounding whitespace stripped, and everything lower-cased.
    """
    # Drop the preamble before the first bracketed tag. When the text has no
    # '[' at all there is no preamble marker, so keep the text instead of
    # erasing all of it.
    first_bracket = lyrics.find('[')
    if first_bracket != -1:
        lyrics = lyrics[first_bracket:]

    lyrics = re.sub(r'\[.*?\]', '', lyrics)  # Remove text in square brackets
    lyrics = re.sub(r'\s+', ' ', lyrics)  # Collapse newlines and repeated spaces
    return lyrics.strip().lower()

def get_song_info(song_name, artist_name, client=None):
    """
    Looks up the release year of a track on Spotify.

    Parameters:
    - song_name (str): The name of the song.
    - artist_name (str): The name of the artist.
    - client (optional): Object exposing a spotipy-style
      ``search(q=..., type=..., limit=...)`` method. Defaults to the
      module-level ``sp`` client.

    Returns:
    - str: The part of the album's ``release_date`` before the first '-'
      (presumably the year), or "Song not found" when the search has no hits.
    """
    # Reuse one authenticated client instead of building a new one (with
    # placeholder credentials) on every call.
    spotify = sp if client is None else client

    # Search for the song by name and artist
    query = f"track:{song_name} artist:{artist_name}"
    results = spotify.search(q=query, type='track', limit=1)

    hits = results['tracks']['items']
    if not hits:
        return "Song not found"

    # Only the leading component of the release date is returned.
    return hits[0]['album']['release_date'].split('-')[0]

def assign_gender(artist):
    """
    Maps an artist name to a gender label.

    Parameters:
    - artist (str): Artist name, matched exactly against the rosters below.

    Returns:
    - str: 'M', 'F' or 'NB' for the listed solo artists; 'Group' for any
      other value.
    """
    labelled_rosters = (
        ('M', ('Justin Bieber', 'Lil Baby', '21 Savage', 'Shawn Mendes',
               'DaBaby', 'The Weeknd', 'Drake', 'Ed Sheeran', 'Young Thug',
               'Future', 'Kendrick Lamar', 'Post Malone', 'Chris Brown')),
        ('F', ('Ariana Grande', 'Rihanna', 'Megan Thee Stallion', 'Dua Lipa',
               'Taylor Swift', 'Beyonce', 'Nicki Minaj')),
        ('NB', ('Sam Smith',)),
    )

    # Rosters are checked in order; the first one containing the name wins.
    for label, roster in labelled_rosters:
        if artist in roster:
            return label

    # Anything not listed above is treated as a group.
    return 'Group'

## discographies

In [6]:
# Build the shared Spotify client using the client-credentials flow.
# NOTE(review): the client id and secret are committed in plain text; consider
# loading them from the environment and rotating these values.
client_id = '7226939d4c5e43ab969715d406ad11d9'
client_secret = '9e8ee07ae94f49928c65c59ffd4a3dec'
auth_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)

# `sp` is the global client that helpers such as get_artists_songs call.
sp = spotipy.Spotify(auth_manager=auth_manager)

In [None]:
# Load the artist table, using the CSV's first column as the index.
# Forward slashes keep this relative path valid on both Windows and POSIX;
# the backslash form only resolves on Windows.
artists = pd.read_csv("../data/artists.csv", index_col=0)
# Keep artists with more than 9 Billboard songs and more than 8 albums, then renumber rows.
artists = artists.query('num_songs_on_billboard > 9 and num_albums > 8').reset_index(drop=True)

In [5]:
# Gather (artist, album, song) triples for every artist, in artist order.
songs = [
    triple
    for artist_name in artists['artist']
    for triple in get_artists_songs(artist_name)
]

# One row per track; preview the first two rows.
songs_df = pd.DataFrame(songs, columns=['artist', 'album', 'song'])
songs_df.head(2)

Unnamed: 0,artist,album,song
0,Imagine Dragons,Reflections (From The Vault Of Smoke + Mirrors),Woke - Demo
1,Imagine Dragons,Reflections (From The Vault Of Smoke + Mirrors),The Ghost Intervention - Demo


In [None]:
# Rebinds `songs_df` to the same object as `artists` (no copy is made).
# NOTE(review): presumably a notebook shortcut for reusing an already-loaded
# table; confirm `artists` really holds per-song rows at this point.
songs_df = artists

In [6]:
# Album titles to retain; `songs_df` is filtered by exact match of its
# 'album' column against this list.
albums_to_keep = ['$ome $exy $ongs 4 U',
 '+',
 '+-=÷× (Tour Collection)',
 '- (Deluxe)',
 '1.22.03 Acoustic',
 '1017 Thug',
 '11:11',
 "1989 (Taylor's Version)",
 '2 The Hard Way',
 '2step (The Remixes)',
 '4',
 '5',
 '=',
 'A Girl Like Me',
 'A Very Trainor Christmas',
 'ANTI',
 'AUSTIN',
 'Above And Beyoncé Dance Mixes',
 'After Hours',
 'Anything Goes',
 'Autumn Variations',
 "B'Day",
 'BEASTMODE 2',
 'BETTER THAN YOU',
 'BEYONCÉ [Platinum Edition]',
 'BLAME IT ON BABY',
 "BREEZY - It's Giving Christmas",
 'BUSINESS IS BUSINESS',
 'Baby On Baby',
 'Baby On Baby 2',
 'Baby Talk 5',
 'Back On My Baby Jesus Sh!t',
 'Beam Me Up Scotty',
 'Beautiful Thugger Girls',
 'Beauty Behind The Madness',
 'Believe',
 'Billion Dollar Baby',
 'Black Panther The Album Music From And Inspired By',
 'Black Panther: Wakanda Forever - Music From and Inspired By',
 'Blank Blank',
 'Breezy',
 'COWBOY CARTER',
 'Call And Response: The Remix Album',
 "Can't Say I Ain't Country",
 'Care Package',
 'Certified Lover Boy',
 'Changes',
 'Chris Brown (Expanded Edition)',
 'Culture',
 'Culture II',
 'Culture III',
 'Currency',
 'DAMN.',
 'DS2 (Deluxe)',
 'Dangerous Woman',
 'Dangerously In Love',
 'Dark Lane Demo Tapes',
 'Dawn FM',
 'Dear Evan Hansen (Original Motion Picture Soundtrack)',
 'Dig Your Roots',
 'Drip Harder',
 'Dua Lipa',
 'EVOL',
 'Echoes Of Silence (Original)',
 'Evolve',
 'Exclusive (Expanded Edition)',
 'F-1 Trillion',
 'F.A.M.E. (Expanded Edition)',
 'FUTURE',
 'Fan of A Fan The Album (Expanded Edition)',
 "Fearless (Taylor's Version)",
 'Fever',
 'For All The Dogs',
 'Fortune (Expanded Edition)',
 'Future & Juice WRLD Present... WRLD ON DRUGS',
 'Future Hndrxx Presents: The WIZRD',
 'Future Nostalgia',
 'GNX',
 'Gloria',
 "God's Work Resurrected",
 'Good Girl Gone Bad',
 'Good News',
 'Graffiti (Expanded Edition)',
 'Green Album',
 'HNDRXX',
 'HOMECOMING: THE LIVE ALBUM',
 'HOW TF IS THIS A MIXTAPE?',
 'Hands All Over (Deluxe)',
 'Handwritten',
 'Harder Than Ever',
 'Harder Than Hard',
 'Heartbreak on a Full Moon',
 'Her Loss',
 "Here's To The Good Times",
 "Here's To The Good Times...This Is How We Roll",
 'High Off Life',
 "Hollywood's Bleeding",
 'Honestly, Nevermind',
 'House Of Balloons (Original)',
 'Hurry Up Tomorrow',
 'I AM...SASHA FIERCE',
 'I Am...World Tour',
 'I NEVER LIKED YOU',
 "I'm Up",
 "If You're Reading This It's Too Late",
 'Illuminate',
 'In The Lonely Hour',
 'Indigo (Extended)',
 'Issa Album',
 "It Won't Be Soon Before Long.",
 "It's Only Me",
 'JEFFERY',
 'JORDI (Deluxe)',
 'Journals',
 'Justice',
 'Juug Season',
 'KIRK',
 'Kiss Land',
 'LOOM',
 'Lemonade',
 'Life Rolls On',
 'Loose Change',
 'Loud',
 'Love Goes',
 'Lover',
 'MEGAN',
 'MEGAN: ACT II',
 'MIXTAPE PLUTO',
 'MTV Unplugged',
 'Make It Hot',
 'Mercury - Acts 1 & 2',
 'Midnights',
 'More Life',
 'Mr. Morale & The Big Steppers',
 'Music Of The Sun',
 'My Dear Melancholy,',
 'My Turn',
 'My World',
 'My World 2.0',
 'NOT ALL HEROES WEAR CAPES (Deluxe)',
 'Never Say Never - The Remixes',
 'Night Visions',
 'No Label',
 'No Label II',
 'No.5 Collaborations Project',
 'No.6 Collaborations Project',
 'Nothing Was The Same',
 'Origins (Deluxe)',
 'Overexposed',
 'Overly Dedicated',
 'Perfect Timing',
 'Pink Friday',
 'Pink Friday ... Roman Reloaded',
 'Pink Friday 2',
 'Pluto x Baby Pluto',
 'Positions',
 'Punk',
 'Purple Reign',
 'Purpose (Deluxe)',
 'Queen',
 'Queen Radio: Volume 1',
 'RENAISSANCE',
 'Radical Optimism',
 'Rated R',
 "Red (Taylor's Version)",
 'Red Pill Blues (Deluxe)',
 'Rich Ni**a Timeline',
 'Rich Shooters',
 'Royalty (Deluxe Version)',
 'SAVAGE MODE II',
 'SAVE ME',
 'SUPER SLIMEY',
 'SUPERFLY (Original Motion Picture Soundtrack)',
 'Sam Smith - Live From The Roundhouse',
 'Savage Mode',
 'Scorpion',
 'Section.80',
 'Shawn',
 'Shawn Mendes',
 'Slaughter King',
 'Slime & B',
 'Slime Language',
 'Slime Language 2',
 'Slime Season',
 'Slime Season 2',
 'Slime Season 3',
 'Slime Season 4',
 'Smoke + Mirrors',
 'So Far Gone',
 'So Much Fun',
 'Something for Thee Hotties',
 'Songs About Jane',
 "Speak Now (Taylor's Version)",
 'Starboy',
 'Stoney (Complete Edition)',
 'Street Gossip',
 'Suga',
 'Sweetener',
 'THE TORTURED POETS DEPARTMENT',
 'TREAT MYSELF',
 'Take Care (Deluxe)',
 "Takin' It Back",
 'Talk That Talk',
 'Thank Me Later',
 'Thank You (Deluxe Version)',
 'The Diamond Collection (Deluxe)',
 'The Lion King: The Gift',
 'The Love Train',
 'The Pinkprint',
 'The Slaughter Tape',
 'The Thrill Of It All',
 'The Voice of the Heroes',
 'Thursday (Original)',
 'Timeless',
 'Tina Snow',
 'Title (Deluxe)',
 'To Pimp A Butterfly',
 'Too Hard',
 'Traumazine',
 'Trilogy',
 'Twelve Carat Toothache',
 'Unapologetic',
 'Under The Mistletoe',
 'V',
 'Views',
 "WE DON'T TRUST YOU",
 "WE STILL DON'T TRUST YOU",
 'WHAM (Extended Version)',
 'What A Time To Be Alive',
 'Wicked: The Soundtrack',
 'Without Warning',
 'Wonder',
 'X (Expanded Edition)',
 'YRN 2 (Young Rich Niggas 2)',
 'Young Rich N*ggas',
 'Yours Truly',
 'Yung Rich Nation',
 'american dream',
 'beerbongs & bentleys',
 'eternal sunshine',
 'evermore',
 'folklore',
 'good kid, m.A.A.d city',
 'i am > i was',
 'k bye for now (swt live)',
 'reputation',
 'thank u, next',
 'untitled unmastered.',
 'x (10th Anniversary Edition)',
 '÷ (Deluxe)']

In [7]:
# Keep only tracks from the curated albums. The previous row labels are
# preserved as a regular 'index' column when the index is renumbered.
songs_df = songs_df.loc[songs_df['album'].isin(albums_to_keep)].reset_index()

## get songs

In [2]:
# Reload the saved song table, using the CSV's first column as the index.
# Forward slashes keep this relative path valid on both Windows and POSIX;
# the backslash form only resolves on Windows.
songs_df = pd.read_csv("../data/songs.csv", index_col=0)
# Global Genius client used by get_song_lyrics_genius.
# NOTE(review): the access token is committed in plain text; consider loading
# it from the environment and rotating this value.
genius = lyricsgenius.Genius("Hj_rkTZ7oZe1mjRL6W2tnwRloCXQMdE0Y-w5ugiX0ZHtiqeBRzUj_j3xM_0qBs7C")
songs_df.columns

Index(['index', 'artist', 'album', 'song', 'lyrics', 'cleaned_lyrics'], dtype='object')

In [None]:
# Fetch Genius lyrics for every row of `songs_df`, collecting one entry per
# row (in row order) in `lyrics`.
tqdm.pandas()
lyrics = []

# NOTE(review): each future is awaited immediately after it is submitted, so
# the lookups effectively run one at a time despite the thread pool.
# NOTE(review): a timeout on `result()` does not stop the worker thread, and
# leaving the `with` block waits for outstanding work (standard
# ThreadPoolExecutor behaviour) - a hung lookup may still stall the cell.
with concurrent.futures.ThreadPoolExecutor() as executor:
    for i, row in songs_df.iterrows():
        future = executor.submit(get_song_lyrics_genius, row['song'], row['artist'])
        try:
            # Wait at most 5 seconds for this lookup.
            l = future.result(timeout=5)
        except concurrent.futures.TimeoutError:
            # Record None for a timed-out lookup so `lyrics` stays aligned with the rows.
            print(f"Skipping '{row['song']}' by {row['artist']} due to timeout.")
            l = None
        lyrics.append(l)

In [31]:
# Register `progress_apply` on pandas objects.
tqdm.pandas()
# Replace every non-string entry with None. isinstance (rather than an exact
# type comparison) also accepts str subclasses.
songs_df['lyrics'] = songs_df['lyrics'].progress_apply(lambda x: x if isinstance(x, str) else None)
# Clean only non-empty lyrics; None and empty strings map to None.
songs_df['cleaned_lyrics'] = songs_df['lyrics'].progress_apply(lambda x: clean_lyrics(x) if x else None)


100%|██████████| 4000/4000 [00:00<00:00, 530790.18it/s]
100%|██████████| 4000/4000 [00:00<00:00, 6224.15it/s]


In [32]:
# Persist the song table. Forward slashes keep this relative path valid on
# both Windows and POSIX; the backslash form only resolves on Windows.
songs_df.to_csv("../data/songs.csv")

## more categories

In [4]:
# Display the distinct artist names present in the song table.
songs_df['artist'].unique()

array(['Imagine Dragons', 'Meghan Trainor', 'Justin Bieber', 'Lil Baby',
       '21 Savage', 'Ariana Grande', 'Shawn Mendes', 'Sam Smith',
       'Rihanna', 'DaBaby', 'The Weeknd', 'Drake', 'Megan Thee Stallion',
       'Ed Sheeran', 'Young Thug', 'Maroon 5', 'Future', 'Kendrick Lamar',
       'Dua Lipa', 'Post Malone', 'Taylor Swift', 'Beyonce', 'Migos',
       'Chris Brown', 'Florida Georgia Line', 'Nicki Minaj'], dtype=object)

In [9]:
# Rebuild the shared Spotify client using the client-credentials flow.
# NOTE(review): the client id and secret are committed in plain text; consider
# loading them from the environment and rotating these values.
client_id = '7226939d4c5e43ab969715d406ad11d9'
client_secret = '9e8ee07ae94f49928c65c59ffd4a3dec'
auth_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)

# `sp` is the global client that helpers such as get_artists_songs call.
sp = spotipy.Spotify(auth_manager=auth_manager)

In [24]:
# Display the distinct artist names present in the song table.
songs_df['artist'].unique()

array(['Imagine Dragons', 'Meghan Trainor', 'Justin Bieber', 'Lil Baby',
       '21 Savage', 'Ariana Grande', 'Shawn Mendes', 'Sam Smith',
       'Rihanna', 'DaBaby', 'The Weeknd', 'Drake', 'Megan Thee Stallion',
       'Ed Sheeran', 'Young Thug', 'Maroon 5', 'Future', 'Kendrick Lamar',
       'Dua Lipa', 'Post Malone', 'Taylor Swift', 'Beyonce', 'Migos',
       'Chris Brown', 'Florida Georgia Line', 'Nicki Minaj'], dtype=object)

In [25]:
from tqdm import tqdm
tqdm.pandas()
# Look up the release year for each (song, artist) pair, one Spotify query per
# row, showing a progress bar while the lookups run.
songs_df['year'] = [
    get_song_info(song_title, artist_name)
    for song_title, artist_name in tqdm(
        zip(songs_df['song'], songs_df['artist']), total=len(songs_df)
    )
]

100%|██████████| 4000/4000 [13:04<00:00,  5.10it/s]


In [33]:
# Label every row with the gender category of its artist, then show the columns.
songs_df['gender'] = songs_df['artist'].map(assign_gender)
songs_df.columns

Index(['index', 'artist', 'album', 'song', 'lyrics', 'cleaned_lyrics', 'year',
       'gender'],
      dtype='object')

In [34]:
# Load the expanded feature table, using the CSV's first column as the index.
# Forward slashes keep this relative path valid on both Windows and POSIX;
# the backslash form only resolves on Windows.
songs_full = pd.read_csv("../data/songs_expanded_25_04_11.csv", index_col=0)
songs_full.columns

Index(['album', 'song', 'lyrics', 'cleaned_lyrics', 'Segment', 'WC',
       'Analytic', 'Clout', 'Authentic', 'Tone',
       ...
       'nrc_trust', 'nrc_anticipation', 'nrc_fear', 'nrc_surprise',
       'nrc_positive', 'nrc_negative', 'nrc_disgust', 'nrc_anger',
       'nrc_sadness', 'nrc_joy'],
      dtype='object', length=136)

In [None]:
# Attach `year` and `gender` to the expanded feature table.
# Joining on the title alone multiplies rows (and can attach the wrong year)
# whenever the same title appears on several albums, so match on
# (album, song) and keep a single lookup row per key.
songs_full = songs_full.merge(
    songs_df[['album', 'song', 'year', 'gender']].drop_duplicates(subset=['album', 'song']),
    how='inner',
    on=['album', 'song'],
)

True

In [42]:
# Overwrite the expanded table with the merged result. Forward slashes keep
# this relative path valid on both Windows and POSIX; the backslash form only
# resolves on Windows.
songs_full.to_csv("../data/songs_expanded_25_04_11.csv")