In [2]:
import os
import time

import caffeine
import lyricsgenius
import pandas as pd
from dotenv import load_dotenv
from IPython.display import clear_output
from langdetect import detect
from langdetect import DetectorFactory
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook

# Song Sentiment Data Set

The primary goal of our project is to create a song sentiment model. The aim is to input a song and have the model report the leading emotion(s) or sentiment of that song. Ideally this would take an audio file as input; however, the first-pass target is to analyze song lyrics.

In order to accomplish this goal, we require a dataset of songs labelled with appropriate sentiments. This notebook documents efforts to acquire such a dataset.

## ChatGPT Dataset Generation

We asked ChatGPT to give a list of popular songs along with their dominant sentiments. It was kind enough to provide us some data in CSV format. This data was copied and pasted into `chatgpt_song_list.csv`.  

ChatGPT was reluctant to give us 1,000 entries. Upon further analysis, a significant number of its data points were duplicates.


In [4]:
# Load the ChatGPT-generated song list and preview its first rows.
df = pd.read_csv('../data/chatgpt_song_list.csv')
df.head()

Unnamed: 0,Sentiment,Song Title,Artist
0,Happy / Joyful,Happy,Pharrell Williams
1,Happy / Joyful,Walking on Sunshine,Katrina and the Waves
2,Happy / Joyful,Can't Stop the Feeling!,Justin Timberlake
3,Happy / Joyful,Uptown Funk,Mark Ronson ft. Bruno Mars
4,Happy / Joyful,I Gotta Feeling,The Black Eyed Peas


In [5]:
# Count how often each distinct row occurs; a count above 1 is a repeated entry.
df.value_counts()

Sentiment        Song Title                  Artist           
Happy / Joyful   Good Vibrations             The Beach Boys       33
                 Happy Together              The Turtles          22
Love / Romantic  Crazy in Love               Beyoncé ft. Jay-Z    21
                 Can't Help Falling in Love  Elvis Presley        21
Happy / Joyful   Lovely Day                  Bill Withers         21
                                                                  ..
Wistful          Everybody Hurts             R.E.M.                1
                 Fast Car                    Tracy Chapman         1
                 Fields of Gold              Sting                 1
                 Fix You                     Coldplay              1
                 Yesterday                   The Beatles           1
Name: count, Length: 308, dtype: int64

In [6]:
# Keep a single copy of each distinct row, then summarise what is left.
df = df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 309 entries, 0 to 1888
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Sentiment   309 non-null    object
 1   Song Title  309 non-null    object
 2   Artist      308 non-null    object
dtypes: object(3)
memory usage: 9.7+ KB


Removing duplicate entries dropped the number of data points from 1,888 to 309. While this list might be useful, there has to be a better way.

## Kaggle Dataset

Dataset saved as muse_v3.csv


In [8]:
# Characters removed from the stringified seed list before it is split on commas.
_SEED_NOISE = str.maketrans('', '', "[]' ")


def _split_seeds(raw):
    """Turn a stringified list such as "['fun', 'sad']" into ['fun', 'sad']."""
    return raw.translate(_SEED_NOISE).split(',')


df = pd.read_csv('../data/muse_v3.csv')

# Parse the seeds column, then emit one row per (song, seed) pair.
df['seeds'] = df['seeds'].apply(_split_seeds)
df = df.explode('seeds')

df.head()

Unnamed: 0,lastfm_url,track,artist,seeds,number_of_emotion_tags,valence_tags,arousal_tags,dominance_tags,mbid,spotify_id,genre
0,https://www.last.fm/music/eminem/_/%2527till%2...,'Till I Collapse,Eminem,aggressive,6,4.55,5.273125,5.690625,cab93def-26c5-4fb0-bedd-26ec4c1619e1,4xkOaSrkexMciUUogZKVTS,rap
1,https://www.last.fm/music/metallica/_/st.%2banger,St. Anger,Metallica,aggressive,8,3.71,5.833,5.42725,727a2529-7ee8-4860-aef6-7959884895cb,3fOc9x06lKJBhz435mInlH,metal
2,https://www.last.fm/music/rick%2bross/_/speedi...,Speedin',Rick Ross,aggressive,1,3.08,5.87,5.49,,3Y96xd4Ce0J47dcalLrEC8,rap
3,https://www.last.fm/music/m.i.a./_/bamboo%2bbanga,Bamboo Banga,M.I.A.,aggressive,13,6.555071,5.537214,5.691357,99dd2c8c-e7c1-413e-8ea4-4497a00ffa18,6tqFC1DIOphJkCwrjVzPmg,hip-hop
3,https://www.last.fm/music/m.i.a./_/bamboo%2bbanga,Bamboo Banga,M.I.A.,fun,13,6.555071,5.537214,5.691357,99dd2c8c-e7c1-413e-8ea4-4497a00ffa18,6tqFC1DIOphJkCwrjVzPmg,hip-hop


Seeds is a list of identifying sentiments that, when used as a search term, return the specified song. We are going to turn that around and presume that they act as sentiment labels for the song. 

The first step is to unpack the list of sentiments, giving one row for each. 

In [10]:
# How many rows carry each seed value.
df['seeds'].value_counts()


seeds
aggressive    1000
lush          1000
angry         1000
nocturnal     1000
gloomy        1000
              ... 
agreeable        2
sprightly        2
opulent          1
virile           1
hymn-like        1
Name: count, Length: 276, dtype: int64

In [11]:
# remove the songs with no sentiment
df_expanded = df[df['seeds'] != '']

# keep only the sentiments that occur more than 750 times; start from the
# frame without empty seeds so the step above actually takes effect
df = df_expanded.groupby('seeds').filter(lambda x: len(x) > 750)

# sample 100 songs from each remaining sentiment; the fixed random_state makes
# a re-run pick the same songs, and groupby.sample keeps the 'seeds' column
df = df.groupby('seeds').sample(n=100, random_state=42).reset_index(drop=True)
df['seeds'].value_counts()

seeds
aggressive    100
sensual       100
sardonic      100
sarcastic     100
sad           100
             ... 
gloomy        100
gentle        100
fun           100
fierce        100
witty         100
Name: count, Length: 99, dtype: int64

In [12]:

# randomize the order of the songs; the fixed random_state keeps the order
# identical between runs, so chunk files written later map to the same rows
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()


Unnamed: 0,lastfm_url,track,artist,seeds,number_of_emotion_tags,valence_tags,arousal_tags,dominance_tags,mbid,spotify_id,genre
0,https://www.last.fm/music/albert%2bking/_/the%...,The Very Thought Of You (LP Version),Albert King,lazy,3,3.406104,3.400649,5.584286,42b8da32-2fd7-42e1-89dd-826e74db2b0f,,blues
1,https://www.last.fm/music/the%2bidle%2brace/_/...,The Lady Who Said She Could Fly,The Idle Race,whimsical,1,6.65,4.77,5.31,87b5f3cc-f9da-4228-83c4-1fcf8badbb1e,0S5aN73VSiew2whyl1hS3u,british
2,https://www.last.fm/music/pneumogaastriq/_/00005,00005,Pneumogaastriq,harsh,3,5.136667,3.796667,5.363333,,,soundtrack
3,https://www.last.fm/music/frankie%2blymon%2ban...,Teen Angel,Frankie Lymon and The Teenagers,intimate,2,3.61,3.01,3.145,,,
4,https://www.last.fm/music/the%2bguess%2bwho/_/...,Of A Dropping Pin,The Guess Who,passionate,1,7.17,6.33,6.62,4252f9f9-49ad-44c5-a559-b990e0b6ec08,6Mnl5gekqdpg4Ul3A7S6Xy,classic rock


In [16]:
# Read the Genius API token from the environment (load_dotenv() is called first
# so a local .env file can supply it).
load_dotenv()
GENIUS_API_KEY = os.getenv('GENIUS_API_KEY')

# Shared Genius client used by get_lyrics.
# NOTE(review): "You might also like" and "Embed" look like page artefacts found
# in lyric text rather than terms that appear in song titles; confirm that
# excluded_terms is the right place for them (tidy_lyrics also strips both).
genius = lyricsgenius.Genius(
    access_token=GENIUS_API_KEY,
    timeout=15,
    sleep_time=1,
    verbose=False,
    remove_section_headers=True,
    skip_non_songs=True,
    excluded_terms=["(Remix)", "(Live)", "You might also like", "Embed"],
    retries=3
)

In [17]:
def tidy_lyrics(lyrics):
    """Strip page artefacts from a raw lyrics string.

    Steps, in order:
      1. drop everything up to and including the first 'Lyrics' marker;
      2. drop a trailing 'Embed';
      3. remove every 'You might also like' and every ';'
         (';' is the separator used for the CSV files written later);
      4. drop any digits left at the very end of the text.

    Parameters
    ----------
    lyrics : str
        Raw lyrics text.

    Returns
    -------
    str
        The cleaned text; may be empty when nothing but artefacts was present.
    """
    marker = 'Lyrics'
    start = lyrics.find(marker)
    if start != -1:
        lyrics = lyrics[start + len(marker):]

    if lyrics.endswith('Embed'):
        lyrics = lyrics[:-len('Embed')]

    lyrics = lyrics.replace('You might also like', '')
    lyrics = lyrics.replace(';', '')

    # Guard on emptiness first: indexing [-1] on '' raised IndexError when the
    # text was empty or consisted only of digits.
    while lyrics and lyrics[-1].isdigit():
        lyrics = lyrics[:-1]

    return lyrics


def get_lyrics(track, artist):
    """Look up a song on Genius and return its tidied lyrics.

    Parameters
    ----------
    track : str
        Song title to search for.
    artist : str
        Artist name to search for.

    Returns
    -------
    str or None
        The lyrics after tidy_lyrics, or None when the search finds nothing
        or the lookup/cleaning raises an error.
    """
    try:
        song = genius.search_song(track, artist, get_full_info=False)
        if song is None:
            return None
        return tidy_lyrics(song.lyrics)
    except Exception:
        # Best effort: one failed lookup must not abort a long scrape.
        # Catching Exception (not a bare except) still lets Ctrl-C /
        # KeyboardInterrupt stop the run.
        return None

  

In [18]:

# Smoke test: fetch and show the lyrics for a single well-known song.
track, artist = "Yellow", "ColdPlay"
lyrics = get_lyrics(track, artist)
print(lyrics)


Look at the stars
Look how they shine for you
And everything you do
Yeah, they were all yellow
I came along
I wrote a song for you
And all the things you do
And it was called "Yellow"
So then I took my turn
Oh, what a thing to have done
And it was all yellow

(Aah) Your skin, oh, yeah, your skin and bones
(Ooh) Turn into something beautiful
(Aah) And you know, you know I love you so
You know I love you so

I swam across
I jumped across for you
Oh, what a thing to do
'Cause you were all yellow
I drew a line
I drew a line for you
Oh, what a thing to do
And it was all yellow
See Coldplay LiveGet tickets as low as $93
(Aah) And your skin, oh, yeah, your skin and bones
(Ooh) Turn into something beautiful
(Aah) And you know, for you, I'd bleed myself dry
For you, I'd bleed myself dry

It's true
Look how they shine for you
Look how they shine for you
Look how they shine for
Look how they shine for you
Look how they shine for you
Look how they shine

Look at the stars
Look how they shine for 

In [None]:
# Registers DataFrame.progress_apply, used below.
tqdm_notebook.pandas()

N = 10  # rows fetched per checkpoint file

# Without the directory every to_csv call fails, and each failure costs a
# 60 second sleep in the handler below.
os.makedirs('lyrics_data', exist_ok=True)

# NOTE(review): `sampled` is not defined in any cell visible in this notebook;
# presumably it is the sampled and shuffled song frame. Confirm and define it
# explicitly so the notebook survives Restart & Run All.
caffeine.on(display=False)
try:
    for i in tqdm(range(0, len(sampled), N)):
        chunk_path = f'lyrics_data/lyrics_{i}.csv'

        # A chunk file on disk means this slice was already fetched.
        if os.path.exists(chunk_path):
            continue

        try:
            chunk = sampled.iloc[i:i+N].copy()
            chunk['lyrics'] = chunk.progress_apply(lambda x: get_lyrics(x['track'], x['artist']), axis=1)
            chunk.to_csv(chunk_path, index=False, sep=';')
        except Exception as e:
            # Report the failure instead of hiding it, back off, and move on.
            # No file is written for this chunk, so re-running the cell retries it.
            tqdm.write(f'chunk {i} failed ({e!r}); sleeping 60s before continuing')
            time.sleep(60)
            continue
finally:
    # Always release the sleep inhibitor, even on Ctrl-C or an unexpected error.
    caffeine.off()

In [13]:
# combine the chunks into a single dataframe
def _chunk_offset(name):
    """Return the row offset encoded in 'lyrics_<offset>.csv', or None for any other file name."""
    stem, ext = os.path.splitext(name)
    prefix = 'lyrics_'
    offset = stem[len(prefix):]
    if ext == '.csv' and stem.startswith(prefix) and offset.isdigit():
        return int(offset)
    return None


# Ignore stray files (.DS_Store, checkpoints, ...) and read the chunks in
# offset order so the combined file is the same on every run.
chunk_files = sorted(
    (name for name in os.listdir('lyrics_data/') if _chunk_offset(name) is not None),
    key=_chunk_offset,
)
if not chunk_files:
    raise FileNotFoundError("no lyrics_<offset>.csv chunk files found in lyrics_data/")

lyrics_dfs = [pd.read_csv(f'lyrics_data/{name}', sep=';') for name in chunk_files]

lyrics_df = pd.concat(lyrics_dfs)

lyrics_df.to_csv('lyrics_data.csv', index=False, sep=';')

In [51]:
# Read in the lyrics data
lyrics_df = pd.read_csv('lyrics_data.csv', sep=';')

# Remove songs with no lyrics
lyrics_df = lyrics_df[lyrics_df['lyrics'].notnull()]

# langdetect is non-deterministic unless seeded; pin the seed so a re-run
# keeps the same set of songs.
DetectorFactory.seed = 0


def _is_english(text):
    """Return True when langdetect classifies `text` as English."""
    try:
        return detect(text) == 'en'
    except Exception:
        # Detection failed for this text; treat it as non-English so the row
        # is dropped, as before.
        return False


# Remove songs that have non-English lyrics. A single boolean mask replaces
# the row-by-row DataFrame.drop, which copied the frame for every removed row.
english_mask = pd.Series(
    [_is_english(text) for text in tqdm(lyrics_df['lyrics'])],
    index=lyrics_df.index,
    dtype=bool,
)
lyrics_df = lyrics_df[english_mask]

# Select lyrics with more than 200 characters and less than 1500 characters
lengths = lyrics_df['lyrics'].apply(len)
lyrics_df = lyrics_df[(lengths > 200) & (lengths < 1500)]

# Remove songs with $ in the lyrics
# (presumably this drops texts polluted by injected ticket ads such as
# "Get tickets as low as $93"; confirm)
lyrics_df = lyrics_df[lyrics_df['lyrics'].apply(lambda x: '$' not in x)]

lyrics_df.to_csv('lyrics_data_cleaned.csv', index=False, sep=';')

4224it [00:06, 685.66it/s]

<class 'pandas.core.frame.DataFrame'>
Index: 4221 entries, 8 to 9898
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   lastfm_url              4221 non-null   object 
 1   track                   4221 non-null   object 
 2   artist                  4221 non-null   object 
 3   seeds                   4221 non-null   object 
 4   number_of_emotion_tags  4221 non-null   int64  
 5   valence_tags            4221 non-null   float64
 6   arousal_tags            4221 non-null   float64
 7   dominance_tags          4221 non-null   float64
 8   mbid                    3698 non-null   object 
 9   spotify_id              3679 non-null   object 
 10  genre                   4130 non-null   object 
 11  lyrics                  4221 non-null   object 
dtypes: float64(3), int64(1), object(8)
memory usage: 428.7+ KB





In [45]:
# Load the cleaned lyrics table and report its (rows, columns).
# NOTE(review): the cleaning cell writes 'lyrics_data_cleaned.csv' to the working
# directory, while this cell reads it from '../data/'; confirm the file was moved
# there and is the current version.
df = pd.read_csv('../data/lyrics_data_cleaned.csv', sep=';')
print(df.shape)

def categorize_sentiment(seeds, cat_map_file):
    """Map each seed to the category columns that list it.

    Parameters
    ----------
    seeds : iterable
        Seed labels to look up; one output entry is produced per seed.
    cat_map_file : str
        File name of a CSV inside ../data/category_mappings/. Each column
        header is treated as a category and the cells below it as the seeds
        belonging to that category.

    Returns
    -------
    list of list
        For every seed, in input order, the names of the columns that contain
        it (in the file's column order). The list is empty when no column
        contains the seed.
    """
    cat_map = pd.read_csv(f"../data/category_mappings/{cat_map_file}")

    # Build the seed -> categories index once, instead of rescanning every
    # column for every seed. Blank cells (NaN) never match a seed.
    categories_by_seed = {}
    for column in cat_map.columns:
        for value in cat_map[column].dropna().unique():
            categories_by_seed.setdefault(value, []).append(column)

    # Return a fresh list per seed so callers can modify one entry safely.
    return [list(categories_by_seed.get(seed, ())) for seed in seeds]
    
seeds = df['seeds']

# The first two mappings keep every matching category for a seed.
claude_sent = categorize_sentiment(seeds, 'claude_cat_map.csv')
gpt_sent = categorize_sentiment(seeds, 'gpt_cat_map.csv')

# The second GPT mapping is reduced to its first matching category (None when there is none).
gpt_sent_2 = [
    matches[0] if matches else None
    for matches in categorize_sentiment(seeds, 'gpt_cat_map_2.csv')
]

sentiments = pd.DataFrame({
    'label': seeds,
    'claude_sent': claude_sent,
    'gpt_sent': gpt_sent,
    'gpt_sent_2': gpt_sent_2,
})

# Explode one column at a time: a seed with several categories in more than
# one mapping ends up with one row per combination.
for list_column in ('claude_sent', 'gpt_sent', 'gpt_sent_2'):
    sentiments = sentiments.explode(list_column)

# Lyrics are matched back to the exploded rows by index label.
sentiments['lyrics'] = df['lyrics']

sentiments.to_csv('../data/lyrics_sentiments.csv', index=False, sep=';')

(3154, 12)
