In [35]:
import lyricsgenius
from langdetect import detect
import caffeine
import os
from IPython.display import clear_output
import time
from dotenv import load_dotenv
from tqdm.notebook import tqdm_notebook
import pandas as pd
from tqdm import tqdm

# Song Sentiment Data Set

The primary goal of our project is to create a song sentiment model. The is to be able to input a song and have the model report the leading emotion(s) or sentiment of the song. Ideally this would take an audio file as input, however the first pass target is to analyze song lyrics. 

In order to accomplish this goal, we require a dataset of songs labelled with appropriate sentiments. This notebook documents efforts to acquire such a dataset.

## Chat GPT Dataset Generation

We asked ChatGPT to give a list of popular lost along with their dominant sentiments. It was kind enough to provide us some data in csv format. This data was copied and pasted into song_list.csv.  

ChatGPT was reluctant to give us 1,000 entries. Upon further analysis, a significant number of its data points were duplicates.


In [None]:
df = pd.read_csv('song_list.csv')
df.head()

In [None]:
df.value_counts()

In [None]:
df = df.drop_duplicates()
df.info()

Removing duplicate entries dropped to number of data points from 1,888 to 309. While this list might be useful, there has to be a better way.

## Kaggle Dataset

Dataset saved as muse_v3.csv


In [6]:
df = pd.read_csv('muse_v3.csv')

df['seeds'] = df['seeds'].apply(lambda x: x.replace('[','').replace(']','').replace("'",'').replace(' ','').split(','))

df.head()

Unnamed: 0,lastfm_url,track,artist,seeds,number_of_emotion_tags,valence_tags,arousal_tags,dominance_tags,mbid,spotify_id,genre
0,https://www.last.fm/music/eminem/_/%2527till%2...,'Till I Collapse,Eminem,[aggressive],6,4.55,5.273125,5.690625,cab93def-26c5-4fb0-bedd-26ec4c1619e1,4xkOaSrkexMciUUogZKVTS,rap
1,https://www.last.fm/music/metallica/_/st.%2banger,St. Anger,Metallica,[aggressive],8,3.71,5.833,5.42725,727a2529-7ee8-4860-aef6-7959884895cb,3fOc9x06lKJBhz435mInlH,metal
2,https://www.last.fm/music/rick%2bross/_/speedi...,Speedin',Rick Ross,[aggressive],1,3.08,5.87,5.49,,3Y96xd4Ce0J47dcalLrEC8,rap
3,https://www.last.fm/music/m.i.a./_/bamboo%2bbanga,Bamboo Banga,M.I.A.,"[aggressive, fun, sexy, energetic]",13,6.555071,5.537214,5.691357,99dd2c8c-e7c1-413e-8ea4-4497a00ffa18,6tqFC1DIOphJkCwrjVzPmg,hip-hop
4,https://www.last.fm/music/dope/_/die%2bmf%2bdie,Die MF Die,Dope,[aggressive],7,3.771176,5.348235,5.441765,b9eb3484-5e0e-4690-ab5a-ca91937032a5,5bU4KX47KqtDKKaLM4QCzh,metal


Seeds is a list of identifying sentiments that when used as a search return the specified song. We are going to turn that around and presume that they act as identifying sentiments. 

First step is to unpack the list of sentiments give one line for each. 

In [7]:
df_expanded = df.explode('seeds')
df_expanded.head(10)

Unnamed: 0,lastfm_url,track,artist,seeds,number_of_emotion_tags,valence_tags,arousal_tags,dominance_tags,mbid,spotify_id,genre
0,https://www.last.fm/music/eminem/_/%2527till%2...,'Till I Collapse,Eminem,aggressive,6,4.55,5.273125,5.690625,cab93def-26c5-4fb0-bedd-26ec4c1619e1,4xkOaSrkexMciUUogZKVTS,rap
1,https://www.last.fm/music/metallica/_/st.%2banger,St. Anger,Metallica,aggressive,8,3.71,5.833,5.42725,727a2529-7ee8-4860-aef6-7959884895cb,3fOc9x06lKJBhz435mInlH,metal
2,https://www.last.fm/music/rick%2bross/_/speedi...,Speedin',Rick Ross,aggressive,1,3.08,5.87,5.49,,3Y96xd4Ce0J47dcalLrEC8,rap
3,https://www.last.fm/music/m.i.a./_/bamboo%2bbanga,Bamboo Banga,M.I.A.,aggressive,13,6.555071,5.537214,5.691357,99dd2c8c-e7c1-413e-8ea4-4497a00ffa18,6tqFC1DIOphJkCwrjVzPmg,hip-hop
3,https://www.last.fm/music/m.i.a./_/bamboo%2bbanga,Bamboo Banga,M.I.A.,fun,13,6.555071,5.537214,5.691357,99dd2c8c-e7c1-413e-8ea4-4497a00ffa18,6tqFC1DIOphJkCwrjVzPmg,hip-hop
3,https://www.last.fm/music/m.i.a./_/bamboo%2bbanga,Bamboo Banga,M.I.A.,sexy,13,6.555071,5.537214,5.691357,99dd2c8c-e7c1-413e-8ea4-4497a00ffa18,6tqFC1DIOphJkCwrjVzPmg,hip-hop
3,https://www.last.fm/music/m.i.a./_/bamboo%2bbanga,Bamboo Banga,M.I.A.,energetic,13,6.555071,5.537214,5.691357,99dd2c8c-e7c1-413e-8ea4-4497a00ffa18,6tqFC1DIOphJkCwrjVzPmg,hip-hop
4,https://www.last.fm/music/dope/_/die%2bmf%2bdie,Die MF Die,Dope,aggressive,7,3.771176,5.348235,5.441765,b9eb3484-5e0e-4690-ab5a-ca91937032a5,5bU4KX47KqtDKKaLM4QCzh,metal
5,https://www.last.fm/music/drowning%2bpool/_/st...,Step Up,Drowning Pool,aggressive,9,2.971389,5.5375,4.726389,49e7b4d2-3772-4301-ba25-3cc46ceb342e,4Q1w4Ryyi8KNxxaFlOQClK,metal
6,https://www.last.fm/music/kanye%2bwest/_/feedback,Feedback,Kanye West,aggressive,1,3.08,5.87,5.49,,49fT6owWuknekShh9utsjv,hip-hop


In [8]:
sentiment_balance = df_expanded['seeds'].value_counts()
sentiment_balance

seeds
aggressive    1000
lush          1000
angry         1000
nocturnal     1000
gloomy        1000
              ... 
agreeable        2
sprightly        2
opulent          1
virile           1
hymn-like        1
Name: count, Length: 276, dtype: int64

In [9]:
# remove the songs with no sentiment
df_expanded = df_expanded[df_expanded['seeds'] != '']
df_expanded['seeds'].value_counts()

# remove songs with fewer than 750 occurances of that sentiment
df_expanded = df_expanded.groupby('seeds').filter(lambda x: len(x) > 750)
df_expanded['seeds'].value_counts()

# sample 100 songs from each remaining sentiment
sampled = df_expanded.groupby('seeds').apply(lambda x: x.sample(n=100)).reset_index(drop=True)
sampled['seeds'].value_counts()

seeds
aggressive    100
sensual       100
sardonic      100
sarcastic     100
sad           100
             ... 
gloomy        100
gentle        100
fun           100
fierce        100
witty         100
Name: count, Length: 99, dtype: int64

In [10]:
sampled
# randomize the order of the songs
sampled = sampled.sample(frac=1).reset_index(drop=True)
sampled


Unnamed: 0,lastfm_url,track,artist,seeds,number_of_emotion_tags,valence_tags,arousal_tags,dominance_tags,mbid,spotify_id,genre
0,https://www.last.fm/music/nina%2bnastasia/_/ou...,Outlaster,Nina Nastasia,wistful,2,3.710000,1.585000,3.475000,282c7787-9063-442a-a798-9434290c38d2,,
1,https://www.last.fm/music/eels/_/fashion%2bawards,Fashion Awards,Eels,bittersweet,5,4.331667,3.300000,4.803333,abc28701-02e9-4b36-a3a1-8996337ecb68,7BXJuouNlS0JXQllk41MbO,indie
2,https://www.last.fm/music/interpol/_/hands%2baway,Hands Away,Interpol,melancholy,20,4.945268,3.897143,5.041161,1fe3ee2e-7589-47ac-86a8-610177fdf9a3,0QBVrYh1Gt9GCLCuCSRZqW,indie
3,https://www.last.fm/music/lights%2bset%2bnorth...,Trapped In Walls,Lights Set North,delicate,24,5.857000,3.389500,5.421000,,,electronic
4,https://www.last.fm/music/stereolab/_/percolator,Percolator,Stereolab,happy,8,5.091818,3.478182,4.414545,04e426d4-8310-44bb-a0de-2da84fe18292,7mHp0WR9fqa5zC52QvRU0q,french
...,...,...,...,...,...,...,...,...,...,...,...
9895,https://www.last.fm/music/frank%2bzappa/_/loui...,Louie Louie,Frank Zappa,sarcastic,3,2.513333,3.323333,3.433333,95f05ca7-7a7c-4c55-88ae-36a5af01cde3,65KDQ3Ge2OlIrRhMCQA9cK,jazz
9896,https://www.last.fm/music/iron%2b%2526%2bwine/...,Southern Anthem,Iron & Wine,calm,11,6.430331,2.890000,6.391736,f0eb9c57-d271-4a5b-ad71-19f5ca3a75e9,4J1kxUdGpOorYX3NDRFGWy,folk
9897,https://www.last.fm/music/james%2bblunt/_/billy,Billy,James Blunt,sad,14,5.405000,3.845833,5.682083,e540ba85-bb2e-4c99-b60c-4043c9efee8a,6G8WUgzLob8uKBfV80s3Ns,pop
9898,https://www.last.fm/music/parijat/_/hearts%2ba...,Hearts Awakening,Parijat,meditative,3,5.278596,2.979298,4.496140,,0yyMVgem0vtPt5YwhjIM1x,ambient


In [3]:
load_dotenv()
GENIUS_API_KEY = os.getenv('GENIUS_API_KEY')

genius = lyricsgenius.Genius(
    access_token=GENIUS_API_KEY,
    timeout=15,
    sleep_time=1,
    verbose=False,
    remove_section_headers=True,
    skip_non_songs=True,
    excluded_terms=["(Remix)", "(Live)", "You might also like", "Embed"],
    retries=3
)

In [11]:
def tidy_lyrics(lyrics):
    if lyrics.find('Lyrics') != -1:
        lyrics = lyrics[(lyrics.find('Lyrics')+len('Lyrics')):]

    if lyrics[-5:] == 'Embed':
        lyrics = lyrics[:-5]

    lyrics = lyrics.replace('You might also like', '')
    lyrics = lyrics.replace(';', '')
    

    while lyrics[-1].isdigit():
        lyrics = lyrics[:-1]
        
    return lyrics     


def get_lyrics(track, artist):
    try:
        song = genius.search_song(track, artist, get_full_info=False)
    
        if song is None:
            lyrics = None
        else:
            lyrics = tidy_lyrics(song.lyrics)
    except:
        lyrics = None
    
    return lyrics

  

In [12]:
i = 1
track = sampled['track'][i]
artist = sampled['artist'][i]

lyrics = get_lyrics(track, artist)

print(lyrics)

Let's go down to the fashion show
With all the pretty people that you don't know
We'll sit down in the velvet chairs
They'll hand awards out for Best Hair
And if we don't win one, well, then
We'll blow off our heads in despair
We'll blow off our heads in despair
I smell magic in the room
Flashing lights and sonic booms
Lovely saps all without a care
Nobody said that the world was fair
And if they did say so, well, then
We'll blow off our heads in despair
We'll blow off our heads in despair
Let's go down to the fashion show
With all the pretty people and piles of blow
We'll sit down in the velvet chairs
And hang on tight to our bus fare
And if it falls between the seats
We'll blow off our heads in despair
We'll blow off our heads in despair


In [None]:
tqdm_notebook.pandas()

caffeine.on(display=False)

N = 10

for i in tqdm(range(0, len(sampled), N)):
    if os.path.exists(f'lyrics_data/lyrics_{i}.csv'):
        continue
    
    try:
        chunk = sampled.iloc[i:i+N].copy()
        chunk['lyrics'] = chunk.progress_apply(lambda x: get_lyrics(x['track'], x['artist']), axis=1)
        chunk.to_csv(f'lyrics_data/lyrics_{i}.csv', index=False, sep=';')
    except Exception as e:
        time.sleep(60)
        continue
caffeine.off()

In [13]:
# combine the chunks into a single dataframe
lyrics_dfs = []
for file in os.listdir('lyrics_data/'):
    df = pd.read_csv(f'lyrics_data/{file}', sep=';')
    lyrics_dfs.append(df)
    

lyrics_df = pd.concat(lyrics_dfs)

lyrics_df.to_csv('lyrics_data.csv', index=False, sep=';')

In [14]:
lyrics_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9900 entries, 0 to 9
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   lastfm_url              9900 non-null   object 
 1   track                   9900 non-null   object 
 2   artist                  9900 non-null   object 
 3   seeds                   9900 non-null   object 
 4   number_of_emotion_tags  9900 non-null   int64  
 5   valence_tags            9900 non-null   float64
 6   arousal_tags            9900 non-null   float64
 7   dominance_tags          9900 non-null   float64
 8   mbid                    7084 non-null   object 
 9   spotify_id              6972 non-null   object 
 10  genre                   9431 non-null   object 
 11  lyrics                  8109 non-null   object 
dtypes: float64(3), int64(1), object(8)
memory usage: 1005.5+ KB


In [15]:
lyrics_df = lyrics_df[lyrics_df['lyrics'].notnull()]
lyrics_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8109 entries, 0 to 9
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   lastfm_url              8109 non-null   object 
 1   track                   8109 non-null   object 
 2   artist                  8109 non-null   object 
 3   seeds                   8109 non-null   object 
 4   number_of_emotion_tags  8109 non-null   int64  
 5   valence_tags            8109 non-null   float64
 6   arousal_tags            8109 non-null   float64
 7   dominance_tags          8109 non-null   float64
 8   mbid                    6113 non-null   object 
 9   spotify_id              6123 non-null   object 
 10  genre                   7754 non-null   object 
 11  lyrics                  8109 non-null   object 
dtypes: float64(3), int64(1), object(8)
memory usage: 823.6+ KB


In [16]:
lyrics_df['seeds'].value_counts()

seeds
sexy           100
angry           99
happy           99
sad             99
bittersweet     99
              ... 
martial         60
negative        60
harsh           59
exotic          58
technical       56
Name: count, Length: 99, dtype: int64

In [28]:
lyrics_df = pd.read_csv('lyrics_data.csv', sep=';')
lyrics_df = lyrics_df[lyrics_df['lyrics'].notnull()]
lyrics_df = lyrics_df[lyrics_df['lyrics'].apply(lambda x: len(x) < 1500)]


lyrics_df.info()



<class 'pandas.core.frame.DataFrame'>
Index: 4643 entries, 1 to 9898
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   lastfm_url              4643 non-null   object 
 1   track                   4643 non-null   object 
 2   artist                  4643 non-null   object 
 3   seeds                   4643 non-null   object 
 4   number_of_emotion_tags  4643 non-null   int64  
 5   valence_tags            4643 non-null   float64
 6   arousal_tags            4643 non-null   float64
 7   dominance_tags          4643 non-null   float64
 8   mbid                    4028 non-null   object 
 9   spotify_id              3961 non-null   object 
 10  genre                   4521 non-null   object 
 11  lyrics                  4643 non-null   object 
dtypes: float64(3), int64(1), object(8)
memory usage: 471.6+ KB


In [30]:
lyrics_df['seeds'].unique()

array(['martial', 'uplifting', 'warm', 'nostalgic', 'reflective',
       'intimate', 'bittersweet', 'dark', 'wistful', 'exciting', 'gloomy',
       'melancholy', 'energetic', 'cheerful', 'sensual', 'quirky',
       'positive', 'fun', 'lazy', 'cynical', 'quiet', 'angry', 'sad',
       'sexual', 'thoughtful', 'trippy', 'calm', 'mellow', 'happy',
       'bright', 'soft', 'passionate', 'tender', 'aggressive', 'poignant',
       'dreamy', 'elegant', 'theatrical', 'hypnotic', 'serious', 'cold',
       'fierce', 'ethereal', 'romantic', 'light', 'smooth', 'negative',
       'nocturnal', 'ominous', 'scary', 'intense', 'technical', 'eerie',
       'sweet', 'sentimental', 'sleazy', 'mysterious', 'relaxed',
       'gritty', 'sardonic', 'ironic', 'epic', 'lively', 'mystical',
       'dramatic', 'erotic', 'playful', 'soothing', 'introspective',
       'driving', 'stylish', 'powerful', 'lush', 'sexy', 'whimsical',
       'witty', 'silly', 'brooding', 'sacred', 'delicate', 'peaceful',
       'sarcasti

In [51]:
# remove the songs that have non-english lyrics
x = lyrics_df['lyrics'][100]


# print(x)

# print(lyrics_df['seeds'][100])

# print(lyrics_df['track'][100])

# print(lyrics_df['artist'][100])

# detect(x)

for row in tqdm(lyrics_df.itertuples()):
    try:
        if detect(row.lyrics) != 'en':
            lyrics_df = lyrics_df.drop(row.Index)
    except:
        lyrics_df = lyrics_df.drop(row.Index)


lyrics_df.info()

4224it [00:06, 685.66it/s]

<class 'pandas.core.frame.DataFrame'>
Index: 4221 entries, 8 to 9898
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   lastfm_url              4221 non-null   object 
 1   track                   4221 non-null   object 
 2   artist                  4221 non-null   object 
 3   seeds                   4221 non-null   object 
 4   number_of_emotion_tags  4221 non-null   int64  
 5   valence_tags            4221 non-null   float64
 6   arousal_tags            4221 non-null   float64
 7   dominance_tags          4221 non-null   float64
 8   mbid                    3698 non-null   object 
 9   spotify_id              3679 non-null   object 
 10  genre                   4130 non-null   object 
 11  lyrics                  4221 non-null   object 
dtypes: float64(3), int64(1), object(8)
memory usage: 428.7+ KB





In [56]:

lyrics_df = lyrics_df[lyrics_df['lyrics'].apply(lambda x: len(x) > 200)]
lyrics_df = lyrics_df[lyrics_df['lyrics'].apply(lambda x: '$' not in x)]

lyrics_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3156 entries, 8 to 9898
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   lastfm_url              3156 non-null   object 
 1   track                   3156 non-null   object 
 2   artist                  3156 non-null   object 
 3   seeds                   3156 non-null   object 
 4   number_of_emotion_tags  3156 non-null   int64  
 5   valence_tags            3156 non-null   float64
 6   arousal_tags            3156 non-null   float64
 7   dominance_tags          3156 non-null   float64
 8   mbid                    2749 non-null   object 
 9   spotify_id              2717 non-null   object 
 10  genre                   3077 non-null   object 
 11  lyrics                  3156 non-null   object 
dtypes: float64(3), int64(1), object(8)
memory usage: 320.5+ KB


In [59]:

test_songs = lyrics_df.sample(10)

for row in test_songs.itertuples():
    print(row.track)
    print(row.artist)
    print(row.lyrics)
    print(row.seeds)
    print('---')

Through These Eyes
Lynch Mob
Music: Lynch
Morning comes too soon
Another day unfolds and the rains are cold
That fall on you
I wanna tell you
But I can't seem to get it
Can't seem to get it right
Oh, it's killing me
I realize that it cuts me deep inside
When I can feel in my soul
This empty whole you took from me
Cause on and on
You keep tellin' me the things that make you smile
If I could hold you somehow
And show you now
That the tears in vain will never ease the pain
Oh, I, I can only see
As far as you
You're never to far away
Oh my eyes can only see
What I feel for you
I could fly the heavens
Although the distance may be far
I'm just one dream away
And the tears will say
That there's no turning back
Quiet in my thoughts
Safe within myself
I reach out to a time
When you laid by my side
And oh how it felt so right
Don't you even worry now
Through these eyes
sleazy
---
Therma
Ataraxia

The dreams we had are over now
They're all reality
She cannot hurt us anymore
There’s no more disbel

In [60]:
lyrics_df.to_csv('lyrics_data_cleaned.csv', index=False, sep=';')