## Load Dataset

In [None]:
import pickle
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm
# Load the song table from the Excel workbook; every later cell reads from / adds columns to `data`.
data = pd.read_excel("data/ver2.xlsx")
# Alternative: reload the pickle written by the feature-extraction cells further down.
# data = pickle.load(open("data/ver2_w_lyrics_features.p", "rb"))

In [61]:
# How many rows carry lyrics? (only 2,921 songs do)
data["has_lyrics"].value_counts()

has_lyrics
False    3368
True     2921
Name: count, dtype: int64

In [3]:
# Preview a few lyrics: print every 800th song that has lyrics, together with
# its positional row index.
count = 0
for i, (has_lyrics, lyrics) in enumerate(zip(data["has_lyrics"], data["lyrics"])):
    if not has_lyrics:
        continue
    count += 1
    if count % 800:
        continue
    print(i)
    print(lyrics)
    print("=" * 100)

1854
[Intro]
Did no one ever teach you how to dance?
Nobody ever taught you how to dance?
Wellâ€”well,â€…everyoneâ€…knows how toâ€…dance
There's only so much time
[Verse 1]
Yeah, somebodyâ€…died today, I
I saw his picture in the funny papers
Didn'tâŸthinkâŸanybodyâŸdied on aâŸFriday
Some angry banker,âŸsome kind of money trader
Recently divorced, was drunk drivin' down the highway
And drove off the bridge to his wedding song
Blew out the bass in his speakers, you can still hear the treble goin' (Treble goin')
The hospital was useless, and everything was quiet but the music
Recently, I only meet peace when in deep sleep
Been the same dream, world safe, smile on her face
Waitin' on the other side (The other side)
I wonder if He'll take me to the other side (The other side), yeah
What your eyes see, too naive for war, and that'll screw ya
Still bet it all on the glory, hallelujah
I heard the answer in the gibberish of an old drunk
All he said was he's in no rush
[Chorus]
If I could j

In [4]:
# lyrics cleaning
import re

# Section headers such as [Intro], [Chorus], [Verse 1]; `.` does not match a
# newline, so a header never spans several lines.
_SECTION_HEADER = re.compile(r"(\[.*?\])")
# Any run of characters outside the 7-bit ASCII range.
_NON_ASCII_RUN = re.compile(r"[^\x00-\x7F]+")


def clean_lyrics(lyrics):
    """
    Split raw lyrics into cleaned sections, dropping the section headers.

    The text is split on bracketed headers (e.g. ``[Intro]``, ``[Chorus]``,
    ``[Verse 1]``). In every remaining piece each run of non-ASCII characters
    is replaced by a single space, surrounding whitespace is trimmed, and
    pieces that end up empty are discarded.

    Args:
        lyrics: The raw lyrics of one song as a ``str``.

    Returns:
        A list of non-empty section strings, in their original order.

    Raises:
        TypeError: If ``lyrics`` is not a string (e.g. a NaN from a missing cell).

    # Example usage
    sections = clean_lyrics(sample_lyrics)
    """
    if not isinstance(lyrics, str):
        raise TypeError(
            f"lyrics must be a str, got {type(lyrics).__name__}"
        )

    # With a single capturing group, re.split alternates text / header / text ...,
    # so the even positions are exactly the pieces between headers.
    pieces = _SECTION_HEADER.split(lyrics)

    sections = []
    for piece in pieces[::2]:
        # Replace non-ASCII runs BEFORE trimming and filtering, so a section made
        # only of non-ASCII characters is dropped instead of surviving as " ",
        # and no stray spaces are left at the edges.
        text = _NON_ASCII_RUN.sub(" ", piece).strip()
        if text:
            sections.append(text)

    return sections

# sample
# for section in clean_lyrics(data.iloc[908].lyrics):
#     print(section)
#     print("-"*100)

# Clean the lyrics of every song flagged as having lyrics.
# Rows without lyrics, or whose lyrics cannot be cleaned, keep None.
lyrics_sections = [None]*len(data)
for i in range(len(data)):
    row = data.iloc[i]
    if not row.has_lyrics:
        continue
    try:
        lyrics_sections[i] = clean_lyrics(row.lyrics)
    except Exception:
        # Best effort: report the offending row and carry on. `Exception` is used
        # instead of a bare `except:` so that KeyboardInterrupt / SystemExit are
        # not swallowed and the cell can still be interrupted.
        print(i)
        print(row.lyrics)
        print(row.scraped)
        print(row.has_lyrics)
        print("="*100)
data["lyrics_sections"] = lyrics_sections
# there are some missing lyrics

764
nan
True
True
2097
nan
True
True
2606
nan
True
True
2607
nan
True
True
2919
nan
True
True
3911
nan
True
True
5007
nan
True
True
5014
nan
True
True
6176
nan
True
True
6198
nan
True
True
6199
nan
True
True
6200
nan
True
True


## Load Model

In [7]:
from transformers import pipeline
# Zero-shot classification pipeline; the sentiment, emotion and topic cells below all call `pipe`.
# NOTE(review): model="bart" is presumably a local checkpoint directory (the commented-out
# line below names facebook/bart-large-mnli) — confirm before re-running on another machine.
# pipe = pipeline(model="facebook/bart-large-mnli")
# device="cuda:0" selects the first CUDA device; there is no CPU fallback here.
pipe = pipeline("zero-shot-classification", model="bart", device="cuda:0")

Device set to use cuda:0


## Extract Sentiments
Using NLI: checks entailment of each lyrics section with each of the candidate labels.

In [9]:
# Labels the zero-shot classifier scores each lyrics section against.
candidate_labels = [
    "positive sentiment",
    "negative sentiment",
    "neutral sentiment",
]

In [11]:
# new features to extract
sentiments_raw = [None]*len(data)      # per song: one {label: score} dict per section
sentiments = [None]*len(data)          # per song: first label returned for each section
overall_sentiments = [None]*len(data)  # per song: most frequent section label

for i in tqdm(range(len(data))):
    sections = data.iloc[i].lyrics_sections
    # Skip songs without cleaned sections, and songs that have been extracted already.
    if not sections or sentiments_raw[i] is not None:
        continue

    # loop through sections
    sentiments_i = []
    sentiments_raw_i = []
    for section in sections:
        pred = pipe(section, candidate_labels)
        # pred["labels"][0] is used as the section's label
        # (presumably the highest-scoring one — confirm against the pipeline docs).
        sentiments_i.append(pred["labels"][0])
        sentiments_raw_i.append(dict(zip(pred["labels"], pred["scores"])))

    sentiments[i] = sentiments_i
    # Majority vote over the sections. Counter.most_common resolves ties by first
    # occurrence, so the result is reproducible; max(set(...)) depended on set
    # iteration order, which changes between interpreter runs.
    overall_sentiments[i] = Counter(sentiments_i).most_common(1)[0][0]
    sentiments_raw[i] = sentiments_raw_i

data["sentiments_raw"] = sentiments_raw
data["sentiments"] = sentiments
data["overall_sentiments"] = overall_sentiments

# Persist the enriched frame so the classification does not have to be repeated.
with open('data/ver2_w_lyrics_features.p', 'wb') as f:
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

  0%|                                                                              | 3/6289 [00:02<1:22:38,  1.27it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|█████████████████████████████████████████████████████████████████████████████| 6289/6289 [14:44<00:00,  7.11it/s]


## Extract Emotions
Using NLI: checks entailment of each lyrics section with each of the candidate labels.

In [78]:
# Emotion labels the zero-shot classifier scores each lyrics section against.
candidate_labels = [
    "emotion is joy",
    "emotion is anger",
    "emotion is sadness",
    "emotion is disgust",
    "emotion is fear",
    "emotion is optimism",
]

In [11]:
# # to remove once done
# emotions_raw=[eval(x) if x else x for x in data["emotions_raw"].replace(np.nan, None).tolist()]
# emotions=[eval(x) if x else x for x in data["emotions"].replace(np.nan, None).tolist()]
# overall_emotions=data["overall_emotions"].replace(np.nan, None).tolist()
# data["lyrics_sections"] = [eval(x) if x else x for x in data["lyrics_sections"].replace(np.nan, None).tolist()]

In [None]:
# new features to extract
emotions_raw = [None]*len(data)      # per song: one {label: score} dict per section
emotions = [None]*len(data)          # per song: first label returned for each section
overall_emotions = [None]*len(data)  # per song: most frequent section label

for i in tqdm(range(len(data))):
    sections = data.iloc[i].lyrics_sections
    # Skip songs without cleaned sections, and songs that have been extracted already.
    if not sections or emotions_raw[i] is not None:
        continue

    # loop through sections
    emotions_i = []
    emotions_raw_i = []
    for section in sections:
        pred = pipe(section, candidate_labels, multi_label=True)
        # pred["labels"][0] is used as the section's label
        # (presumably the highest-scoring one — confirm against the pipeline docs).
        emotions_i.append(pred["labels"][0])
        emotions_raw_i.append(dict(zip(pred["labels"], pred["scores"])))

    emotions[i] = emotions_i
    # Majority vote over the sections. Counter.most_common resolves ties by first
    # occurrence, so the result is reproducible; max(set(...)) depended on set
    # iteration order, which changes between interpreter runs.
    overall_emotions[i] = Counter(emotions_i).most_common(1)[0][0]
    emotions_raw[i] = emotions_raw_i

data["emotions_raw"] = emotions_raw
data["emotions"] = emotions
data["overall_emotions"] = overall_emotions

# Persist the enriched frame so the classification does not have to be repeated.
with open('data/ver2_w_lyrics_features.p', 'wb') as f:
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

 88%|████████████████████████████████████████████████████████████████▌        | 5559/6289 [25:07<04:42,  2.59it/s]

## Extract Topics (part 1)
Using NLI: checks entailment of each lyrics section with each of the candidate labels.

In [35]:
# Topic labels the zero-shot classifier scores each lyrics section against.
candidate_labels = [
    "song about love",
    "song about friendship",
    "song about mental health",
    "song about personal growth",
    "song about loss and grief",
    "song about social issues",
    "song about spirituality and faith",
    "song about happiness and celebration",
    "song about nature and the environment",
    "song about ambition and success",
]

In [36]:
# new features to extract
topics_raw = [None]*len(data)      # per song: one {label: score} dict per section
topics = [None]*len(data)          # per song: first label returned for each section
overall_topics = [None]*len(data)  # per song: most frequent section label

for i in tqdm(range(len(data))):
    sections = data.iloc[i].lyrics_sections
    # Skip songs without cleaned sections, and songs that have been extracted already.
    if not sections or topics_raw[i] is not None:
        continue

    # loop through sections
    topics_i = []
    topics_raw_i = []
    for section in sections:
        pred = pipe(section, candidate_labels, multi_label=True)
        # pred["labels"][0] is used as the section's label
        # (presumably the highest-scoring one — confirm against the pipeline docs).
        topics_i.append(pred["labels"][0])
        topics_raw_i.append(dict(zip(pred["labels"], pred["scores"])))

    topics[i] = topics_i
    # Majority vote over the sections. Counter.most_common resolves ties by first
    # occurrence, so the result is reproducible; max(set(...)) depended on set
    # iteration order, which changes between interpreter runs.
    overall_topics[i] = Counter(topics_i).most_common(1)[0][0]
    topics_raw[i] = topics_raw_i

data["topics_raw"] = topics_raw
data["topics"] = topics
data["overall_topics"] = overall_topics

# Persist the enriched frame so the classification does not have to be repeated.
with open('data/ver2_w_lyrics_features.p', 'wb') as f:
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

100%|█████████████████████████████████████████████████████████████████████████| 6289/6289 [50:16<00:00,  2.08it/s]


## Extract Features

In [54]:
# remove duplicated lyrics (maybe drop entry with most number of nan or those without popularity/streams/rank/daily_rank?)

# data["num_missing"] = data.isna().sum(axis=1)
# data_subset = data[~data.lyrics.isnull()]
# df_sorted = data_subset.sort_values(["lyrics", "num_missing"], ascending=[True, True])
# df_cleaned = df_sorted.drop_duplicates(subset=['lyrics'], keep='first')
# df_cleaned = df_cleaned.drop(columns=["num_missing"])

In [64]:
# initial statistics: label distribution of each song-level feature
for column in ("overall_sentiments", "overall_emotions", "overall_topics"):
    print(data[column].value_counts())

overall_sentiments
negative sentiment    2272
positive sentiment     631
neutral sentiment        6
Name: count, dtype: int64
overall_emotions
emotion is sadness     844
emotion is optimism    723
emotion is anger       599
emotion is joy         264
emotion is fear        262
emotion is disgust     217
Name: count, dtype: int64
overall_topics
song about loss and grief                798
song about love                          578
song about social issues                 492
song about ambition and success          355
song about personal growth               339
song about happiness and celebration     174
song about friendship                     68
song about spirituality and faith         59
song about mental health                  23
song about nature and the environment     23
Name: count, dtype: int64


In [65]:
# extract overall for now: drop the per-section columns before exporting to CSV
per_section_columns = [
    "lyrics_sections",
    "sentiments",
    "sentiments_raw",
    "emotions",
    "emotions_raw",
    "topics",
    "topics_raw",
]
data = data.drop(labels=per_section_columns, axis=1)
data.to_csv("data/ver2_w_lyrics_features.csv", index=False)

## Extract Topics (part 2)
Using BERTopic: leverages transformers and c-TF-IDF to create dense clusters, allowing for easily interpretable topics whilst keeping important words in the topic descriptions.