# Lyrics Analysis

In [53]:
lyrics_directory = "Lyrics_Data/"
tracks_csv = "tracks.csv"
stopwords_file = "stopwords.txt"

## Import data

In [54]:
import pandas as pd

In [55]:
df = pd.read_csv(tracks_csv)

df.head(3)

Unnamed: 0,track_name,track_id,grouping
0,Ice Cream (with Selena Gomez),2J4P46vCFm1rPkNkp9pZWX,kpop
1,Dynamite,0v1x6rN6JHRapa03JElljE,kpop
2,THE BADDEST,2V4Fx72svQRxrFvNT1eq5f,kpop


In [56]:
df.shape

(3811, 3)

In [57]:
df["grouping"].unique()

array(['kpop', 'global', 'billboard'], dtype=object)

In [58]:
def get_lyrics(track_id):
    try:
        lyrics = ""
        with open(lyrics_directory + track_id + ".txt", "r") as f:
            lyrics = f.read()
            
        return lyrics            
    except:
        return None
    
df["lyrics"] = df["track_id"].apply(get_lyrics)

df.shape

(3811, 4)

In [59]:
# stop words
# https://www.kaggle.com/datasets/rowhitswami/stopwords/
stopwords = set()
with open(stopwords_file, "r") as f:
    stopwords = set(f.read().split("\n"))
    
stopwords

{',',
 '>',
 '?',
 'Just',
 'Yes',
 'a',
 'abaft',
 'abafter',
 'abaftest',
 'about',
 'abouter',
 'aboutest',
 'above',
 'abover',
 'abovest',
 'accordingly',
 'aer',
 'aest',
 'afore',
 'after',
 'afterer',
 'afterest',
 'afterward',
 'afterwards',
 'again',
 'against',
 'aid',
 'ain',
 'albeit',
 'all',
 'aller',
 'allest',
 'alls',
 'allyou',
 'almost',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'andor',
 'anear',
 'anent',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anywhere',
 'apart',
 'aparter',
 'apartest',
 'appear',
 'appeared',
 'appearing',
 'appears',
 'appropriate',
 'appropriated',
 'appropriater',
 'appropriates',
 'appropriatest',
 'appropriating',
 'are',
 'ares',
 'around',
 'as',
 'ases',
 'aside',
 'asides',
 'aslant',
 'astraddle',
 'astraddler',
 'astraddlest',
 'astride',
 'astrider',
 'astridest',
 'at',
 'athwart',
 'atop',
 'atween',
 'aught',
 'aught

## Analyze

In [60]:
def is_stopword(word):
    word = word.lower()
    
    if len(word) <= 1:
        return True
    
    # in the list
    if word in stopwords:
        return True
    
    # is Korean
    if ord("가") <= ord(word[0]) <= ord("힣"):
        return True
    
    return False

### Characteristic Keyword
Using TF-IDF

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [62]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [63]:
threshold = 0.1
counter = {}
summer = {}

for cnt in range(len(df)):
    if df["grouping"][cnt] != "kpop":
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + tfidf_series[keyword]

In [64]:
def sort_dictionary(dictionary):
    return [(k, dictionary[k]) for k in sorted(dictionary, key=dictionary.get, reverse=True)]

In [65]:
sort_dictionary(counter)

[('love', 151),
 ('sorry', 136),
 ('news', 109),
 ('let', 91),
 ('yes', 80),
 ('home', 76),
 ('heart', 73),
 ('follow', 64),
 ('price_varies', 61),
 ('eyes', 60),
 ('feel', 57),
 ('night', 56),
 ('baby', 51),
 ('한국', 51),
 ('dream', 49),
 ('new', 47),
 ('light', 46),
 ('say', 44),
 ('room', 43),
 ('take', 43),
 ('back', 42),
 ('world', 41),
 ('am', 41),
 ('ah', 41),
 ('comment', 39),
 ('woo', 39),
 ('dance', 38),
 ('boy', 38),
 ('end', 37),
 ('leave', 37),
 ('한국어', 37),
 ('only', 37),
 ('way', 35),
 ('tell', 35),
 ('wait', 35),
 ('give', 35),
 ('life', 34),
 ('hot', 34),
 ('agriculture', 34),
 ('need', 34),
 ('을和', 33),
 ('beautiful', 33),
 ('informazioni', 33),
 ('share', 32),
 ('find', 32),
 ('bad', 31),
 ('fire', 30),
 ('think', 30),
 ('today', 30),
 ('mind', 30),
 ('cancel', 30),
 ('hands', 30),
 ('sweet', 29),
 ('hear', 29),
 ('teen', 29),
 ('girl', 29),
 ('child', 28),
 ('di', 28),
 ('looking', 28),
 ('touch', 28),
 ('price', 28),
 ('blue', 28),
 ('sky', 27),
 ('god', 27),
 ('run

In [66]:
sort_dictionary(summer)

[('love', 48.55836047684298),
 ('sorry', 34.69424650510435),
 ('home', 30.28655297176334),
 ('한국', 22.777027998347513),
 ('news', 22.043043875590087),
 ('yes', 18.829226822492913),
 ('heart', 18.620344466903866),
 ('let', 18.24896449640406),
 ('follow', 16.059904232588927),
 ('night', 15.43230471889167),
 ('woo', 14.590180749286153),
 ('price_varies', 13.374253222129877),
 ('ah', 12.953874422087333),
 ('feel', 12.661176056653932),
 ('dance', 12.493142718143003),
 ('hot', 12.36988765332626),
 ('boy', 12.357121492770778),
 ('eyes', 12.288759128524024),
 ('light', 11.830627360202842),
 ('dream', 11.722361541229729),
 ('baby', 10.664616374710748),
 ('new', 10.450066147578717),
 ('am', 9.7854644596937),
 ('bad', 9.712276351863487),
 ('room', 8.96983337258604),
 ('world', 8.919269765327815),
 ('say', 8.084704676142914),
 ('agriculture', 7.968343318560492),
 ('한국어', 7.881441247623746),
 ('back', 7.837137995794142),
 ('을和', 7.755186517134875),
 ('need', 7.728319155476253),
 ('fire', 7.64511570

### Topic analysis

In [67]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [68]:
def pre_process(df_lyrics):
    # tokenize lyrics
    lyrics_processed = df_lyrics.fillna("").str.split()

    for _list in lyrics_processed:
        for cnt in range(len(_list)):
            _list[cnt] = _list[cnt].replace(".", "").replace(",", "").replace("?", "").replace("\\", "").replace("/", "").replace(":", "").lower()
            
            if is_stopword(_list[cnt]):
                _list[cnt] = ""
                
    # remove empty string
    for cnt in range(len(lyrics_processed)):
        while(True):
            try:
                lyrics_processed[cnt].remove("")
            except:
                break

    return lyrics_processed

In [75]:
#train model
lyrics_processed = pre_process(df["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

all topics

In [76]:
# observe topics
topics = lda.print_topics()

#sort
topics = sorted(topics, key=lambda x: x[0])

topics

[(0,
  '0.031*"count" + 0.030*"della" + 0.023*"hurt" + 0.015*"love" + 0.012*"stars" + 0.011*"oh)" + 0.010*"nigger" + 0.010*"closer" + 0.008*"night" + 0.008*"moved"'),
 (10,
  '0.053*"dream" + 0.023*"christmas" + 0.022*"labour" + 0.019*"family" + 0.015*"deceive" + 0.013*"toy" + 0.012*"flashing" + 0.012*"mascara" + 0.010*"bambi" + 0.008*"true"'),
 (13,
  '0.037*"run" + 0.033*"summer" + 0.029*"hero" + 0.024*"bubble" + 0.017*"bring" + 0.014*"new" + 0.013*"attack" + 0.013*"york" + 0.012*"saying" + 0.010*"take"'),
 (26,
  '0.033*"say" + 0.020*"can\'t" + 0.015*"lonely" + 0.015*"love" + 0.014*"baby" + 0.014*"maybe" + 0.013*"think" + 0.011*"solosta" + 0.011*"pipe" + 0.008*"pedal"'),
 (41,
  '0.039*"home" + 0.024*"lights" + 0.020*"house" + 0.020*"worse" + 0.018*"love" + 0.011*"mediocre" + 0.011*"take" + 0.010*"body" + 0.008*"wild" + 0.007*"say"'),
 (46,
  '0.029*"bin" + 0.026*"pop" + 0.016*"fly" + 0.016*"give" + 0.016*"butterfly" + 0.016*"life" + 0.015*"love" + 0.014*"infinite" + 0.012*"crying" 

what topics are in kpop?

In [77]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df)):
    if df["grouping"][cnt] != "kpop":
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + topic[1]

In [78]:
sort_dictionary(counter)

[(30, 46),
 (98, 27),
 (92, 20),
 (50, 20),
 (93, 16),
 (44, 16),
 (61, 16),
 (40, 14),
 (41, 13),
 (1, 13),
 (94, 12),
 (23, 12),
 (12, 12),
 (6, 12),
 (49, 11),
 (35, 11),
 (45, 11),
 (4, 11),
 (67, 11),
 (84, 11),
 (46, 10),
 (82, 10),
 (2, 9),
 (79, 9),
 (91, 9),
 (57, 9),
 (17, 9),
 (71, 9),
 (95, 8),
 (13, 8),
 (31, 8),
 (43, 8),
 (66, 8),
 (37, 8),
 (53, 8),
 (7, 8),
 (22, 8),
 (86, 8),
 (55, 7),
 (78, 7),
 (51, 7),
 (29, 7),
 (80, 7),
 (85, 7),
 (16, 7),
 (36, 7),
 (3, 6),
 (58, 6),
 (27, 6),
 (33, 6),
 (52, 6),
 (97, 6),
 (89, 6),
 (48, 6),
 (19, 6),
 (8, 6),
 (47, 6),
 (11, 5),
 (42, 5),
 (24, 5),
 (26, 5),
 (9, 5),
 (81, 5),
 (96, 5),
 (18, 5),
 (83, 4),
 (76, 4),
 (59, 4),
 (14, 4),
 (65, 4),
 (15, 4),
 (70, 4),
 (34, 4),
 (90, 4),
 (5, 3),
 (10, 3),
 (63, 3),
 (62, 3),
 (75, 3),
 (99, 3),
 (60, 3),
 (28, 3),
 (21, 2),
 (88, 2),
 (73, 2),
 (38, 2),
 (20, 2),
 (0, 2),
 (32, 2),
 (54, 1),
 (69, 1),
 (74, 1),
 (72, 1),
 (77, 1),
 (25, 1),
 (68, 1)]

In [79]:
sums = sort_dictionary(summer)

sums

[(30, 37.79214342489831),
 (98, 25.07582538252791),
 (93, 22.731048760017075),
 (44, 21.263049757983026),
 (92, 20.702901704595206),
 (50, 20.20135823242981),
 (61, 17.648196526062748),
 (45, 16.111898041186578),
 (23, 15.445429259269076),
 (40, 15.301501325649951),
 (67, 13.846386771010657),
 (1, 13.829506861306072),
 (94, 13.614862742733749),
 (4, 12.98793389853563),
 (84, 12.96822253495884),
 (78, 12.95220438692013),
 (41, 12.875048603176765),
 (71, 12.838863268838395),
 (12, 12.64536109927758),
 (57, 12.576827913130728),
 (49, 12.40050671051904),
 (17, 11.97117998788417),
 (6, 11.886919606245101),
 (22, 11.68301675963994),
 (35, 11.187163574993974),
 (46, 10.65529372347919),
 (28, 10.60487893975187),
 (80, 10.555223475324965),
 (82, 10.44144434577629),
 (43, 10.378965555699324),
 (79, 10.333007027758867),
 (37, 10.18048843890756),
 (2, 10.037383533328466),
 (13, 9.740150335348517),
 (58, 9.52434900694243),
 (85, 9.343912500115039),
 (66, 9.278712526820527),
 (55, 9.205854565139816)

In [80]:
for _sum in sums[:5]:
    print("Topic ID: " + str(_sum[0]))
    print("Sum: " + str(_sum[1]))
    print("Count: " + str(counter[_sum[0]]))
    
    print("Keywords: ")
    print(lda.print_topic(int(_sum[0])))
    
    print("\n")

Topic ID: 30
Sum: 37.79214342489831
Count: 46
Keywords: 
0.099*"love" + 0.017*"need" + 0.013*"can't" + 0.012*"take" + 0.011*"feel" + 0.011*"tell" + 0.010*"heart" + 0.009*"let" + 0.008*"place" + 0.007*"way"


Topic ID: 98
Sum: 25.07582538252791
Count: 27
Keywords: 
0.058*"love" + 0.017*"feel" + 0.017*"hope" + 0.015*"baby" + 0.013*"girl" + 0.012*"hell" + 0.011*"gives" + 0.010*"way" + 0.008*"let" + 0.007*"take"


Topic ID: 93
Sum: 22.731048760017075
Count: 16
Keywords: 
0.043*"light" + 0.015*"wish" + 0.014*"detail" + 0.012*"news" + 0.011*"back" + 0.010*"only" + 0.010*"think" + 0.009*"cool" + 0.008*"kids" + 0.008*"makes"


Topic ID: 44
Sum: 21.263049757983026
Count: 16
Keywords: 
0.029*"let's" + 0.016*"love" + 0.015*"baby" + 0.015*"try" + 0.014*"girls" + 0.011*"can't" + 0.010*"oh!" + 0.010*"wrong" + 0.009*"feel" + 0.009*"girl"


Topic ID: 92
Sum: 20.702901704595206
Count: 20
Keywords: 
0.053*"night" + 0.023*"party" + 0.015*"heart" + 0.014*"feel" + 0.012*"light" + 0.012*"give" + 0.011*"life