# Lyrics Analysis

In [3]:
lyrics_directory = "Lyrics_Data/"
tracks_csv = "tracks.csv"
stopwords_file = "stopwords.txt"

## Import data

In [4]:
import pandas as pd

In [5]:
df = pd.read_csv(tracks_csv)

df.head(3)

Unnamed: 0,track_name,track_id,grouping
0,Party In The U.S.A.,5Q0Nhxo0l2bP3pNjpGJwV1,billboard_2010s
1,Magic (feat. Rivers Cuomo),5uHYcK0nbEYgRaFTY5BqnP,billboard_2010s
2,Misery,6KBYk8OFtod7brGuZ3Y67q,billboard_2010s


In [6]:
df["grouping"].unique()

array(['billboard_2010s', 'japan', 'kpop'], dtype=object)

In [7]:
def get_lyrics(track_id):
    try:
        lyrics = ""
        with open(lyrics_directory + track_id + ".txt", "r") as f:
            lyrics = f.read()
            
        return lyrics            
    except:
        return None
    
df["lyrics"] = df["track_id"].apply(get_lyrics)

df.shape

(686, 4)

In [8]:
# stop words
# https://www.kaggle.com/datasets/rowhitswami/stopwords/
stopwords = set()
with open(stopwords_file, "r") as f:
    stopwords = set(f.read().split("\n"))
    
stopwords

{',',
 '>',
 '?',
 'Just',
 'a',
 'abaft',
 'abafter',
 'abaftest',
 'about',
 'abouter',
 'aboutest',
 'above',
 'abover',
 'abovest',
 'accordingly',
 'aer',
 'aest',
 'afore',
 'after',
 'afterer',
 'afterest',
 'afterward',
 'afterwards',
 'again',
 'against',
 'aid',
 'ain',
 'albeit',
 'all',
 'aller',
 'allest',
 'alls',
 'allyou',
 'almost',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'andor',
 'anear',
 'anent',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anywhere',
 'apart',
 'aparter',
 'apartest',
 'appear',
 'appeared',
 'appearing',
 'appears',
 'appropriate',
 'appropriated',
 'appropriater',
 'appropriates',
 'appropriatest',
 'appropriating',
 'are',
 'ares',
 'around',
 'as',
 'ases',
 'aside',
 'asides',
 'aslant',
 'astraddle',
 'astraddler',
 'astraddlest',
 'astride',
 'astrider',
 'astridest',
 'at',
 'athwart',
 'atop',
 'atween',
 'aught',
 'aughts',
 'av

## Analyze

In [9]:
def is_stopword(word):
    word = word.lower()
    
    if len(word) <= 1:
        return True
    
    # in the list
    if word in stopwords:
        return True
    
    # is Korean
    if ord("가") <= ord(word[0]) <= ord("힣"):
        return True
    
    return False

### Characteristic Keyword
Using TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

ModuleNotFoundError: No module named 'sklearn'

In [None]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [None]:
threshold = 0.1
counter = {}
summer = {}

for cnt in range(len(df)):
    if df["grouping"][cnt] != "kpop":
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + tfidf_series[keyword]

In [None]:
def sort_dictionary(dictionary):
    return [(k, dictionary[k]) for k in sorted(dictionary, key=dictionary.get, reverse=True)]

In [None]:
sort_dictionary(counter)

[('like', 20),
 ('love', 19),
 ('yes', 16),
 ('sorry', 15),
 ('don', 15),
 ('한국', 14),
 ('come', 14),
 ('right', 13),
 ('world', 13),
 ('hey', 13),
 ('let', 13),
 ('time', 13),
 ('baby', 13),
 ('한국어', 12),
 ('ll', 11),
 ('see', 11),
 ('room', 11),
 ('look', 10),
 ('here', 10),
 ('day', 10),
 ('stanza', 10),
 ('god', 9),
 ('night', 9),
 ('feel', 9),
 ('take', 9),
 ('down', 9),
 ('child', 9),
 ('woo', 8),
 ('am', 8),
 ('need', 8),
 ('news', 8),
 ('eyes', 8),
 ('give', 8),
 ('call', 8),
 ('ah', 8),
 ('honey', 7),
 ('ride', 7),
 ('girls', 7),
 ('way', 7),
 ('back', 7),
 ('okay', 7),
 ('keep', 7),
 ('stupid', 7),
 ('leave', 7),
 ('say', 7),
 ('new', 7),
 ('fire', 7),
 ('move', 7),
 ('di', 7),
 ('il', 7),
 ('을和', 6),
 ('life', 6),
 ('wait', 6),
 ('care', 6),
 ('feeling', 6),
 ('tonight', 6),
 ('bring', 6),
 ('ready', 6),
 ('coming', 6),
 ('agriculture', 6),
 ('share', 6),
 ('heart', 6),
 ('star', 6),
 ('dream', 6),
 ('price', 6),
 ('high', 6),
 ('bad', 5),
 ('ooh', 5),
 ('mind', 5),
 ('mine'

In [None]:
sort_dictionary(summer)

[('like', 6.192201935586422),
 ('한국', 5.523687546365437),
 ('love', 4.628227132549016),
 ('god', 3.3699290052752375),
 ('come', 3.344006367366077),
 ('right', 3.3116461214436814),
 ('yes', 3.238239598117313),
 ('let', 2.9654794379463842),
 ('time', 2.8378209089480126),
 ('woo', 2.8131146754658793),
 ('sorry', 2.7984546683099984),
 ('world', 2.794715916884609),
 ('girls', 2.782723363092857),
 ('night', 2.7667549248794168),
 ('child', 2.7110735871803233),
 ('hey', 2.6498030610129084),
 ('pop', 2.6106587280834335),
 ('한국어', 2.5503253223263944),
 ('baby', 2.4666877214357257),
 ('bang', 2.4024520960863422),
 ('share', 2.366525801350969),
 ('look', 2.3062279603038847),
 ('see', 2.297194021103997),
 ('room', 2.293649739529002),
 ('new', 2.283919011839281),
 ('di', 2.2675126414852618),
 ('day', 2.2207628046799552),
 ('am', 2.2112410145740165),
 ('don', 2.2090947429468017),
 ('il', 2.207756962966828),
 ('here', 2.151917618049032),
 ('coming', 2.1102833866145994),
 ('news', 1.9407762590164892),


### Topic analysis

In [None]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [None]:
def pre_process(df_lyrics):
    # tokenize lyrics
    lyrics_processed = df_lyrics.fillna("").str.split()

    for _list in lyrics_processed:
        for cnt in range(len(_list)):
            _list[cnt] = _list[cnt].replace(".", "").replace(",", "").replace("?", "").replace("\\", "").replace("/", "").replace(":", "").lower()
            
            if is_stopword(_list[cnt]):
                _list[cnt] = ""
                
    # remove empty string
    for cnt in range(len(lyrics_processed)):
        while(True):
            try:
                lyrics_processed[cnt].remove("")
            except:
                break

    return lyrics_processed

In [None]:
#train model
lyrics_processed = pre_process(df["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=50)

all topics

In [None]:
# observe topics
topics = lda.print_topics()

#sort
topics = sorted(topics, key=lambda x: x[0])

topics

[(1,
  '0.013*"like" + 0.011*"here" + 0.011*"say" + 0.010*"song" + 0.010*"home" + 0.009*"heart" + 0.009*"love" + 0.009*"beautiful" + 0.008*"star" + 0.006*"life"'),
 (4,
  '0.034*"baby" + 0.019*"like" + 0.017*"dance" + 0.016*"check" + 0.010*"yes" + 0.009*"home" + 0.009*"love" + 0.008*"can\'t" + 0.008*"day" + 0.007*"back"'),
 (6,
  '0.032*"eyes" + 0.017*"world" + 0.010*"heart" + 0.010*"night" + 0.009*"like" + 0.008*"attention" + 0.008*"mask" + 0.007*"look" + 0.007*"percocets" + 0.007*"chase"'),
 (7,
  '0.093*"child" + 0.074*"invalid" + 0.053*"night" + 0.031*"awake" + 0.030*"we\'re" + 0.018*"luck" + 0.012*"bubble" + 0.011*"like" + 0.009*"love" + 0.009*"beep"'),
 (8,
  '0.023*"break" + 0.020*"come" + 0.016*"like" + 0.016*"take" + 0.016*"love" + 0.010*"shake" + 0.010*"heart" + 0.010*"baddie" + 0.009*"call" + 0.007*"can\'t"'),
 (9,
  '0.051*"love" + 0.033*"like" + 0.023*"boom" + 0.013*"let" + 0.010*"think" + 0.009*"honey" + 0.008*"am" + 0.007*"still" + 0.007*"need" + 0.006*"time"'),
 (13,
  

what topics are in kpop?

In [None]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df)):
    if df["grouping"][cnt] != "kpop":
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + topic[1]

In [None]:
sort_dictionary(counter)

[(18, 8),
 (48, 6),
 (27, 5),
 (25, 5),
 (3, 5),
 (12, 5),
 (20, 5),
 (39, 5),
 (34, 4),
 (41, 4),
 (8, 4),
 (16, 4),
 (4, 4),
 (33, 4),
 (7, 4),
 (42, 4),
 (21, 4),
 (13, 4),
 (45, 4),
 (37, 4),
 (26, 4),
 (11, 3),
 (2, 3),
 (43, 3),
 (38, 3),
 (31, 3),
 (23, 3),
 (44, 3),
 (29, 3),
 (49, 2),
 (6, 2),
 (17, 2),
 (32, 2),
 (28, 2),
 (40, 2),
 (30, 2),
 (9, 2),
 (15, 2),
 (36, 2),
 (35, 1),
 (24, 1),
 (22, 1),
 (14, 1),
 (0, 1),
 (46, 1)]

In [None]:
sums = sort_dictionary(summer)

sums

[(18, 6.511035505864129),
 (25, 5.829874104267219),
 (12, 5.736170761156245),
 (48, 5.65344371345418),
 (20, 5.408308543046587),
 (21, 4.847438654069265),
 (39, 4.680760434595868),
 (7, 4.577855028124759),
 (42, 4.570883950131247),
 (16, 4.522276637566392),
 (13, 4.51497782181832),
 (27, 4.513692242966499),
 (3, 4.485684544008109),
 (45, 4.38620277075097),
 (26, 4.376367206510622),
 (37, 3.975818828883348),
 (4, 3.8699636624078266),
 (8, 3.845819205555017),
 (11, 3.669206549115188),
 (41, 3.603105908499856),
 (43, 3.4876521295082057),
 (31, 3.337340612197295),
 (38, 3.086904126801528),
 (2, 3.0855243993137265),
 (33, 3.059902722830884),
 (44, 2.922754667015397),
 (29, 2.9011504183945362),
 (23, 2.827096342894947),
 (34, 2.8028740143417963),
 (15, 2.793483726476552),
 (36, 2.637244693847606),
 (17, 2.6233591335330857),
 (30, 2.61391725546855),
 (9, 2.4470920100284275),
 (32, 2.3828218028647825),
 (49, 2.3168033124820795),
 (40, 2.254464600158826),
 (28, 2.0222945621389954),
 (6, 1.94586

In [None]:
for _sum in sums[:5]:
    print("Topic ID: " + str(_sum[0]))
    print("Sum: " + str(_sum[1]))
    print("Count: " + str(counter[_sum[0]]))
    
    print("Keywords: ")
    print(lda.print_topic(int(_sum[0])))
    
    print("\n")

Topic ID: 18
Sum: 6.511035505864129
Count: 8
Keywords: 
0.038*"baby" + 0.016*"like" + 0.014*"time" + 0.014*"worse" + 0.013*"look" + 0.010*"take" + 0.009*"home" + 0.008*"world" + 0.008*"love" + 0.008*"made"


Topic ID: 25
Sum: 5.829874104267219
Count: 5
Keywords: 
0.086*"dance" + 0.017*"open" + 0.013*"like" + 0.012*"[bleep]" + 0.010*"night" + 0.009*"giddy" + 0.009*"day" + 0.007*"[bleep" + 0.007*"hold" + 0.007*"zoom"


Topic ID: 12
Sum: 5.736170761156245
Count: 5
Keywords: 
0.042*"low" + 0.017*"time" + 0.016*"day" + 0.015*"name" + 0.011*"like" + 0.011*"spicy" + 0.010*"come" + 0.010*"can't" + 0.009*"right" + 0.007*"see"


Topic ID: 48
Sum: 5.65344371345418
Count: 6
Keywords: 
0.064*"side" + 0.042*"open" + 0.012*"giddy" + 0.011*"like" + 0.008*"love" + 0.008*"see" + 0.008*"sugar" + 0.007*"hate" + 0.007*"give" + 0.007*"day"


Topic ID: 20
Sum: 5.408308543046587
Count: 5
Keywords: 
0.017*"love" + 0.015*"yes" + 0.010*"move" + 0.010*"say" + 0.010*"only" + 0.010*"like" + 0.009*"lips" + 0.009*"th