# Lyrics Analysis

In [15]:
lyrics_directory = "Lyrics_Data/"
tracks_csv = "tracks.csv"
stopwords_file = "stopwords.txt"

## Import data

In [16]:
import pandas as pd

In [17]:
df = pd.read_csv(tracks_csv)

df.head(3)

Unnamed: 0,track_name,track_id,grouping
0,Party In The U.S.A.,5Q0Nhxo0l2bP3pNjpGJwV1,billboard_2010s
1,Magic (feat. Rivers Cuomo),5uHYcK0nbEYgRaFTY5BqnP,billboard_2010s
2,Misery,6KBYk8OFtod7brGuZ3Y67q,billboard_2010s


In [18]:
df["grouping"].unique()

array(['billboard_2010s', 'japan', 'kpop'], dtype=object)

In [19]:
def get_lyrics(track_id):
    try:
        lyrics = ""
        with open(lyrics_directory + track_id + ".txt", "r") as f:
            lyrics = f.read()
            
        return lyrics            
    except:
        return None
    
df["lyrics"] = df["track_id"].apply(get_lyrics)

df.shape

(686, 4)

In [28]:
# stop words
# https://www.kaggle.com/datasets/rowhitswami/stopwords/
stopwords = set()
with open(stopwords_file, "r") as f:
    stopwords = set(f.read().split("\n"))
    
stopwords

{'a',
 'abaft',
 'abafter',
 'abaftest',
 'about',
 'abouter',
 'aboutest',
 'above',
 'abover',
 'abovest',
 'accordingly',
 'aer',
 'aest',
 'afore',
 'after',
 'afterer',
 'afterest',
 'afterward',
 'afterwards',
 'again',
 'against',
 'aid',
 'ain',
 'albeit',
 'all',
 'aller',
 'allest',
 'alls',
 'allyou',
 'almost',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'andor',
 'anear',
 'anent',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anywhere',
 'apart',
 'aparter',
 'apartest',
 'appear',
 'appeared',
 'appearing',
 'appears',
 'appropriate',
 'appropriated',
 'appropriater',
 'appropriates',
 'appropriatest',
 'appropriating',
 'are',
 'ares',
 'around',
 'as',
 'ases',
 'aside',
 'asides',
 'aslant',
 'astraddle',
 'astraddler',
 'astraddlest',
 'astride',
 'astrider',
 'astridest',
 'at',
 'athwart',
 'atop',
 'atween',
 'aught',
 'aughts',
 'available',
 'availabler',
 '

## Analyze

### Characteristic Keyword
Using TF-IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000, min_df=10)

x = vectorizer.fit_transform(df["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T

In [23]:
threshold = 0.1
counter = {}
summer = {}

for cnt in range(len(df)):
    if df["grouping"][cnt] != "kpop":
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + tfidf_series[keyword]

In [24]:
def sort_dictionary(dictionary):
    return [(k, dictionary[k]) for k in sorted(dictionary, key=dictionary.get, reverse=True)]

In [25]:
sort_dictionary(counter)

[('oh', 31),
 ('yeah', 24),
 ('know', 23),
 ('like', 20),
 ('want', 19),
 ('love', 17),
 ('la', 16),
 ('sorry', 15),
 ('yes', 15),
 ('한국', 14),
 ('right', 13),
 ('don', 12),
 ('한국어', 12),
 ('baby', 12),
 ('hey', 11),
 ('room', 11),
 ('time', 11),
 ('come', 11),
 ('good', 10),
 ('world', 10),
 ('just', 10),
 ('look', 10),
 ('let', 10),
 ('day', 10),
 ('god', 9),
 ('need', 9),
 ('ll', 9),
 ('stop', 9),
 ('stanza', 9),
 ('을和', 8),
 ('night', 8),
 ('feel', 8),
 ('stupid', 8),
 ('child', 8),
 ('woo', 7),
 ('news', 7),
 ('leave', 7),
 ('new', 7),
 ('heart', 7),
 ('di', 7),
 ('honey', 6),
 ('wait', 6),
 ('care', 6),
 ('girls', 6),
 ('tonight', 6),
 ('things', 6),
 ('bring', 6),
 ('ready', 6),
 ('coming', 6),
 ('agriculture', 6),
 ('okay', 6),
 ('say', 6),
 ('eyes', 6),
 ('price', 6),
 ('il', 6),
 ('bad', 5),
 ('ooh', 5),
 ('really', 5),
 ('feeling', 5),
 ('free', 5),
 ('line', 5),
 ('way', 5),
 ('got', 5),
 ('think', 5),
 ('laughter', 5),
 ('information', 5),
 ('play', 5),
 ('kingdom', 5),
 (

In [26]:
sort_dictionary(summer)

[('la', 9.575411421833309),
 ('oh', 8.809471581317675),
 ('like', 5.857083182716018),
 ('한국', 5.5353993843614),
 ('yeah', 5.3212423747924085),
 ('want', 4.640449856570278),
 ('know', 4.391296984255933),
 ('love', 4.29073316347268),
 ('na', 3.782885716495845),
 ('god', 3.3662812875633397),
 ('right', 3.1654110637109243),
 ('yes', 3.050779123741074),
 ('look', 2.812038307123898),
 ('come', 2.78193949738054),
 ('sorry', 2.718237540373842),
 ('baby', 2.648629148815511),
 ('child', 2.6350370260660854),
 ('pop', 2.6010438808545273),
 ('let', 2.5624837846626227),
 ('한국어', 2.518970794133598),
 ('time', 2.444759959770453),
 ('world', 2.442402454222596),
 ('da', 2.4135868081051273),
 ('night', 2.40512345940562),
 ('coming', 2.2996681922251287),
 ('hey', 2.2810976034224315),
 ('girls', 2.257374602239615),
 ('new', 2.2521130852951297),
 ('good', 2.2452752651975536),
 ('room', 2.2169317314579455),
 ('di', 2.177262040741513),
 ('woo', 2.002526461260487),
 ('need', 1.9767791539413062),
 ('don', 1.971

### Topic analysis

In [None]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [31]:
#train model

# tokenize lyrics
lyrics_processed = df["lyrics"].fillna("").str.split()

# remove stopwords
lyrics_processed = [[word.lower() for word in lyrics if word.lower() not in stopwords] for lyrics in lyrics_processed]

dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)

[(0,
  '0.161*"baby" + 0.030*"oh," + 0.024*"하" + 0.020*"na" + 0.019*"pop" + 0.012*"i\'m" + 0.011*"la" + 0.010*"♪" + 0.008*"like" + 0.008*"α"'),
 (1,
  '0.031*"like" + 0.022*"♪" + 0.020*"i\'m" + 0.016*"?" + 0.011*"love" + 0.011*"you\'re" + 0.010*"want" + 0.009*"don\'t" + 0.009*"it\'s" + 0.008*"한"'),
 (2,
  '0.054*"labour" + 0.033*"worse" + 0.032*">" + 0.026*"work" + 0.026*"want" + 0.022*"don\'t" + 0.019*"♪" + 0.011*"제" + 0.010*"love" + 0.009*"!"'),
 (3,
  '0.098*"cake" + 0.036*"제" + 0.018*"?" + 0.018*"don\'t" + 0.014*"significa" + 0.013*"나" + 0.013*"i\'m" + 0.012*"♪" + 0.008*"pop" + 0.008*"like"'),
 (4,
  '0.261*"du" + 0.083*"la" + 0.015*"open" + 0.007*"♪" + 0.007*"ð" + 0.007*"now" + 0.007*"labour" + 0.006*"?" + 0.006*"need" + 0.006*"♫"'),
 (5,
  '0.207*"na" + 0.062*"la" + 0.044*"우" + 0.020*"♪" + 0.014*"ð" + 0.014*"하" + 0.011*"?" + 0.010*"du" + 0.009*"i\'m" + 0.009*"만"'),
 (6,
  '0.063*"work" + 0.046*"labour" + 0.024*"ð" + 0.022*"i\'m" + 0.019*"invalid" + 0.018*"worse" + 0.013*"♪" + 0.0