# Lyrics Analysis

In [108]:
lyrics_directory = "Lyrics_Data/"
tracks_csv = "tracks_group.csv"
stopwords_file = "stopwords.txt"

In [109]:
target_group = "kpop"
kpop_group = set(['kpop', 'itzy', 'seventeen', 'bts', 'twice'])
global_group = set(['global', 'billboard'])

target_and_kpop = kpop_group.union(set([target_group]))
target_and_global = global_group.union(set([target_group]))

## Import data

In [110]:
import pandas as pd

In [111]:
df = pd.read_csv(tracks_csv)

df.head(3)

Unnamed: 0,track_name,track_id,grouping
0,Ice Cream (with Selena Gomez),2J4P46vCFm1rPkNkp9pZWX,kpop
1,Dynamite,0v1x6rN6JHRapa03JElljE,kpop
2,THE BADDEST,2V4Fx72svQRxrFvNT1eq5f,kpop


In [112]:
#all to "kpop"
df.loc[df['grouping'].isin(target_and_kpop), 'grouping'] = "kpop"

In [113]:
df.shape

(4618, 3)

In [114]:
df["grouping"].unique()

array(['kpop', 'global', 'billboard'], dtype=object)

In [115]:
def get_lyrics(track_id):
    try:
        lyrics = ""
        with open(lyrics_directory + track_id + ".txt", "r") as f:
            lyrics = f.read()
            
        return lyrics            
    except:
        return None
    
df["lyrics"] = df["track_id"].apply(get_lyrics)

df.shape

(4618, 4)

In [116]:
# stop words
# https://www.kaggle.com/datasets/rowhitswami/stopwords/
stopwords = set()
with open(stopwords_file, "r") as f:
    stopwords = set(f.read().split("\n"))
    
stopwords

{',',
 '>',
 '?',
 'Just',
 'Yes',
 'a',
 'abaft',
 'abafter',
 'abaftest',
 'about',
 'abouter',
 'aboutest',
 'above',
 'abover',
 'abovest',
 'accordingly',
 'aer',
 'aest',
 'afore',
 'after',
 'afterer',
 'afterest',
 'afterward',
 'afterwards',
 'again',
 'against',
 'aid',
 'ain',
 'albeit',
 'all',
 'aller',
 'allest',
 'alls',
 'allyou',
 'almost',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'andor',
 'anear',
 'anent',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anywhere',
 'apart',
 'aparter',
 'apartest',
 'appear',
 'appeared',
 'appearing',
 'appears',
 'appropriate',
 'appropriated',
 'appropriater',
 'appropriates',
 'appropriatest',
 'appropriating',
 'are',
 'ares',
 'around',
 'as',
 'ases',
 'aside',
 'asides',
 'aslant',
 'astraddle',
 'astraddler',
 'astraddlest',
 'astride',
 'astrider',
 'astridest',
 'at',
 'athwart',
 'atop',
 'atween',
 'aught',
 'aught

## Analyze

In [117]:
def is_stopword(word):
    word = word.lower()
    
    if len(word) <= 1:
        return True
    
    # in the list
    if word in stopwords:
        return True
    
    # is Korean
    if ord("가") <= ord(word[0]) <= ord("힣"):
        return True
    
    return False

### Characteristic Keyword
Using TF-IDF

In [118]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Among global

In [119]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [120]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [121]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + tfidf_series[keyword]

In [122]:
def sort_dictionary(dictionary):
    return [(k, dictionary[k]) for k in sorted(dictionary, key=dictionary.get, reverse=True)]

In [123]:
sort_dictionary(counter)

[('love', 521),
 ('let', 507),
 ('sorry', 497),
 ('news', 412),
 ('home', 410),
 ('heart', 362),
 ('yes', 345),
 ('eyes', 313),
 ('baby', 311),
 ('feel', 301),
 ('say', 299),
 ('think', 292),
 ('back', 279),
 ('night', 270),
 ('take', 267),
 ('world', 239),
 ('new', 226),
 ('follow', 224),
 ('only', 220),
 ('am', 217),
 ('need', 212),
 ('tell', 206),
 ('life', 202),
 ('leave', 202),
 ('light', 199),
 ('dream', 199),
 ('keep', 195),
 ('room', 190),
 ('give', 179),
 ('way', 177),
 ('end', 171),
 ('mind', 170),
 ('down', 162),
 ('wait', 149),
 ('still', 146),
 ('hands', 143),
 ('child', 142),
 ('show', 140),
 ('dance', 139),
 ('girl', 139),
 ('looking', 136),
 ('care', 135),
 ('price_varies', 133),
 ('contact', 133),
 ('little', 130),
 ('hear', 128),
 ('ah', 128),
 ('okay', 126),
 ('away', 125),
 ('honey', 122),
 ('boy', 121),
 ('beautiful', 121),
 ('sleep', 119),
 ('find', 119),
 ('start', 119),
 ('ready', 118),
 ('feeling', 118),
 ('bad', 117),
 ('stay', 116),
 ('call', 115),
 ('sky', 1

In [124]:
sort_dictionary(summer)

[('love', 90.37153562331105),
 ('home', 76.21004040264718),
 ('sorry', 69.1397655218261),
 ('news', 48.08603122812252),
 ('let', 45.65971006638431),
 ('heart', 43.53378986841094),
 ('yes', 39.32657988099398),
 ('baby', 37.152868190550656),
 ('한국', 32.36819854379235),
 ('follow', 32.046654207752745),
 ('eyes', 30.337788713241792),
 ('night', 30.09200698624242),
 ('feel', 28.39620085138629),
 ('dream', 27.485997323149512),
 ('back', 27.444226966689936),
 ('world', 27.138972303382236),
 ('new', 25.668929887759134),
 ('say', 25.19307183864961),
 ('ah', 24.989323963741565),
 ('light', 24.884317238554694),
 ('only', 24.528770918077953),
 ('think', 24.049286804765085),
 ('woo', 23.311984021567397),
 ('room', 22.957403813727545),
 ('am', 22.89032814755495),
 ('dance', 22.81597776759301),
 ('need', 22.554843859626192),
 ('life', 21.359150404595603),
 ('contact', 20.8388989921961),
 ('hot', 20.835045446552304),
 ('give', 20.800300456215705),
 ('boy', 20.55275934228878),
 ('end', 19.7053305786462

#### Among KPOP

In [125]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [126]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [127]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + tfidf_series[keyword]

In [128]:
sort_dictionary(counter)

[('love', 521),
 ('let', 507),
 ('sorry', 497),
 ('news', 413),
 ('home', 408),
 ('heart', 363),
 ('yes', 345),
 ('eyes', 313),
 ('baby', 312),
 ('feel', 301),
 ('say', 299),
 ('think', 292),
 ('back', 279),
 ('night', 271),
 ('take', 267),
 ('world', 239),
 ('new', 226),
 ('follow', 224),
 ('only', 221),
 ('am', 217),
 ('need', 212),
 ('tell', 207),
 ('life', 202),
 ('leave', 202),
 ('light', 199),
 ('dream', 199),
 ('keep', 195),
 ('room', 190),
 ('give', 179),
 ('way', 177),
 ('end', 171),
 ('mind', 170),
 ('down', 162),
 ('wait', 149),
 ('still', 146),
 ('hands', 144),
 ('child', 142),
 ('show', 140),
 ('dance', 139),
 ('girl', 139),
 ('looking', 136),
 ('care', 135),
 ('price_varies', 133),
 ('contact', 133),
 ('little', 130),
 ('hear', 128),
 ('ah', 128),
 ('away', 125),
 ('okay', 125),
 ('honey', 122),
 ('boy', 121),
 ('beautiful', 121),
 ('sleep', 119),
 ('find', 119),
 ('start', 119),
 ('feeling', 118),
 ('bad', 117),
 ('ready', 117),
 ('stay', 116),
 ('call', 115),
 ('sky', 1

In [129]:
sort_dictionary(summer)

[('love', 93.38607915430586),
 ('home', 67.60988950474074),
 ('sorry', 50.59637894638318),
 ('let', 47.93110110384121),
 ('heart', 41.116791964066365),
 ('yes', 41.10677536323856),
 ('baby', 38.186684222047816),
 ('news', 31.517989144550324),
 ('feel', 31.44896588575648),
 ('night', 31.329232070269644),
 ('say', 29.465943981617627),
 ('back', 28.917235907807306),
 ('only', 27.346261757010232),
 ('eyes', 27.17744214823235),
 ('think', 25.90023701555005),
 ('follow', 25.761256945043133),
 ('need', 25.761205628558344),
 ('world', 25.658269008485153),
 ('한국', 25.315827692846227),
 ('am', 24.95654988991367),
 ('take', 24.1156828940417),
 ('life', 23.881421538952736),
 ('give', 23.697773262598623),
 ('new', 23.122089100039577),
 ('ah', 22.84938734799792),
 ('dream', 22.769037842367304),
 ('light', 22.613825747417657),
 ('dance', 21.8695620276186),
 ('tell', 21.632868387047996),
 ('woo', 21.403899689771514),
 ('boy', 20.841865832124057),
 ('leave', 20.144998389279486),
 ('room', 19.6380575782

### Topic analysis

In [130]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [131]:
def pre_process(df_lyrics):
    # tokenize lyrics
    lyrics_processed = df_lyrics.fillna("").str.split()

    for _list in lyrics_processed:
        for cnt in range(len(_list)):
            _list[cnt] = _list[cnt].replace(".", "").replace(",", "").replace("?", "").replace("\\", "").replace("/", "").replace(":", "").lower()
            
            if is_stopword(_list[cnt]):
                _list[cnt] = ""
                
    # remove empty string
    for cnt in range(len(lyrics_processed)):
        while(True):
            try:
                lyrics_processed[cnt].remove("")
            except:
                break

    return lyrics_processed

#### Among global

In [132]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [133]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

all topics

In [134]:
# observe topics
topics = lda.print_topics()

#sort
topics = sorted(topics, key=lambda x: x[0])

topics

[(6,
  '0.067*"love" + 0.061*"head" + 0.052*"boom" + 0.024*"heart" + 0.019*"shining" + 0.016*"name" + 0.012*"feel" + 0.012*"clap" + 0.012*"world" + 0.011*"change"'),
 (13,
  '0.027*"ooh" + 0.024*"say" + 0.023*"love" + 0.013*"mob" + 0.012*"fake" + 0.010*"life" + 0.009*"take" + 0.008*"shoot!" + 0.008*"(still" + 0.008*"true"'),
 (19,
  '0.135*"give" + 0.043*"fine" + 0.021*"find" + 0.019*"tan" + 0.018*"up!" + 0.016*"bitter" + 0.015*"well" + 0.011*"blood" + 0.011*"way" + 0.011*"feel"'),
 (21,
  '0.554*"drunk" + 0.015*"try" + 0.008*"follow" + 0.007*"need" + 0.005*"ass" + 0.005*"take" + 0.005*"science" + 0.004*"importance" + 0.004*"let\'s" + 0.004*"love"'),
 (27,
  '0.091*"imma" + 0.048*"candy" + 0.034*"yes" + 0.027*"tough" + 0.021*"fuck" + 0.018*"back)" + 0.016*"dam" + 0.011*"queen" + 0.010*"ocean" + 0.010*"back"'),
 (33,
  '0.032*"release" + 0.032*"che" + 0.025*"id" + 0.018*"mic" + 0.017*"goccia" + 0.016*"così" + 0.014*"take" + 0.014*"rousing" + 0.013*"bene" + 0.013*"ma"'),
 (38,
  '0.036*"

what topics are in kpop?

In [135]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + topic[1]

In [136]:
sort_dictionary(counter)

[(46, 110),
 (26, 55),
 (94, 44),
 (52, 42),
 (98, 29),
 (57, 27),
 (61, 23),
 (81, 22),
 (96, 21),
 (6, 21),
 (30, 21),
 (19, 19),
 (63, 19),
 (10, 17),
 (13, 17),
 (95, 15),
 (38, 14),
 (2, 14),
 (39, 14),
 (65, 13),
 (83, 13),
 (29, 12),
 (18, 12),
 (34, 12),
 (55, 12),
 (64, 12),
 (1, 12),
 (62, 11),
 (25, 11),
 (56, 11),
 (20, 10),
 (44, 10),
 (71, 10),
 (77, 10),
 (84, 10),
 (15, 9),
 (60, 9),
 (75, 9),
 (43, 9),
 (53, 9),
 (70, 9),
 (33, 9),
 (4, 9),
 (66, 9),
 (78, 9),
 (72, 8),
 (31, 8),
 (58, 8),
 (28, 8),
 (23, 8),
 (7, 8),
 (49, 8),
 (67, 8),
 (5, 7),
 (76, 7),
 (90, 7),
 (85, 7),
 (22, 7),
 (11, 7),
 (54, 7),
 (42, 7),
 (99, 7),
 (12, 7),
 (37, 7),
 (74, 7),
 (87, 6),
 (27, 6),
 (91, 6),
 (45, 6),
 (97, 6),
 (3, 6),
 (79, 6),
 (69, 6),
 (21, 5),
 (48, 5),
 (73, 5),
 (50, 4),
 (16, 4),
 (24, 4),
 (93, 4),
 (92, 4),
 (32, 4),
 (40, 4),
 (0, 4),
 (89, 3),
 (8, 3),
 (80, 3),
 (9, 3),
 (41, 3),
 (14, 2),
 (47, 2),
 (82, 2),
 (36, 2),
 (86, 2),
 (68, 2),
 (51, 2),
 (59, 1),
 (88

In [137]:
sums = sort_dictionary(summer)

sums

[(46, 94.34745529053271),
 (26, 63.96423925339059),
 (94, 44.44893861426317),
 (52, 40.287915803066426),
 (98, 39.36686307965783),
 (57, 34.8034904187798),
 (19, 31.589083668302465),
 (6, 28.985962629220012),
 (63, 28.945663333066022),
 (96, 28.043495560307292),
 (61, 27.82724805036196),
 (81, 27.599877303800895),
 (30, 26.32019776665038),
 (44, 24.8367213714223),
 (71, 24.648481582762543),
 (55, 24.21835835615275),
 (64, 23.626762061008776),
 (72, 22.598628589596956),
 (13, 22.21245981701213),
 (18, 21.304797137447622),
 (25, 20.551923422996424),
 (10, 20.54696081247448),
 (43, 20.369529699722534),
 (65, 20.2457305297321),
 (20, 20.222119668640516),
 (22, 19.868375761792322),
 (75, 19.536300962840414),
 (38, 19.414757197721883),
 (39, 18.91188365360358),
 (60, 18.693693060541136),
 (95, 18.52101289630764),
 (58, 18.297943226596544),
 (2, 18.2950037894675),
 (15, 18.01228848045639),
 (29, 17.70503469491814),
 (1, 17.160757744037255),
 (62, 16.824888300705425),
 (34, 16.665964629235532)

In [138]:
for _sum in sums[:10]:
    print("Topic ID: " + str(_sum[0]))
    print("Sum: " + str(_sum[1]))
    print("Count: " + str(counter[_sum[0]]))
    
    print("Keywords: ")
    print(lda.print_topic(int(_sum[0])))
    
    print("\n")

Topic ID: 46
Sum: 94.34745529053271
Count: 110
Keywords: 
0.049*"love" + 0.017*"can't" + 0.017*"need" + 0.017*"news" + 0.015*"back" + 0.013*"touch" + 0.013*"crazy" + 0.012*"heart" + 0.011*"miss" + 0.010*"only"


Topic ID: 26
Sum: 63.96423925339059
Count: 55
Keywords: 
0.027*"comment" + 0.026*"home" + 0.025*"think" + 0.018*"am" + 0.017*"news" + 0.015*"on!" + 0.015*"put" + 0.014*"days" + 0.012*"sorry" + 0.011*"reply"


Topic ID: 94
Sum: 44.44893861426317
Count: 44
Keywords: 
0.131*"love" + 0.028*"let" + 0.021*"life" + 0.019*"let's" + 0.017*"need" + 0.016*"honey" + 0.015*"give" + 0.014*"dawn" + 0.013*"eyes" + 0.013*"baby"


Topic ID: 52
Sum: 40.287915803066426
Count: 42
Keywords: 
0.413*"home" + 0.012*"sorry" + 0.011*"heart" + 0.011*"domino" + 0.011*"place" + 0.010*"run" + 0.008*"on!" + 0.007*"back" + 0.007*"chiudi" + 0.007*"there's"


Topic ID: 98
Sum: 39.36686307965783
Count: 29
Keywords: 
0.034*"need" + 0.026*"heart" + 0.018*"can't" + 0.017*"think" + 0.017*"feel" + 0.016*"love" + 0.014

#### Among KPOP

In [139]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [140]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

# observe topics
topics = lda.print_topics()

In [141]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + topic[1]


In [142]:
sort_dictionary(counter)

[(48, 48),
 (42, 48),
 (64, 42),
 (84, 42),
 (99, 39),
 (12, 32),
 (7, 31),
 (31, 31),
 (58, 29),
 (28, 22),
 (35, 22),
 (87, 21),
 (29, 21),
 (62, 21),
 (97, 21),
 (72, 21),
 (63, 20),
 (82, 19),
 (76, 19),
 (43, 19),
 (23, 19),
 (96, 19),
 (98, 18),
 (61, 18),
 (83, 18),
 (45, 18),
 (41, 18),
 (13, 18),
 (27, 17),
 (78, 17),
 (89, 17),
 (15, 17),
 (73, 16),
 (74, 16),
 (9, 16),
 (16, 16),
 (20, 16),
 (17, 15),
 (34, 15),
 (19, 15),
 (69, 15),
 (59, 15),
 (92, 15),
 (90, 15),
 (22, 15),
 (21, 14),
 (80, 14),
 (67, 14),
 (60, 14),
 (88, 14),
 (56, 14),
 (37, 14),
 (81, 13),
 (53, 13),
 (6, 13),
 (93, 13),
 (33, 13),
 (51, 13),
 (75, 13),
 (47, 13),
 (24, 13),
 (1, 12),
 (77, 12),
 (49, 12),
 (4, 12),
 (71, 11),
 (86, 11),
 (10, 11),
 (54, 11),
 (38, 11),
 (44, 11),
 (50, 11),
 (70, 10),
 (18, 10),
 (95, 10),
 (14, 10),
 (68, 10),
 (3, 9),
 (91, 9),
 (11, 9),
 (52, 9),
 (55, 9),
 (66, 9),
 (40, 8),
 (79, 8),
 (94, 8),
 (25, 8),
 (46, 8),
 (32, 8),
 (57, 7),
 (85, 7),
 (8, 7),
 (65, 7),


In [143]:

sums = sort_dictionary(summer)

sums

[(42, 47.87212189932461),
 (48, 46.39122459341661),
 (64, 42.65885938686324),
 (84, 39.12348793377669),
 (99, 37.11065647573696),
 (7, 31.035525688264443),
 (12, 30.864193163326036),
 (31, 29.68879391432847),
 (58, 29.318574029284264),
 (72, 24.299220275757307),
 (87, 22.832280059804816),
 (28, 22.814942863698434),
 (73, 22.300994403821733),
 (23, 22.130376888294904),
 (63, 21.861947919945578),
 (35, 21.545712849766915),
 (62, 20.941421931509012),
 (76, 20.5857677625404),
 (45, 20.582934994150037),
 (96, 20.522714962711234),
 (29, 20.485677266851326),
 (97, 19.770961405426988),
 (43, 19.71347072369963),
 (83, 19.517215378380115),
 (82, 19.463463431018226),
 (98, 19.136327407473345),
 (27, 18.919885771573263),
 (20, 18.84635671433489),
 (41, 18.685622694597896),
 (15, 18.636599308485074),
 (37, 18.247729307785903),
 (78, 17.99946661603599),
 (89, 17.655000901575022),
 (61, 17.604536578857278),
 (13, 17.593606907111734),
 (59, 17.56852834436131),
 (74, 17.451359031526863),
 (16, 17.18568

In [144]:
#sort
topics = sorted(topics, key=lambda x: x[0])


for _sum in sums[:10]:
    print("Topic ID: " + str(_sum[0]))
    print("Sum: " + str(_sum[1]))
    print("Count: " + str(counter[_sum[0]]))
    
    print("Keywords: ")
    print(lda.print_topic(int(_sum[0])))
    
    print("\n")

Topic ID: 42
Sum: 47.87212189932461
Count: 48
Keywords: 
0.062*"love" + 0.016*"say" + 0.015*"can't" + 0.015*"let" + 0.010*"waiting" + 0.009*"leave" + 0.009*"back" + 0.008*"take" + 0.008*"news" + 0.007*"zu"


Topic ID: 48
Sum: 46.39122459341661
Count: 48
Keywords: 
0.026*"love" + 0.025*"can't" + 0.021*"sorry" + 0.015*"only" + 0.014*"ohohohohohohohohohoh" + 0.012*"say" + 0.011*"heart" + 0.011*"gucci" + 0.011*"home" + 0.010*"give"


Topic ID: 64
Sum: 42.65885938686324
Count: 42
Keywords: 
0.222*"home" + 0.051*"invalid" + 0.016*"baby" + 0.015*"shut" + 0.014*"bum" + 0.009*"head" + 0.008*"chiudi" + 0.007*"id" + 0.007*"light" + 0.007*"love"


Topic ID: 84
Sum: 39.12348793377669
Count: 42
Keywords: 
0.039*"love" + 0.016*"moon" + 0.016*"baby" + 0.014*"need" + 0.012*"let" + 0.012*"fall" + 0.011*"heart" + 0.009*"point" + 0.008*"rock" + 0.008*"dawn"


Topic ID: 99
Sum: 37.11065647573696
Count: 39
Keywords: 
0.090*"love" + 0.024*"heart" + 0.018*"crazy" + 0.010*"give" + 0.010*"sorry" + 0.009*"happy"

### Emotion analysis
Emotions:  
fear
anger
anticipation
trust
surprise
positive
negative
sadness
disgust
joy

In [145]:
import numpy as np

In [146]:
emotions = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "negative",	
    "positive",
    "sadness",
    "surprise",	
    "trust",
]

In [147]:
df_nrc = pd.read_csv("NRC.txt", sep="\t", names=["word", "emotion", "amount"])

words_to_emotions = {}

for _, row in df_nrc.iterrows():
    words_to_emotions[row["word"]] = words_to_emotions.get(row["word"], np.zeros((10, 1)))
    words_to_emotions[row["word"]][emotions.index(row["emotion"])] = row["amount"]

In [148]:
df_emotion = df.copy()
for emotion in emotions:
    df_emotion[emotion] = 0

In [149]:
for cnt in range(len(df)):
    lyrics = df["lyrics"][cnt]

    if type(lyrics) != str:
        continue

    emotion_this = np.zeros((10, 1))

    for word in lyrics.split():
        if word in words_to_emotions:
            emotion_this += words_to_emotions[word]
            
    if np.linalg.norm(emotion_this) != 0:
        emotion_this /= np.linalg.norm(emotion_this)
        
    for emotion in emotions:
        df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emoti

In [150]:
df_emotion_global = df_emotion[df_emotion["grouping"].isin(global_group)]
mean_emotion_global = df_emotion_global[emotions].mean()
normalized_mean_emotion_global = mean_emotion_global / np.linalg.norm(mean_emotion_global)

df_emotion_kpop = df_emotion[df_emotion["grouping"].isin(kpop_group)]
mean_emotion_kpop = df_emotion_kpop[emotions].mean()
normalized_mean_emotion_kpop = mean_emotion_kpop / np.linalg.norm(mean_emotion_kpop)

df_emotion_target = df_emotion[df_emotion["grouping"] == target_group]
mean_emotion_target = df_emotion_target[emotions].mean()
normalized_mean_emotion_target = mean_emotion_target / np.linalg.norm(mean_emotion_target)

In [None]:
normalized_mean_emotion_target

against global

In [151]:
emotion_percentage = (normalized_mean_emotion_target - normalized_mean_emotion_global) / normalized_mean_emotion_global * 100

emotion_percentage

anger          -25.268986
anticipation     8.704336
disgust        -27.502075
fear           -11.956912
joy             12.735731
negative       -22.729707
positive        17.274200
sadness        -15.694206
surprise        -2.115851
trust           -4.557240
dtype: float64

against KPOP

In [152]:
emotion_percentage = (normalized_mean_emotion_target - normalized_mean_emotion_kpop) / normalized_mean_emotion_kpop * 100

emotion_percentage

anger           0.0
anticipation    0.0
disgust         0.0
fear            0.0
joy             0.0
negative        0.0
positive        0.0
sadness         0.0
surprise        0.0
trust           0.0
dtype: float64