# Lyrics Analysis

In [95]:
lyrics_directory = "Lyrics_Data/"
tracks_csv = "tracks_group.csv"
stopwords_file = "stopwords.txt"

In [96]:
target_group = "seventeen"
kpop_group = set(['kpop', 'itzy', 'seventeen', 'bts', 'twice'])
global_group = set(['global', 'billboard'])

target_and_kpop = kpop_group.union(set([target_group]))
target_and_global = global_group.union(set([target_group]))

## Import data

In [97]:
import pandas as pd

In [98]:
df = pd.read_csv(tracks_csv)

df.head(3)

Unnamed: 0,track_name,track_id,grouping
0,Ice Cream (with Selena Gomez),2J4P46vCFm1rPkNkp9pZWX,kpop
1,Dynamite,0v1x6rN6JHRapa03JElljE,kpop
2,THE BADDEST,2V4Fx72svQRxrFvNT1eq5f,kpop


In [99]:
df.shape

(4618, 3)

In [100]:
df["grouping"].unique()

array(['kpop', 'itzy', 'seventeen', 'bts', 'twice', 'global', 'billboard'],
      dtype=object)

In [101]:
def get_lyrics(track_id):
    try:
        lyrics = ""
        with open(lyrics_directory + track_id + ".txt", "r") as f:
            lyrics = f.read()
            
        return lyrics            
    except:
        return None
    
df["lyrics"] = df["track_id"].apply(get_lyrics)

df.shape

(4618, 4)

In [102]:
# stop words
# https://www.kaggle.com/datasets/rowhitswami/stopwords/
stopwords = set()
with open(stopwords_file, "r") as f:
    stopwords = set(f.read().split("\n"))
    
stopwords

{',',
 '>',
 '?',
 'Just',
 'Yes',
 'a',
 'abaft',
 'abafter',
 'abaftest',
 'about',
 'abouter',
 'aboutest',
 'above',
 'abover',
 'abovest',
 'accordingly',
 'aer',
 'aest',
 'afore',
 'after',
 'afterer',
 'afterest',
 'afterward',
 'afterwards',
 'again',
 'against',
 'aid',
 'ain',
 'albeit',
 'all',
 'aller',
 'allest',
 'alls',
 'allyou',
 'almost',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'andor',
 'anear',
 'anent',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anywhere',
 'apart',
 'aparter',
 'apartest',
 'appear',
 'appeared',
 'appearing',
 'appears',
 'appropriate',
 'appropriated',
 'appropriater',
 'appropriates',
 'appropriatest',
 'appropriating',
 'are',
 'ares',
 'around',
 'as',
 'ases',
 'aside',
 'asides',
 'aslant',
 'astraddle',
 'astraddler',
 'astraddlest',
 'astride',
 'astrider',
 'astridest',
 'at',
 'athwart',
 'atop',
 'atween',
 'aught',
 'aught

## Analyze

In [103]:
def is_stopword(word):
    word = word.lower()
    
    if len(word) <= 1:
        return True
    
    # in the list
    if word in stopwords:
        return True
    
    # is Korean
    if ord("가") <= ord(word[0]) <= ord("힣"):
        return True
    
    return False

### Characteristic Keyword
Using TF-IDF

In [104]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Among global

In [105]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [106]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [107]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + tfidf_series[keyword]

In [108]:
def sort_dictionary(dictionary):
    return [(k, dictionary[k]) for k in sorted(dictionary, key=dictionary.get, reverse=True)]

In [109]:
sort_dictionary(counter)

[('home', 81),
 ('sorry', 80),
 ('news', 74),
 ('let', 72),
 ('heart', 61),
 ('love', 60),
 ('think', 48),
 ('baby', 42),
 ('new', 39),
 ('night', 39),
 ('only', 39),
 ('follow', 38),
 ('feel', 38),
 ('world', 38),
 ('end', 34),
 ('eyes', 34),
 ('back', 33),
 ('contact', 33),
 ('keep', 32),
 ('yes', 31),
 ('say', 31),
 ('looking', 30),
 ('leave', 30),
 ('dream', 30),
 ('mind', 29),
 ('happy', 29),
 ('sleep', 28),
 ('hands', 27),
 ('need', 25),
 ('yeh', 25),
 ('give', 25),
 ('morning', 25),
 ('find', 24),
 ('take', 24),
 ('am', 24),
 ('hand', 24),
 ('sky', 24),
 ('wait', 23),
 ('worry', 23),
 ('smile', 22),
 ('room', 21),
 ('tell', 21),
 ('ah', 20),
 ('sun', 20),
 ('dance', 20),
 ('meet', 20),
 ('today', 19),
 ('life', 19),
 ('music', 19),
 ('away', 18),
 ('way', 18),
 ('play', 18),
 ('light', 18),
 ('hear', 18),
 ('wind', 18),
 ('little', 17),
 ('change', 17),
 ('still', 17),
 ('child', 17),
 ('easy', 17),
 ('tomorrow', 17),
 ('place', 17),
 ('remember', 17),
 ('share', 16),
 ('call', 

In [110]:
sort_dictionary(summer)

[('home', 15.736966679808186),
 ('news', 14.518524685914194),
 ('sorry', 11.91577797479948),
 ('heart', 8.811270094419466),
 ('love', 8.688800538611188),
 ('baby', 7.788746881942375),
 ('contact', 6.832942029185571),
 ('yeh', 6.542506996848126),
 ('world', 6.220133391429427),
 ('let', 6.0279484609790055),
 ('follow', 5.612129997871336),
 ('dream', 5.371939112373928),
 ('only', 5.096528380955112),
 ('new', 5.071695046562155),
 ('end', 4.9540229578548205),
 ('think', 4.712041506435446),
 ('happy', 4.496023423359318),
 ('back', 4.005746254043052),
 ('give', 3.7335500790827494),
 ('looking', 3.6827297732633477),
 ('wind', 3.670400931134151),
 ('worry', 3.6425433783146657),
 ('smile', 3.589368585436858),
 ('shut', 3.571071355579521),
 ('leave', 3.5368330698902164),
 ('ay', 3.4872861784621976),
 ('night', 3.483549123653218),
 ('yes', 3.44780622637595),
 ('run', 3.4310940206210545),
 ('song', 3.418346823020923),
 ('life', 3.364894648261042),
 ('um', 3.3641824991900555),
 ('music', 3.347872862

#### Among KPOP

In [111]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [112]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [113]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + tfidf_series[keyword]

In [114]:
sort_dictionary(counter)

[('home', 80),
 ('sorry', 80),
 ('news', 74),
 ('let', 72),
 ('heart', 61),
 ('love', 60),
 ('think', 48),
 ('baby', 42),
 ('night', 40),
 ('only', 40),
 ('new', 39),
 ('follow', 38),
 ('feel', 38),
 ('world', 38),
 ('end', 34),
 ('eyes', 34),
 ('back', 33),
 ('contact', 33),
 ('keep', 32),
 ('yes', 31),
 ('say', 31),
 ('looking', 30),
 ('leave', 30),
 ('dream', 30),
 ('mind', 29),
 ('happy', 29),
 ('sleep', 28),
 ('hands', 27),
 ('need', 25),
 ('yeh', 25),
 ('give', 25),
 ('morning', 25),
 ('find', 24),
 ('take', 24),
 ('am', 24),
 ('hand', 24),
 ('sky', 24),
 ('wait', 23),
 ('worry', 23),
 ('tell', 22),
 ('smile', 22),
 ('room', 21),
 ('ah', 20),
 ('sun', 20),
 ('dance', 20),
 ('comment', 20),
 ('meet', 20),
 ('today', 19),
 ('life', 19),
 ('music', 19),
 ('away', 18),
 ('price_varies', 18),
 ('way', 18),
 ('play', 18),
 ('light', 18),
 ('hear', 18),
 ('wind', 18),
 ('little', 17),
 ('change', 17),
 ('still', 17),
 ('child', 17),
 ('easy', 17),
 ('song', 17),
 ('questions', 17),
 ('t

In [115]:
sort_dictionary(summer)

[('home', 12.15820554016299),
 ('love', 9.029950266202746),
 ('heart', 7.8499695759031205),
 ('baby', 7.383031344719721),
 ('news', 7.112408166336561),
 ('sorry', 6.890558195969502),
 ('only', 5.749848884568152),
 ('yeh', 5.713494252203872),
 ('let', 5.712994427725181),
 ('think', 5.069674096544697),
 ('world', 4.969372652295935),
 ('back', 4.683235562496428),
 ('give', 4.351780180473527),
 ('end', 4.047276678189264),
 ('dream', 4.043967495151701),
 ('new', 4.031170116408636),
 ('happy', 4.013403552645574),
 ('contact', 3.9751484423575043),
 ('life', 3.706154365069785),
 ('run', 3.6648673415278665),
 ('follow', 3.6406844515529726),
 ('looking', 3.619639156326462),
 ('yes', 3.5779200142749383),
 ('night', 3.481554551838795),
 ('say', 3.4765591544016083),
 ('feel', 3.3922187540080064),
 ('song', 3.3899439264015068),
 ('call', 3.38277431976287),
 ('am', 3.371554583814393),
 ('music', 3.370028153834897),
 ('find', 3.316858735067583),
 ('leave', 3.3125686232572074),
 ('ay', 3.24221179522855

### Topic analysis

In [116]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [117]:
def pre_process(df_lyrics):
    # tokenize lyrics
    lyrics_processed = df_lyrics.fillna("").str.split()

    for _list in lyrics_processed:
        for cnt in range(len(_list)):
            _list[cnt] = _list[cnt].replace(".", "").replace(",", "").replace("?", "").replace("\\", "").replace("/", "").replace(":", "").lower()
            
            if is_stopword(_list[cnt]):
                _list[cnt] = ""
                
    # remove empty string
    for cnt in range(len(lyrics_processed)):
        while(True):
            try:
                lyrics_processed[cnt].remove("")
            except:
                break

    return lyrics_processed

#### Among global

In [118]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [119]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

all topics

In [120]:
# observe topics
topics = lda.print_topics()

#sort
topics = sorted(topics, key=lambda x: x[0])

topics

[(2,
  '0.149*"party" + 0.033*"thank" + 0.016*"animals" + 0.012*"party)" + 0.010*"say" + 0.009*"flower" + 0.009*"(thank" + 0.009*"next" + 0.009*"next)" + 0.008*"fucking"'),
 (5,
  '0.193*"dance" + 0.028*"worth" + 0.011*"let" + 0.010*"baby" + 0.009*"increasing" + 0.009*"say" + 0.009*"love" + 0.009*"let\'s" + 0.007*"hands" + 0.007*"can\'t"'),
 (9,
  '0.127*"show" + 0.048*"way" + 0.015*"only" + 0.014*"safe" + 0.011*"healthy" + 0.010*"find" + 0.008*"love" + 0.007*"world" + 0.007*"far" + 0.007*"closer"'),
 (16,
  '0.285*"eyes" + 0.059*"g6" + 0.036*"infinite" + 0.032*"dj" + 0.014*"take" + 0.013*"hands" + 0.013*"put" + 0.011*"fall" + 0.008*"love" + 0.008*"(ah"'),
 (22,
  '0.036*"solosta" + 0.021*"need" + 0.015*"give" + 0.010*"ride" + 0.009*"hand" + 0.009*"once" + 0.009*"feel" + 0.009*"yes" + 0.009*"girl" + 0.009*"alone"'),
 (25,
  '0.021*"girl" + 0.019*"looks" + 0.015*"rock" + 0.015*"body" + 0.012*"straight" + 0.012*"big" + 0.010*"only" + 0.010*"take" + 0.010*"yes" + 0.010*"man"'),
 (28,
  '0

what topics are in kpop?

In [121]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + topic[1]

In [122]:
sort_dictionary(counter)

[(36, 17),
 (32, 12),
 (79, 9),
 (24, 8),
 (6, 7),
 (72, 5),
 (51, 4),
 (0, 4),
 (54, 4),
 (34, 3),
 (97, 3),
 (94, 3),
 (1, 3),
 (33, 3),
 (92, 3),
 (30, 3),
 (68, 3),
 (70, 2),
 (77, 2),
 (60, 2),
 (55, 2),
 (48, 2),
 (46, 2),
 (10, 2),
 (88, 2),
 (23, 2),
 (4, 2),
 (87, 1),
 (75, 1),
 (83, 1),
 (56, 1),
 (39, 1),
 (41, 1),
 (50, 1),
 (13, 1),
 (26, 1),
 (49, 1),
 (5, 1),
 (27, 1),
 (38, 1),
 (73, 1),
 (61, 1),
 (59, 1),
 (82, 1),
 (53, 1),
 (3, 1),
 (19, 1)]

In [123]:
sums = sort_dictionary(summer)

sums

[(36, 15.5893348401396),
 (32, 11.91839600962976),
 (79, 8.511111778981103),
 (6, 8.128743125150322),
 (51, 7.332326606406241),
 (94, 7.309730531729656),
 (34, 7.061807837099877),
 (24, 5.98641967075946),
 (72, 5.709724241265576),
 (30, 5.124376632099484),
 (92, 4.719367550696006),
 (38, 4.32157221068519),
 (48, 4.318225657482799),
 (33, 4.314088649774021),
 (22, 4.063316682149889),
 (97, 3.9126402728097673),
 (46, 3.5866152379849154),
 (0, 3.576765094784605),
 (59, 3.313649829629867),
 (87, 3.3123557820572387),
 (54, 3.278474110697971),
 (1, 3.22765487002016),
 (77, 3.217694475024473),
 (61, 3.2091273535361324),
 (75, 3.159904057089989),
 (60, 3.1475812619964927),
 (19, 3.1288701944858985),
 (55, 3.125835261964312),
 (70, 3.1091034165501696),
 (88, 3.02433581848436),
 (23, 2.9001070760305083),
 (17, 2.887709945500319),
 (64, 2.8641585269970165),
 (11, 2.7833231043650812),
 (90, 2.564852961538236),
 (49, 2.553538547122116),
 (63, 2.5242298623370516),
 (10, 2.5171694262562596),
 (4, 2.4

In [124]:
for _sum in sums[:10]:
    print("Topic ID: " + str(_sum[0]))
    print("Sum: " + str(_sum[1]))
    print("Count: " + str(counter[_sum[0]]))
    
    print("Keywords: ")
    print(lda.print_topic(int(_sum[0])))
    
    print("\n")

Topic ID: 36
Sum: 15.5893348401396
Count: 17
Keywords: 
0.024*"call" + 0.019*"back" + 0.018*"say" + 0.015*"let's" + 0.015*"wave" + 0.010*"night" + 0.010*"air" + 0.008*"love" + 0.007*"feel" + 0.007*"summer"


Topic ID: 32
Sum: 11.91839600962976
Count: 12
Keywords: 
0.117*"love" + 0.019*"heart" + 0.017*"girl" + 0.013*"can't" + 0.011*"only" + 0.011*"let" + 0.009*"home" + 0.009*"think" + 0.008*"tell" + 0.007*"take"


Topic ID: 79
Sum: 8.511111778981103
Count: 9
Keywords: 
0.022*"love" + 0.019*"heart" + 0.016*"only" + 0.016*"girl" + 0.015*"take" + 0.013*"little" + 0.013*"home" + 0.013*"feel" + 0.013*"tell" + 0.010*"say"


Topic ID: 6
Sum: 8.128743125150322
Count: 7
Keywords: 
0.460*"baby" + 0.013*"power" + 0.009*"ya" + 0.008*"heart" + 0.007*"we'll" + 0.007*"ay" + 0.006*"hit" + 0.006*"hold" + 0.005*"cuz" + 0.005*"fly"


Topic ID: 51
Sum: 7.332326606406241
Count: 4
Keywords: 
0.040*"wait" + 0.025*"lights" + 0.019*"happy" + 0.014*"can't" + 0.013*"news" + 0.012*"say" + 0.011*"love" + 0.009*"fee

#### Among KPOP

In [125]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [126]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

# observe topics
topics = lda.print_topics()

In [127]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + topic[1]


In [128]:
sort_dictionary(counter)

[(74, 10),
 (66, 7),
 (29, 7),
 (87, 7),
 (81, 6),
 (57, 6),
 (19, 6),
 (4, 5),
 (83, 5),
 (61, 5),
 (88, 5),
 (0, 5),
 (77, 4),
 (98, 4),
 (37, 4),
 (85, 4),
 (18, 4),
 (70, 4),
 (5, 3),
 (71, 3),
 (49, 3),
 (41, 3),
 (14, 3),
 (95, 3),
 (75, 3),
 (6, 3),
 (1, 3),
 (20, 3),
 (67, 3),
 (21, 3),
 (10, 3),
 (76, 3),
 (15, 3),
 (2, 3),
 (8, 2),
 (22, 2),
 (79, 2),
 (58, 2),
 (53, 2),
 (55, 2),
 (60, 2),
 (27, 2),
 (32, 2),
 (12, 2),
 (31, 2),
 (23, 2),
 (97, 2),
 (30, 2),
 (73, 2),
 (3, 2),
 (52, 2),
 (42, 2),
 (35, 2),
 (45, 2),
 (93, 1),
 (94, 1),
 (69, 1),
 (56, 1),
 (99, 1),
 (86, 1),
 (46, 1),
 (7, 1),
 (47, 1),
 (50, 1),
 (28, 1),
 (13, 1),
 (80, 1),
 (59, 1),
 (91, 1),
 (16, 1),
 (90, 1),
 (78, 1),
 (36, 1),
 (26, 1),
 (89, 1),
 (9, 1),
 (44, 1),
 (92, 1),
 (62, 1),
 (54, 1),
 (33, 1)]

In [129]:

sums = sort_dictionary(summer)

sums

[(74, 7.53662898629409),
 (83, 6.697580643187393),
 (29, 6.409054710085911),
 (66, 6.377055175282294),
 (57, 6.346622561148251),
 (19, 6.148868624375609),
 (87, 6.1287213917967165),
 (81, 6.095300212546135),
 (70, 5.281404701854626),
 (88, 5.062123182280629),
 (98, 4.925158909129095),
 (4, 4.827530357673822),
 (77, 4.777764074897277),
 (61, 4.708217222621897),
 (85, 4.477699531307735),
 (37, 4.42885576075787),
 (0, 4.279442219551129),
 (75, 4.2665555423736805),
 (18, 4.251563349855132),
 (95, 4.118555302462482),
 (21, 3.954200449967175),
 (1, 3.9273885219736258),
 (67, 3.8901459029584657),
 (6, 3.8584369057352887),
 (15, 3.6920271464114194),
 (14, 3.594542479957454),
 (41, 3.3697585102781886),
 (71, 3.3330627011018805),
 (20, 3.3094114027553587),
 (3, 3.088496930387919),
 (12, 2.9865094357010094),
 (76, 2.950348244718043),
 (2, 2.927789156776271),
 (23, 2.9046782628865913),
 (49, 2.8598288234425127),
 (5, 2.8499000459123636),
 (53, 2.8329897031289875),
 (8, 2.816788261759939),
 (58, 2.

In [130]:
#sort
topics = sorted(topics, key=lambda x: x[0])


for _sum in sums[:10]:
    print("Topic ID: " + str(_sum[0]))
    print("Sum: " + str(_sum[1]))
    print("Count: " + str(counter[_sum[0]]))
    
    print("Keywords: ")
    print(lda.print_topic(int(_sum[0])))
    
    print("\n")

Topic ID: 74
Sum: 7.53662898629409
Count: 10
Keywords: 
0.046*"dance" + 0.042*"home" + 0.012*"follow" + 0.011*"dream" + 0.011*"sorry" + 0.010*"feel" + 0.010*"play" + 0.009*"yes" + 0.008*"heart" + 0.008*"eyes"


Topic ID: 83
Sum: 6.697580643187393
Count: 5
Keywords: 
0.015*"sorry" + 0.015*"news" + 0.013*"yes" + 0.013*"baby" + 0.011*"take" + 0.010*"end" + 0.010*"share" + 0.010*"night" + 0.008*"we're" + 0.008*"world"


Topic ID: 29
Sum: 6.409054710085911
Count: 7
Keywords: 
0.047*"home" + 0.044*"baby" + 0.033*"aite" + 0.015*"on!" + 0.014*"love" + 0.009*"hold" + 0.009*"feel" + 0.009*"domino" + 0.009*"hey!" + 0.008*"sorry"


Topic ID: 66
Sum: 6.377055175282294
Count: 7
Keywords: 
0.027*"sorry" + 0.021*"love" + 0.015*"heart" + 0.012*"surf" + 0.012*"home" + 0.010*"ring" + 0.009*"swipe" + 0.009*"baby" + 0.009*"happy" + 0.008*"world"


Topic ID: 57
Sum: 6.346622561148251
Count: 6
Keywords: 
0.084*"home" + 0.029*"ay" + 0.028*"ya" + 0.022*"cuz" + 0.017*"sky" + 0.015*"aya" + 0.013*"highway" + 0.01

### Emotion analysis
Emotions:  
fear
anger
anticipation
trust
surprise
positive
negative
sadness
disgust
joy

In [131]:
import numpy as np

In [132]:
emotions = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "negative",	
    "positive",
    "sadness",
    "surprise",	
    "trust",
]

In [133]:
df_nrc = pd.read_csv("NRC.txt", sep="\t", names=["word", "emotion", "amount"])

words_to_emotions = {}

for _, row in df_nrc.iterrows():
    words_to_emotions[row["word"]] = words_to_emotions.get(row["word"], np.zeros((10, 1)))
    words_to_emotions[row["word"]][emotions.index(row["emotion"])] = row["amount"]

In [134]:
df_emotion = df.copy()
for emotion in emotions:
    df_emotion[emotion] = 0

In [135]:
for cnt in range(len(df)):
    lyrics = df["lyrics"][cnt]

    if type(lyrics) != str:
        continue

    emotion_this = np.zeros((10, 1))

    for word in lyrics.split():
        if word in words_to_emotions:
            emotion_this += words_to_emotions[word]
            
    if np.linalg.norm(emotion_this) != 0:
        emotion_this /= np.linalg.norm(emotion_this)
        
    for emotion in emotions:
        df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emoti

In [136]:
df_emotion_global = df_emotion[df_emotion["grouping"].isin(global_group)]
mean_emotion_global = df_emotion_global[emotions].mean()
normalized_mean_emotion_global = mean_emotion_global / np.linalg.norm(mean_emotion_global)

df_emotion_kpop = df_emotion[df_emotion["grouping"].isin(kpop_group)]
mean_emotion_kpop = df_emotion_kpop[emotions].mean()
normalized_mean_emotion_kpop = mean_emotion_kpop / np.linalg.norm(mean_emotion_kpop)

df_emotion_target = df_emotion[df_emotion["grouping"] == target_group]
mean_emotion_target = df_emotion_target[emotions].mean()
normalized_mean_emotion_target = mean_emotion_target / np.linalg.norm(mean_emotion_target)

against global

In [137]:
emotion_percentage = (normalized_mean_emotion_target - normalized_mean_emotion_global) / normalized_mean_emotion_global * 100

emotion_percentage

anger          -54.343950
anticipation    24.733574
disgust        -42.846661
fear           -17.049931
joy              8.217464
negative       -30.594136
positive        21.500227
sadness        -27.956571
surprise       -14.356268
trust            7.200270
dtype: float64

against KPOP

In [138]:
emotion_percentage = (normalized_mean_emotion_target - normalized_mean_emotion_kpop) / normalized_mean_emotion_kpop * 100

emotion_percentage

anger          -38.906155
anticipation    14.745721
disgust        -21.165552
fear            -5.784689
joy             -4.007840
negative       -10.177817
positive         3.603543
sadness        -14.545104
surprise       -12.505004
trust           12.318912
dtype: float64