# Lyrics Analysis

In [45]:
lyrics_directory = "Lyrics_Data/"
tracks_csv = "tracks_group.csv"
stopwords_file = "stopwords.txt"

In [46]:
target_group = "itzy"
kpop_group = set(['kpop', 'itzy', 'seventeen', 'bts', 'twice'])
global_group = set(['global', 'billboard'])

target_and_kpop = kpop_group.union(set([target_group]))
target_and_global = global_group.union(set([target_group]))

## Import data

In [47]:
import pandas as pd

In [48]:
df = pd.read_csv(tracks_csv)

df.head(3)

Unnamed: 0,track_name,track_id,grouping
0,Ice Cream (with Selena Gomez),2J4P46vCFm1rPkNkp9pZWX,kpop
1,Dynamite,0v1x6rN6JHRapa03JElljE,kpop
2,THE BADDEST,2V4Fx72svQRxrFvNT1eq5f,kpop


In [49]:
df.shape

(4618, 3)

In [50]:
df["grouping"].unique()

array(['kpop', 'itzy', 'seventeen', 'bts', 'twice', 'global', 'billboard'],
      dtype=object)

In [51]:
def get_lyrics(track_id):
    try:
        lyrics = ""
        with open(lyrics_directory + track_id + ".txt", "r") as f:
            lyrics = f.read()
            
        return lyrics            
    except:
        return None
    
df["lyrics"] = df["track_id"].apply(get_lyrics)

df.shape

(4618, 4)

In [52]:
# stop words
# https://www.kaggle.com/datasets/rowhitswami/stopwords/
stopwords = set()
with open(stopwords_file, "r") as f:
    stopwords = set(f.read().split("\n"))
    
stopwords

{',',
 '>',
 '?',
 'Just',
 'Yes',
 'a',
 'abaft',
 'abafter',
 'abaftest',
 'about',
 'abouter',
 'aboutest',
 'above',
 'abover',
 'abovest',
 'accordingly',
 'aer',
 'aest',
 'afore',
 'after',
 'afterer',
 'afterest',
 'afterward',
 'afterwards',
 'again',
 'against',
 'aid',
 'ain',
 'albeit',
 'all',
 'aller',
 'allest',
 'alls',
 'allyou',
 'almost',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'andor',
 'anear',
 'anent',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anywhere',
 'apart',
 'aparter',
 'apartest',
 'appear',
 'appeared',
 'appearing',
 'appears',
 'appropriate',
 'appropriated',
 'appropriater',
 'appropriates',
 'appropriatest',
 'appropriating',
 'are',
 'ares',
 'around',
 'as',
 'ases',
 'aside',
 'asides',
 'aslant',
 'astraddle',
 'astraddler',
 'astraddlest',
 'astride',
 'astrider',
 'astridest',
 'at',
 'athwart',
 'atop',
 'atween',
 'aught',
 'aught

## Analyze

In [53]:
def is_stopword(word):
    word = word.lower()
    
    if len(word) <= 1:
        return True
    
    # in the list
    if word in stopwords:
        return True
    
    # is Korean
    if ord("가") <= ord(word[0]) <= ord("힣"):
        return True
    
    return False

### Characteristic Keyword
Using TF-IDF

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Among global

In [55]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [56]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [57]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + tfidf_series[keyword]

In [58]:
def sort_dictionary(dictionary):
    return [(k, dictionary[k]) for k in sorted(dictionary, key=dictionary.get, reverse=True)]

In [59]:
sort_dictionary(counter)

[('sorry', 23),
 ('love', 22),
 ('mind', 22),
 ('yes', 21),
 ('let', 20),
 ('say', 19),
 ('tell', 14),
 ('feel', 14),
 ('baby', 13),
 ('down', 13),
 ('heart', 13),
 ('keep', 13),
 ('take', 12),
 ('care', 12),
 ('back', 12),
 ('bad', 11),
 ('eyes', 11),
 ('matter', 11),
 ('crazy', 11),
 ('think', 11),
 ('show', 10),
 ('control', 10),
 ('need', 10),
 ('room', 10),
 ('ah', 10),
 ('am', 9),
 ('turn', 9),
 ('put', 9),
 ('little', 9),
 ('forget', 9),
 ('way', 9),
 ('home', 9),
 ('only', 8),
 ('hear', 8),
 ('coming', 8),
 ('girl', 8),
 ('game', 8),
 ('doesn', 7),
 ('face', 7),
 ('honey', 7),
 ('ready', 7),
 ('break', 7),
 ('dance', 7),
 ('worry', 7),
 ('okay', 7),
 ('stay', 7),
 ('follow', 7),
 ('play', 7),
 ('head', 7),
 ('high', 7),
 ('hit', 7),
 ('light', 7),
 ('world', 7),
 ('cuz', 6),
 ('give', 6),
 ('child', 6),
 ('fight', 6),
 ('kill', 6),
 ('feeling', 6),
 ('uh', 6),
 ('boy', 6),
 ('didn', 6),
 ('start', 6),
 ('still', 6),
 ('top', 6),
 ('new', 6),
 ('free', 6),
 ('end', 6),
 ('lost',

In [60]:
sort_dictionary(summer)

[('love', 3.2111590137486092),
 ('sorry', 3.1624247485449186),
 ('trust', 2.3862535651729093),
 ('hot', 2.352824882534737),
 ('woo', 2.1142086197441174),
 ('mind', 2.1046574503957345),
 ('put', 2.020884450575801),
 ('flower', 1.980310452818118),
 ('em', 1.9626333700933452),
 ('shy', 1.9226858042564654),
 ('ready', 1.9188426805354042),
 ('worry', 1.837092103431801),
 ('yes', 1.8106564463544794),
 ('business', 1.8082325752768063),
 ('fall', 1.8011038824731416),
 ('heart', 1.7928345825331622),
 ('care', 1.7165829596424889),
 ('chi', 1.7042387770576761),
 ('keep', 1.6902832699626618),
 ('taking', 1.608198525916642),
 ('matter', 1.5818529324911066),
 ('home', 1.5356698179913768),
 ('finally', 1.5236500520493004),
 ('crazy', 1.510161642126581),
 ('think', 1.505804405173682),
 ('tomorrow', 1.4769442397116785),
 ('ah', 1.4542463159967527),
 ('words', 1.4389624091552111),
 ('control', 1.411335330255775),
 ('hit', 1.366162842085624),
 ('break', 1.3357711651378272),
 ('stronger', 1.28558964433042

#### Among KPOP

In [61]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [62]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [63]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + tfidf_series[keyword]

In [64]:
sort_dictionary(counter)

[('sorry', 23),
 ('love', 22),
 ('mind', 22),
 ('yes', 21),
 ('let', 20),
 ('say', 19),
 ('tell', 14),
 ('feel', 14),
 ('baby', 13),
 ('down', 13),
 ('heart', 13),
 ('keep', 13),
 ('take', 12),
 ('care', 12),
 ('back', 12),
 ('bad', 11),
 ('eyes', 11),
 ('matter', 11),
 ('crazy', 11),
 ('think', 11),
 ('show', 10),
 ('control', 10),
 ('need', 10),
 ('room', 10),
 ('ah', 10),
 ('am', 9),
 ('turn', 9),
 ('put', 9),
 ('little', 9),
 ('forget', 9),
 ('way', 9),
 ('home', 9),
 ('only', 8),
 ('hear', 8),
 ('coming', 8),
 ('girl', 8),
 ('game', 8),
 ('doesn', 7),
 ('face', 7),
 ('honey', 7),
 ('ready', 7),
 ('한국', 7),
 ('break', 7),
 ('dance', 7),
 ('worry', 7),
 ('okay', 7),
 ('stay', 7),
 ('follow', 7),
 ('play', 7),
 ('head', 7),
 ('high', 7),
 ('informazioni', 7),
 ('hit', 7),
 ('light', 7),
 ('world', 7),
 ('cuz', 6),
 ('give', 6),
 ('territory', 6),
 ('child', 6),
 ('fight', 6),
 ('kill', 6),
 ('feeling', 6),
 ('uh', 6),
 ('boy', 6),
 ('didn', 6),
 ('start', 6),
 ('still', 6),
 ('top', 

In [65]:
sort_dictionary(summer)

[('love', 3.7295123013654305),
 ('put', 2.4826616351462447),
 ('trust', 2.4324877675268377),
 ('hot', 2.3126303347836705),
 ('keep', 2.0253074558606508),
 ('woo', 1.9559731310647535),
 ('flower', 1.9553951154721705),
 ('crazy', 1.9351914543271769),
 ('fall', 1.896361950718139),
 ('worry', 1.889137009235887),
 ('mind', 1.8725577824369803),
 ('shy', 1.8625770823631649),
 ('business', 1.858239331562063),
 ('blah', 1.7916503324920754),
 ('한국', 1.7395776915919268),
 ('yes', 1.6725149283019323),
 ('em', 1.6580317150045465),
 ('control', 1.6275816088166775),
 ('sorry', 1.6181170837152725),
 ('think', 1.6029053675806249),
 ('hit', 1.5946636358702568),
 ('heart', 1.4564988011678022),
 ('down', 1.4448309833777575),
 ('ah', 1.4297295710406557),
 ('ready', 1.4238208197606785),
 ('say', 1.3873513044732313),
 ('matter', 1.3653434369426067),
 ('bad', 1.348649525140968),
 ('only', 1.3440981678289643),
 ('home', 1.3414986661490813),
 ('break', 1.3410441116279574),
 ('tell', 1.3296895882414261),
 ('care

### Topic analysis

In [66]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [67]:
def pre_process(df_lyrics):
    # tokenize lyrics
    lyrics_processed = df_lyrics.fillna("").str.split()

    for _list in lyrics_processed:
        for cnt in range(len(_list)):
            _list[cnt] = _list[cnt].replace(".", "").replace(",", "").replace("?", "").replace("\\", "").replace("/", "").replace(":", "").lower()
            
            if is_stopword(_list[cnt]):
                _list[cnt] = ""
                
    # remove empty string
    for cnt in range(len(lyrics_processed)):
        while(True):
            try:
                lyrics_processed[cnt].remove("")
            except:
                break

    return lyrics_processed

#### Among global

In [68]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [69]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

all topics

In [70]:
# observe topics
topics = lda.print_topics()

#sort
topics = sorted(topics, key=lambda x: x[0])

topics

[(0,
  '0.016*"thomas" + 0.015*"blame" + 0.014*"only" + 0.011*"we\'re" + 0.011*"wild" + 0.009*"show" + 0.009*"home" + 0.009*"way" + 0.008*"coast" + 0.008*"heard"'),
 (5,
  '0.050*"yes" + 0.037*"yes)" + 0.026*"(yes" + 0.010*"(oh" + 0.009*"big" + 0.009*"take" + 0.008*"put" + 0.008*"ice" + 0.008*"hit" + 0.008*"afford"'),
 (6,
  '0.175*"hot" + 0.152*"boom" + 0.015*"moon" + 0.014*"feel" + 0.013*"super" + 0.012*"bass" + 0.012*"let" + 0.011*"beat" + 0.011*"heart" + 0.011*"badoom"'),
 (12,
  '0.023*"night" + 0.019*"tonight" + 0.017*"gonna" + 0.012*"down" + 0.010*"need" + 0.009*"doesn\'t" + 0.009*"back" + 0.009*"feeling" + 0.008*"won\'t" + 0.008*"fuck"'),
 (17,
  '0.049*"days" + 0.021*"au" + 0.016*"world" + 0.014*"live" + 0.014*"life" + 0.013*"ay" + 0.011*"forget" + 0.010*"whole" + 0.010*"stars" + 0.009*"comment"'),
 (27,
  '0.095*"night" + 0.025*"stay" + 0.021*"give" + 0.016*"love" + 0.014*"baby" + 0.012*"body" + 0.012*"wait" + 0.012*"using" + 0.011*"tell" + 0.009*"shake"'),
 (46,
  '0.057*"pa

what topics are in kpop?

In [71]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + topic[1]

In [72]:
sort_dictionary(counter)

[(59, 5),
 (86, 5),
 (4, 4),
 (97, 4),
 (66, 4),
 (22, 3),
 (70, 3),
 (47, 2),
 (84, 2),
 (5, 2),
 (68, 2),
 (46, 2),
 (6, 2),
 (35, 2),
 (13, 2),
 (60, 1),
 (74, 1),
 (33, 1),
 (30, 1),
 (16, 1),
 (34, 1),
 (2, 1),
 (51, 1),
 (44, 1),
 (56, 1),
 (91, 1),
 (65, 1),
 (82, 1),
 (20, 1),
 (79, 1),
 (76, 1),
 (23, 1),
 (96, 1),
 (90, 1),
 (53, 1),
 (48, 1),
 (61, 1)]

In [73]:
sums = sort_dictionary(summer)

sums

[(59, 4.512735463149056),
 (97, 3.944243160878159),
 (4, 2.994692469868369),
 (22, 2.692562343735972),
 (84, 2.63210539285501),
 (66, 2.5235062028164066),
 (70, 2.2990323281424025),
 (86, 2.2887698512654424),
 (35, 1.997911156164264),
 (47, 1.9781220668751303),
 (6, 1.9403113902949372),
 (68, 1.795709789855664),
 (13, 1.527295966553993),
 (48, 1.4889150881876958),
 (50, 1.4577170473726255),
 (44, 1.4229239939681975),
 (46, 1.3273079181758476),
 (53, 1.3204882611330504),
 (2, 1.2473758960709347),
 (90, 1.166908381632311),
 (33, 1.146149509442239),
 (16, 1.1343055928023205),
 (74, 1.0740235815915185),
 (30, 1.0525938835794477),
 (24, 1.005193038233756),
 (34, 0.9965037902293261),
 (45, 0.9614960229168901),
 (60, 0.9608792458325297),
 (61, 0.9606184965900866),
 (79, 0.9551347514639019),
 (56, 0.9491253938917907),
 (65, 0.9310305006160888),
 (76, 0.8918766943984338),
 (5, 0.8847776774869089),
 (62, 0.8259017975374263),
 (18, 0.8150276261235376),
 (20, 0.8123921676101418),
 (83, 0.811953138

In [74]:
for _sum in sums[:10]:
    print("Topic ID: " + str(_sum[0]))
    print("Sum: " + str(_sum[1]))
    print("Count: " + str(counter[_sum[0]]))
    
    print("Keywords: ")
    print(lda.print_topic(int(_sum[0])))
    
    print("\n")

Topic ID: 59
Sum: 4.512735463149056
Count: 5
Keywords: 
0.050*"love" + 0.019*"let" + 0.018*"down" + 0.013*"need" + 0.011*"pain" + 0.011*"bum" + 0.010*"say" + 0.010*"am" + 0.009*"think" + 0.008*"animals"


Topic ID: 97
Sum: 3.944243160878159
Count: 4
Keywords: 
0.035*"put" + 0.022*"let" + 0.020*"love" + 0.019*"burn" + 0.019*"'em" + 0.018*"only" + 0.014*"fire" + 0.014*"way" + 0.013*"sneakers" + 0.012*"burning"


Topic ID: 4
Sum: 2.994692469868369
Count: 4
Keywords: 
0.111*"love" + 0.026*"city" + 0.011*"give" + 0.009*"need" + 0.009*"sweat" + 0.009*"baby" + 0.009*"bad" + 0.007*"new" + 0.007*"let" + 0.006*"care"


Topic ID: 22
Sum: 2.692562343735972
Count: 3
Keywords: 
0.047*"money" + 0.018*"need" + 0.017*"trouble" + 0.015*"take" + 0.014*"love" + 0.013*"cheap" + 0.012*"girls" + 0.010*"floor" + 0.009*"chi't" + 0.009*"put"


Topic ID: 84
Sum: 2.63210539285501
Count: 2
Keywords: 
0.036*"feel" + 0.026*"boy" + 0.016*"fall" + 0.016*"say" + 0.015*"world" + 0.013*"am" + 0.011*"yes" + 0.010*"bad" + 

#### Among KPOP

In [75]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [76]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

# observe topics
topics = lda.print_topics()

In [77]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + topic[1]


In [78]:
sort_dictionary(counter)

[(50, 3),
 (63, 3),
 (96, 3),
 (15, 3),
 (17, 3),
 (51, 3),
 (53, 2),
 (27, 2),
 (72, 2),
 (90, 2),
 (38, 2),
 (28, 2),
 (68, 2),
 (10, 2),
 (94, 2),
 (26, 2),
 (54, 2),
 (9, 2),
 (75, 1),
 (60, 1),
 (84, 1),
 (78, 1),
 (2, 1),
 (20, 1),
 (89, 1),
 (36, 1),
 (35, 1),
 (19, 1),
 (49, 1),
 (87, 1),
 (85, 1),
 (32, 1),
 (29, 1),
 (57, 1),
 (55, 1),
 (66, 1),
 (62, 1),
 (46, 1),
 (92, 1),
 (45, 1),
 (0, 1),
 (1, 1),
 (42, 1),
 (91, 1),
 (44, 1),
 (93, 1),
 (76, 1)]

In [79]:

sums = sort_dictionary(summer)

sums

[(50, 3.1390510509590968),
 (15, 3.1072143638739362),
 (96, 2.7721774761084816),
 (17, 2.70866052223937),
 (51, 2.7061603413822013),
 (63, 2.4648404962754284),
 (27, 2.362981822770962),
 (28, 2.236646977100463),
 (10, 2.200532875394856),
 (54, 2.184186406644585),
 (94, 2.1739320299384417),
 (9, 2.1650073500568396),
 (26, 2.1516168937741895),
 (90, 2.138767337957688),
 (68, 2.1043124205389176),
 (72, 2.077097546462028),
 (53, 1.9188817676258623),
 (38, 1.677948069984268),
 (62, 1.362512590159895),
 (2, 1.355615980202856),
 (55, 1.3002933354946435),
 (0, 1.2877544766743085),
 (44, 1.2838157463993412),
 (76, 1.2789568275620695),
 (20, 1.2157330702393665),
 (1, 1.2058594677582732),
 (45, 1.178809797806025),
 (57, 1.1768193036878074),
 (49, 1.1764467446191702),
 (60, 1.172816701917327),
 (75, 1.171063000081631),
 (35, 1.1709904512390494),
 (84, 1.1688688269568956),
 (91, 1.1685266116983257),
 (93, 1.166235866570787),
 (46, 1.1653110312254285),
 (85, 1.1596026158731547),
 (19, 1.147135648752

In [80]:
#sort
topics = sorted(topics, key=lambda x: x[0])


for _sum in sums[:10]:
    print("Topic ID: " + str(_sum[0]))
    print("Sum: " + str(_sum[1]))
    print("Count: " + str(counter[_sum[0]]))
    
    print("Keywords: ")
    print(lda.print_topic(int(_sum[0])))
    
    print("\n")

Topic ID: 50
Sum: 3.1390510509590968
Count: 3
Keywords: 
0.026*"love" + 0.020*"id" + 0.017*"loco" + 0.016*"heart" + 0.011*"taking" + 0.011*"tell" + 0.010*"back" + 0.009*"say" + 0.008*"sorry" + 0.008*"youth"


Topic ID: 15
Sum: 3.1072143638739362
Count: 3
Keywords: 
0.112*"drunk" + 0.024*"call" + 0.020*"home" + 0.019*"back" + 0.017*"fall" + 0.010*"give" + 0.010*"boy" + 0.007*"il" + 0.006*"yes" + 0.006*"beep"


Topic ID: 96
Sum: 2.7721774761084816
Count: 3
Keywords: 
0.037*"non" + 0.016*"chi't" + 0.015*"fake" + 0.015*"yes" + 0.015*"surf" + 0.012*"say" + 0.011*"worry" + 0.011*"let" + 0.010*"oh!" + 0.009*"eyes"


Topic ID: 17
Sum: 2.70866052223937
Count: 3
Keywords: 
0.029*"back" + 0.022*"baby" + 0.016*"loca" + 0.015*"put" + 0.015*"sorry" + 0.013*"tt" + 0.013*"follow" + 0.012*"can't" + 0.010*"love" + 0.008*"tears"


Topic ID: 51
Sum: 2.7061603413822013
Count: 3
Keywords: 
0.077*"oriente" + 0.020*"sorry" + 0.014*"heart" + 0.011*"rose" + 0.009*"vie" + 0.009*"think" + 0.009*"bum" + 0.009*"lov

### Emotion analysis
Emotions:  
fear
anger
anticipation
trust
surprise
positive
negative
sadness
disgust
joy

In [81]:
import numpy as np

In [82]:
emotions = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "negative",	
    "positive",
    "sadness",
    "surprise",	
    "trust",
]

In [83]:
df_nrc = pd.read_csv("NRC.txt", sep="\t", names=["word", "emotion", "amount"])

words_to_emotions = {}

for _, row in df_nrc.iterrows():
    words_to_emotions[row["word"]] = words_to_emotions.get(row["word"], np.zeros((10, 1)))
    words_to_emotions[row["word"]][emotions.index(row["emotion"])] = row["amount"]

In [84]:
df_emotion = df.copy()
for emotion in emotions:
    df_emotion[emotion] = 0

In [85]:
for cnt in range(len(df)):
    lyrics = df["lyrics"][cnt]

    if type(lyrics) != str:
        continue

    emotion_this = np.zeros((10, 1))

    for word in lyrics.split():
        if word in words_to_emotions:
            emotion_this += words_to_emotions[word]
            
    if np.linalg.norm(emotion_this) != 0:
        emotion_this /= np.linalg.norm(emotion_this)
        
    for emotion in emotions:
        df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emoti

In [86]:
df_emotion_global = df_emotion[df_emotion["grouping"].isin(global_group)]
mean_emotion_global = df_emotion_global[emotions].mean()
normalized_mean_emotion_global = mean_emotion_global / np.linalg.norm(mean_emotion_global)

df_emotion_kpop = df_emotion[df_emotion["grouping"].isin(kpop_group)]
mean_emotion_kpop = df_emotion_kpop[emotions].mean()
normalized_mean_emotion_kpop = mean_emotion_kpop / np.linalg.norm(mean_emotion_kpop)

df_emotion_target = df_emotion[df_emotion["grouping"] == target_group]
mean_emotion_target = df_emotion_target[emotions].mean()
normalized_mean_emotion_target = mean_emotion_target / np.linalg.norm(mean_emotion_target)

In [None]:
normalized_mean_emotion_target

against global

In [87]:
emotion_percentage = (normalized_mean_emotion_target - normalized_mean_emotion_global) / normalized_mean_emotion_global * 100

emotion_percentage

anger           49.766256
anticipation    -9.279161
disgust         -7.783042
fear            -0.174828
joy             -1.692845
negative         2.711875
positive        -3.011005
sadness        -13.678178
surprise       -17.529765
trust           -1.146931
dtype: float64

against KPOP

In [88]:
emotion_percentage = (normalized_mean_emotion_target - normalized_mean_emotion_kpop) / normalized_mean_emotion_kpop * 100

emotion_percentage

anger           100.407097
anticipation    -16.543496
disgust          27.199445
fear             13.382179
joy             -12.798583
negative         32.925437
positive        -17.297245
sadness           2.391328
surprise        -15.747100
trust             3.573146
dtype: float64