# Lyrics Analysis

In [1]:
lyrics_directory = "Lyrics_Data/"
tracks_csv = "tracks_group.csv"
stopwords_file = "stopwords.txt"

In [2]:
target_group = "twice"
kpop_group = set(['kpop', 'itzy', 'seventeen', 'bts', 'twice'])
global_group = set(['global', 'billboard'])

target_and_kpop = kpop_group.union(set([target_group]))
target_and_global = global_group.union(set([target_group]))

## Import data

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv(tracks_csv)

df.head(3)

Unnamed: 0,track_name,track_id,grouping
0,Ice Cream (with Selena Gomez),2J4P46vCFm1rPkNkp9pZWX,kpop
1,Dynamite,0v1x6rN6JHRapa03JElljE,kpop
2,THE BADDEST,2V4Fx72svQRxrFvNT1eq5f,kpop


In [5]:
df.shape

(4618, 3)

In [6]:
df["grouping"].unique()

array(['kpop', 'itzy', 'seventeen', 'bts', 'twice', 'global', 'billboard'],
      dtype=object)

In [7]:
def get_lyrics(track_id):
    try:
        lyrics = ""
        with open(lyrics_directory + track_id + ".txt", "r") as f:
            lyrics = f.read()
            
        return lyrics            
    except:
        return None
    
df["lyrics"] = df["track_id"].apply(get_lyrics)

df.shape

(4618, 4)

In [8]:
# stop words
# https://www.kaggle.com/datasets/rowhitswami/stopwords/
stopwords = set()
with open(stopwords_file, "r") as f:
    stopwords = set(f.read().split("\n"))
    
stopwords

{',',
 '>',
 '?',
 'Just',
 'Yes',
 'a',
 'abaft',
 'abafter',
 'abaftest',
 'about',
 'abouter',
 'aboutest',
 'above',
 'abover',
 'abovest',
 'accordingly',
 'aer',
 'aest',
 'afore',
 'after',
 'afterer',
 'afterest',
 'afterward',
 'afterwards',
 'again',
 'against',
 'aid',
 'ain',
 'albeit',
 'all',
 'aller',
 'allest',
 'alls',
 'allyou',
 'almost',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'andor',
 'anear',
 'anent',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anywhere',
 'apart',
 'aparter',
 'apartest',
 'appear',
 'appeared',
 'appearing',
 'appears',
 'appropriate',
 'appropriated',
 'appropriater',
 'appropriates',
 'appropriatest',
 'appropriating',
 'are',
 'ares',
 'around',
 'as',
 'ases',
 'aside',
 'asides',
 'aslant',
 'astraddle',
 'astraddler',
 'astraddlest',
 'astride',
 'astrider',
 'astridest',
 'at',
 'athwart',
 'atop',
 'atween',
 'aught',
 'aught

## Analyze

In [9]:
def is_stopword(word):
    word = word.lower()
    
    if len(word) <= 1:
        return True
    
    # in the list
    if word in stopwords:
        return True
    
    # is Korean
    if ord("가") <= ord(word[0]) <= ord("힣"):
        return True
    
    return False

### Characteristic Keyword
Using TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Among global

In [11]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [12]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [13]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + tfidf_series[keyword]

In [14]:
def sort_dictionary(dictionary):
    return [(k, dictionary[k]) for k in sorted(dictionary, key=dictionary.get, reverse=True)]

In [15]:
sort_dictionary(counter)

[('love', 81),
 ('let', 71),
 ('heart', 64),
 ('baby', 62),
 ('home', 56),
 ('yes', 51),
 ('feel', 50),
 ('sorry', 47),
 ('say', 44),
 ('think', 43),
 ('eyes', 42),
 ('need', 37),
 ('news', 34),
 ('take', 31),
 ('tell', 30),
 ('honey', 30),
 ('back', 30),
 ('child', 29),
 ('dream', 28),
 ('only', 26),
 ('am', 26),
 ('light', 25),
 ('ah', 24),
 ('world', 24),
 ('feeling', 24),
 ('night', 23),
 ('life', 22),
 ('happy', 22),
 ('share', 21),
 ('sweet', 20),
 ('follow', 20),
 ('show', 20),
 ('mind', 19),
 ('ready', 19),
 ('wait', 19),
 ('easy', 18),
 ('room', 18),
 ('end', 18),
 ('okay', 18),
 ('boy', 17),
 ('forget', 17),
 ('meet', 17),
 ('give', 17),
 ('way', 17),
 ('hear', 17),
 ('new', 17),
 ('ooh', 16),
 ('away', 16),
 ('keep', 16),
 ('true', 16),
 ('words', 16),
 ('smile', 16),
 ('shut', 16),
 ('dance', 15),
 ('care', 15),
 ('down', 15),
 ('tonight', 15),
 ('girl', 15),
 ('mine', 15),
 ('start', 15),
 ('little', 15),
 ('beautiful', 14),
 ('made', 14),
 ('move', 14),
 ('wow', 14),
 ('c

In [16]:
sort_dictionary(summer)

[('home', 13.931992319453727),
 ('love', 12.824083976293252),
 ('baby', 9.069787994701606),
 ('heart', 8.167630007034115),
 ('sorry', 6.87206765201956),
 ('let', 6.820567467417053),
 ('ah', 6.3710435563246595),
 ('yes', 6.039825610017484),
 ('sweet', 5.587227836906947),
 ('eyes', 5.571798184274636),
 ('child', 5.425195621935116),
 ('news', 5.1852820453526896),
 ('follow', 4.728578440242536),
 ('한국', 4.710007613924276),
 ('say', 4.670350696880509),
 ('back', 4.565068654238137),
 ('honey', 4.508644267693422),
 ('happy', 4.355925899456217),
 ('shut', 4.147360523112134),
 ('share', 3.828051149884993),
 ('boy', 3.823601342392143),
 ('ooh', 3.812661929570221),
 ('feel', 3.709714642563397),
 ('dream', 3.6839441198522556),
 ('only', 3.5378947428270893),
 ('room', 3.4250275121800344),
 ('think', 3.2927475359754466),
 ('dance', 3.170758416935763),
 ('easy', 3.125976679431446),
 ('give', 3.114537772389982),
 ('need', 3.1020697366848804),
 ('waiting', 2.9401781953729174),
 ('tell', 2.6353964829377

#### Among KPOP

In [17]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [18]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [19]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + tfidf_series[keyword]

In [20]:
sort_dictionary(counter)

[('love', 81),
 ('let', 71),
 ('heart', 64),
 ('baby', 62),
 ('home', 56),
 ('yes', 51),
 ('feel', 50),
 ('sorry', 47),
 ('say', 44),
 ('think', 43),
 ('eyes', 42),
 ('need', 37),
 ('news', 34),
 ('take', 31),
 ('tell', 30),
 ('honey', 30),
 ('back', 30),
 ('child', 29),
 ('dream', 28),
 ('only', 26),
 ('am', 26),
 ('light', 25),
 ('contact', 25),
 ('ah', 24),
 ('world', 24),
 ('feeling', 24),
 ('night', 23),
 ('life', 22),
 ('happy', 22),
 ('share', 21),
 ('sweet', 20),
 ('follow', 20),
 ('show', 20),
 ('mind', 19),
 ('ready', 19),
 ('wait', 19),
 ('easy', 18),
 ('room', 18),
 ('end', 18),
 ('okay', 18),
 ('boy', 17),
 ('forget', 17),
 ('meet', 17),
 ('give', 17),
 ('way', 17),
 ('hear', 17),
 ('new', 17),
 ('ooh', 16),
 ('away', 16),
 ('keep', 16),
 ('true', 16),
 ('words', 16),
 ('smile', 16),
 ('shut', 16),
 ('dance', 15),
 ('care', 15),
 ('down', 15),
 ('tonight', 15),
 ('girl', 15),
 ('informazioni', 15),
 ('mine', 15),
 ('start', 15),
 ('한국어', 15),
 ('little', 15),
 ('beautiful'

In [21]:
sort_dictionary(summer)

[('love', 13.065723016326086),
 ('home', 12.046380918451089),
 ('baby', 9.37089980241579),
 ('heart', 7.39317226970636),
 ('let', 7.0242306392414395),
 ('yes', 6.590925241101974),
 ('ah', 5.6111757294054305),
 ('say', 5.35123036052182),
 ('child', 5.136613226283855),
 ('honey', 4.9609555244080585),
 ('eyes', 4.937091926832534),
 ('back', 4.787605550619838),
 ('sweet', 4.783702989497009),
 ('feel', 4.408003272188201),
 ('boy', 3.994288137930682),
 ('sorry', 3.884341745531181),
 ('only', 3.8213452721812167),
 ('happy', 3.7722986475492406),
 ('need', 3.7701262583412736),
 ('ooh', 3.7484341269486787),
 ('think', 3.6914551045215114),
 ('give', 3.6140214682608116),
 ('share', 3.6140009224389864),
 ('follow', 3.4620043174998534),
 ('shut', 3.1821969977209306),
 ('tell', 3.1245862358348635),
 ('한국', 3.0492493517826613),
 ('dance', 2.891446458906176),
 ('night', 2.865801978193554),
 ('dream', 2.83933489657322),
 ('easy', 2.814198716896),
 ('waiting', 2.7886617134179796),
 ('take', 2.72871228385

### Topic analysis

In [22]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [23]:
def pre_process(df_lyrics):
    # tokenize lyrics
    lyrics_processed = df_lyrics.fillna("").str.split()

    for _list in lyrics_processed:
        for cnt in range(len(_list)):
            _list[cnt] = _list[cnt].replace(".", "").replace(",", "").replace("?", "").replace("\\", "").replace("/", "").replace(":", "").lower()
            
            if is_stopword(_list[cnt]):
                _list[cnt] = ""
                
    # remove empty string
    for cnt in range(len(lyrics_processed)):
        while(True):
            try:
                lyrics_processed[cnt].remove("")
            except:
                break

    return lyrics_processed

#### Among global

In [24]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [25]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

all topics

In [26]:
# observe topics
topics = lda.print_topics()

#sort
topics = sorted(topics, key=lambda x: x[0])

topics

[(0,
  '0.043*"dark" + 0.019*"angel" + 0.014*"girl" + 0.012*"watch" + 0.011*"price" + 0.010*"top" + 0.008*"give" + 0.007*"young" + 0.007*"that\'s" + 0.007*"didn\'t"'),
 (13,
  '0.042*"hope" + 0.041*"hell" + 0.036*"gives" + 0.026*"trouble" + 0.015*"goodbye" + 0.013*"hard" + 0.012*"love" + 0.012*"face" + 0.011*"take" + 0.011*"music"'),
 (15,
  '0.049*"mom" + 0.021*"take" + 0.015*"we\'re" + 0.014*"let" + 0.012*"love" + 0.009*"favorite" + 0.009*"show" + 0.009*"back" + 0.009*"girl" + 0.009*"healthy"'),
 (16,
  '0.755*"baby" + 0.005*"path" + 0.005*"mine" + 0.004*"born" + 0.004*"honey" + 0.003*"girl" + 0.003*"love" + 0.003*"child" + 0.003*"give" + 0.003*"whistle"'),
 (17,
  '0.024*"shit" + 0.017*"take" + 0.011*"pump" + 0.009*"man" + 0.008*"faster" + 0.008*"side" + 0.008*"run" + 0.007*"tell" + 0.007*"children" + 0.007*"fuck"'),
 (22,
  '0.074*"take" + 0.033*"child" + 0.013*"love" + 0.012*"night" + 0.012*"give" + 0.010*"can\'t" + 0.010*"tonight" + 0.008*"let" + 0.007*"life" + 0.007*"heart"'),
 

what topics are in kpop?

In [27]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + topic[1]

In [28]:
sort_dictionary(counter)

[(80, 11),
 (94, 8),
 (45, 8),
 (19, 7),
 (48, 6),
 (97, 5),
 (85, 5),
 (72, 5),
 (49, 4),
 (69, 4),
 (39, 4),
 (40, 4),
 (5, 4),
 (8, 4),
 (53, 3),
 (17, 3),
 (79, 3),
 (20, 3),
 (7, 3),
 (99, 3),
 (70, 3),
 (12, 3),
 (46, 3),
 (35, 3),
 (78, 3),
 (91, 3),
 (34, 3),
 (21, 2),
 (55, 2),
 (76, 2),
 (32, 2),
 (9, 2),
 (44, 2),
 (37, 2),
 (47, 2),
 (82, 2),
 (92, 2),
 (56, 1),
 (25, 1),
 (16, 1),
 (87, 1),
 (66, 1),
 (33, 1),
 (14, 1),
 (90, 1),
 (51, 1),
 (23, 1),
 (98, 1),
 (61, 1),
 (75, 1),
 (95, 1),
 (28, 1),
 (38, 1),
 (81, 1),
 (58, 1),
 (1, 1),
 (26, 1),
 (22, 1),
 (29, 1),
 (24, 1),
 (93, 1),
 (42, 1)]

In [29]:
sums = sort_dictionary(summer)

sums

[(19, 8.166971907663537),
 (80, 7.550452483319532),
 (94, 7.435437804157118),
 (45, 5.938356536727952),
 (48, 4.865549346613989),
 (72, 4.490339883301203),
 (85, 4.148054550303641),
 (97, 4.108048974208941),
 (5, 4.073523869745259),
 (35, 4.0555546056893945),
 (91, 3.7945783710347314),
 (16, 3.6145420969842235),
 (70, 3.598703955318342),
 (79, 3.5986704157694476),
 (39, 3.2722417120348837),
 (49, 3.165925742341642),
 (34, 3.1599980818245967),
 (47, 3.0700158316067245),
 (69, 2.9677795464631345),
 (58, 2.9591705232669483),
 (55, 2.9398950534050527),
 (12, 2.9151327682448027),
 (46, 2.9071674678816635),
 (8, 2.884509376002825),
 (40, 2.8534378640833893),
 (7, 2.7507329383442993),
 (78, 2.7305682314727164),
 (99, 2.678751478968479),
 (53, 2.619225138299953),
 (21, 2.6045198023821285),
 (61, 2.4205597791842592),
 (82, 2.3489605125832895),
 (22, 2.205834266722377),
 (9, 2.196245675728278),
 (11, 2.137910536090203),
 (20, 2.133425811476627),
 (17, 2.1155015032309166),
 (44, 2.0934453419286),

In [30]:
for _sum in sums[:10]:
    print("Topic ID: " + str(_sum[0]))
    print("Sum: " + str(_sum[1]))
    print("Count: " + str(counter[_sum[0]]))
    
    print("Keywords: ")
    print(lda.print_topic(int(_sum[0])))
    
    print("\n")

Topic ID: 19
Sum: 8.166971907663537
Count: 7
Keywords: 
0.073*"eyes" + 0.038*"love" + 0.024*"head" + 0.022*"that's" + 0.013*"need" + 0.013*"dark" + 0.011*"mind" + 0.010*"still" + 0.008*"baby" + 0.008*"home"


Topic ID: 80
Sum: 7.550452483319532
Count: 11
Keywords: 
0.216*"home" + 0.040*"wood" + 0.031*"let's" + 0.028*"fight" + 0.024*"due" + 0.012*"ah" + 0.011*"boy" + 0.011*"fighting" + 0.010*"tell" + 0.009*"coming"


Topic ID: 94
Sum: 7.435437804157118
Count: 8
Keywords: 
0.154*"love" + 0.021*"yes" + 0.011*"back" + 0.010*"am" + 0.010*"harder" + 0.010*"heart" + 0.009*"can't" + 0.008*"(hey" + 0.008*"weather" + 0.007*""i'm"


Topic ID: 45
Sum: 5.938356536727952
Count: 8
Keywords: 
0.063*"love" + 0.024*"child" + 0.021*"say" + 0.017*"save" + 0.013*"we're" + 0.012*"back" + 0.011*"let's" + 0.011*"crazy" + 0.009*"take" + 0.009*"tell"


Topic ID: 48
Sum: 4.865549346613989
Count: 6
Keywords: 
0.067*"love" + 0.051*"tell" + 0.024*"feel" + 0.019*"need" + 0.018*"heart" + 0.013*"boom" + 0.012*"makes" 

#### Among KPOP

In [31]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [32]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

# observe topics
topics = lda.print_topics()

In [33]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + topic[1]


In [34]:
sort_dictionary(counter)

[(52, 10),
 (82, 8),
 (45, 7),
 (19, 6),
 (77, 5),
 (64, 5),
 (8, 5),
 (11, 4),
 (84, 4),
 (30, 4),
 (43, 4),
 (37, 4),
 (88, 4),
 (99, 4),
 (70, 3),
 (74, 3),
 (14, 3),
 (80, 3),
 (96, 3),
 (1, 3),
 (24, 3),
 (58, 3),
 (32, 3),
 (50, 3),
 (4, 3),
 (7, 3),
 (23, 3),
 (35, 3),
 (2, 3),
 (66, 3),
 (21, 3),
 (55, 3),
 (67, 2),
 (9, 2),
 (10, 2),
 (16, 2),
 (42, 2),
 (33, 2),
 (86, 2),
 (47, 2),
 (75, 2),
 (53, 2),
 (39, 2),
 (28, 2),
 (93, 2),
 (56, 2),
 (72, 2),
 (48, 2),
 (18, 2),
 (51, 2),
 (73, 2),
 (22, 2),
 (36, 1),
 (68, 1),
 (17, 1),
 (46, 1),
 (61, 1),
 (29, 1),
 (92, 1),
 (60, 1),
 (91, 1),
 (98, 1),
 (15, 1),
 (79, 1),
 (78, 1),
 (90, 1),
 (81, 1),
 (89, 1),
 (69, 1),
 (41, 1),
 (34, 1),
 (5, 1),
 (94, 1),
 (83, 1),
 (12, 1),
 (27, 1),
 (26, 1),
 (40, 1),
 (25, 1),
 (3, 1),
 (31, 1),
 (13, 1),
 (71, 1)]

In [35]:

sums = sort_dictionary(summer)

sums

[(52, 7.240168708787678),
 (82, 6.776887156527664),
 (45, 6.477135047251068),
 (19, 5.963832112116506),
 (99, 4.7448053010957665),
 (64, 4.604169552134408),
 (43, 4.322644414052775),
 (11, 4.079502209886414),
 (77, 4.068373620932107),
 (30, 3.993538882183202),
 (88, 3.93412766851543),
 (37, 3.7766542891404242),
 (8, 3.611024180736422),
 (55, 3.2129494829496252),
 (14, 3.1796080969652394),
 (66, 3.115330631204415),
 (4, 3.1080099621322006),
 (74, 3.102896617434453),
 (7, 3.0413123158796225),
 (1, 3.0395726625138195),
 (21, 2.947988038577023),
 (70, 2.9274457674109726),
 (84, 2.9203729239234235),
 (50, 2.9119362708370318),
 (86, 2.795081112300977),
 (96, 2.791288207306934),
 (23, 2.6784209264806123),
 (58, 2.6447397363663185),
 (18, 2.567767292923236),
 (51, 2.472201469106949),
 (2, 2.459248151462816),
 (32, 2.371997409463802),
 (10, 2.3588481071274146),
 (24, 2.3401019500597613),
 (35, 2.316811681848776),
 (80, 2.2546123584616),
 (33, 2.248938216936949),
 (72, 2.2312323650621693),
 (56,

In [36]:
#sort
topics = sorted(topics, key=lambda x: x[0])


for _sum in sums[:10]:
    print("Topic ID: " + str(_sum[0]))
    print("Sum: " + str(_sum[1]))
    print("Count: " + str(counter[_sum[0]]))
    
    print("Keywords: ")
    print(lda.print_topic(int(_sum[0])))
    
    print("\n")

Topic ID: 52
Sum: 7.240168708787678
Count: 10
Keywords: 
0.270*"home" + 0.019*"shut" + 0.011*"baby" + 0.010*"illegally" + 0.009*"comment" + 0.009*"chiudi" + 0.008*"call" + 0.007*"toy" + 0.007*"sorry" + 0.007*"ooh"


Topic ID: 82
Sum: 6.776887156527664
Count: 8
Keywords: 
0.127*"love" + 0.022*"sorry" + 0.015*"need" + 0.010*"(i" + 0.010*"hate" + 0.009*"seoul" + 0.007*"baby" + 0.007*"heart" + 0.006*"news" + 0.006*"loving"


Topic ID: 45
Sum: 6.477135047251068
Count: 7
Keywords: 
0.027*"only" + 0.023*"love" + 0.023*"ah" + 0.011*"girlfriend" + 0.010*"home" + 0.009*"gno" + 0.008*"news" + 0.008*"can't" + 0.008*"talk" + 0.008*"heart"


Topic ID: 19
Sum: 5.963832112116506
Count: 6
Keywords: 
0.036*"stay" + 0.026*"love" + 0.026*"can't" + 0.018*"honey" + 0.016*"call" + 0.011*"touch" + 0.011*"sorry" + 0.011*"life" + 0.010*"night" + 0.010*"heart"


Topic ID: 99
Sum: 4.7448053010957665
Count: 4
Keywords: 
0.034*"heart" + 0.023*"home" + 0.014*"spicy" + 0.013*"follow" + 0.012*"think" + 0.011*"sweet" +

### Emotion analysis
Emotions:  
fear
anger
anticipation
trust
surprise
positive
negative
sadness
disgust
joy

In [37]:
import numpy as np

In [38]:
emotions = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "negative",	
    "positive",
    "sadness",
    "surprise",	
    "trust",
]

In [39]:
df_nrc = pd.read_csv("NRC.txt", sep="\t", names=["word", "emotion", "amount"])

words_to_emotions = {}

for _, row in df_nrc.iterrows():
    words_to_emotions[row["word"]] = words_to_emotions.get(row["word"], np.zeros((10, 1)))
    words_to_emotions[row["word"]][emotions.index(row["emotion"])] = row["amount"]

In [40]:
df_emotion = df.copy()
for emotion in emotions:
    df_emotion[emotion] = 0

In [41]:
for cnt in range(len(df)):
    lyrics = df["lyrics"][cnt]

    if type(lyrics) != str:
        continue

    emotion_this = np.zeros((10, 1))

    for word in lyrics.split():
        if word in words_to_emotions:
            emotion_this += words_to_emotions[word]
            
    if np.linalg.norm(emotion_this) != 0:
        emotion_this /= np.linalg.norm(emotion_this)
        
    for emotion in emotions:
        df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emoti

In [42]:
df_emotion_global = df_emotion[df_emotion["grouping"].isin(global_group)]
mean_emotion_global = df_emotion_global[emotions].mean()
normalized_mean_emotion_global = mean_emotion_global / np.linalg.norm(mean_emotion_global)

df_emotion_kpop = df_emotion[df_emotion["grouping"].isin(kpop_group)]
mean_emotion_kpop = df_emotion_kpop[emotions].mean()
normalized_mean_emotion_kpop = mean_emotion_kpop / np.linalg.norm(mean_emotion_kpop)

df_emotion_target = df_emotion[df_emotion["grouping"] == target_group]
mean_emotion_target = df_emotion_target[emotions].mean()
normalized_mean_emotion_target = mean_emotion_target / np.linalg.norm(mean_emotion_target)

In [None]:
normalized_mean_emotion_target

against global

In [43]:
emotion_percentage = (normalized_mean_emotion_target - normalized_mean_emotion_global) / normalized_mean_emotion_global * 100

emotion_percentage

anger          -35.259049
anticipation    16.726185
disgust        -47.480227
fear           -39.513619
joy             33.185952
negative       -37.102611
positive        22.373141
sadness        -41.493078
surprise        -2.994104
trust          -10.454797
dtype: float64

against KPOP

In [44]:
emotion_percentage = (normalized_mean_emotion_target - normalized_mean_emotion_kpop) / normalized_mean_emotion_kpop * 100

emotion_percentage

anger          -13.368028
anticipation     7.379512
disgust        -27.556860
fear           -31.299115
joy             18.139963
negative       -18.600815
positive         4.347879
sadness        -30.601541
surprise        -0.897238
trust           -6.179156
dtype: float64