# Lyrics Analysis

In [166]:
lyrics_directory = "Lyrics_Data/"
tracks_csv = "tracks_group.csv"
stopwords_file = "stopwords.txt"

In [167]:
target_group = "bts"
kpop_group = set(['kpop', 'itzy', 'seventeen', 'bts', 'twice'])
global_group = set(['global', 'billboard'])

target_and_kpop = kpop_group.union(set([target_group]))
target_and_global = global_group.union(set([target_group]))

## Import data

In [168]:
import pandas as pd

In [169]:
df = pd.read_csv(tracks_csv)

df.head(3)

Unnamed: 0,track_name,track_id,grouping
0,Ice Cream (with Selena Gomez),2J4P46vCFm1rPkNkp9pZWX,kpop
1,Dynamite,0v1x6rN6JHRapa03JElljE,kpop
2,THE BADDEST,2V4Fx72svQRxrFvNT1eq5f,kpop


In [170]:
df.shape

(4618, 3)

In [171]:
df["grouping"].unique()

array(['kpop', 'itzy', 'seventeen', 'bts', 'twice', 'global', 'billboard'],
      dtype=object)

In [172]:
def get_lyrics(track_id):
    try:
        lyrics = ""
        with open(lyrics_directory + track_id + ".txt", "r") as f:
            lyrics = f.read()
            
        return lyrics            
    except:
        return None
    
df["lyrics"] = df["track_id"].apply(get_lyrics)

df.shape

(4618, 4)

In [173]:
# stop words
# https://www.kaggle.com/datasets/rowhitswami/stopwords/
stopwords = set()
with open(stopwords_file, "r") as f:
    stopwords = set(f.read().split("\n"))
    
stopwords

{',',
 '>',
 '?',
 'Just',
 'Yes',
 'a',
 'abaft',
 'abafter',
 'abaftest',
 'about',
 'abouter',
 'aboutest',
 'above',
 'abover',
 'abovest',
 'accordingly',
 'aer',
 'aest',
 'afore',
 'after',
 'afterer',
 'afterest',
 'afterward',
 'afterwards',
 'again',
 'against',
 'aid',
 'ain',
 'albeit',
 'all',
 'aller',
 'allest',
 'alls',
 'allyou',
 'almost',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'andor',
 'anear',
 'anent',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anywhere',
 'apart',
 'aparter',
 'apartest',
 'appear',
 'appeared',
 'appearing',
 'appears',
 'appropriate',
 'appropriated',
 'appropriater',
 'appropriates',
 'appropriatest',
 'appropriating',
 'are',
 'ares',
 'around',
 'as',
 'ases',
 'aside',
 'asides',
 'aslant',
 'astraddle',
 'astraddler',
 'astraddlest',
 'astride',
 'astrider',
 'astridest',
 'at',
 'athwart',
 'atop',
 'atween',
 'aught',
 'aught

## Analyze

In [174]:
def is_stopword(word):
    word = word.lower()
    
    if len(word) <= 1:
        return True
    
    # in the list
    if word in stopwords:
        return True
    
    # is Korean
    if ord("가") <= ord(word[0]) <= ord("힣"):
        return True
    
    return False

### Characteristic Keyword
Using TF-IDF

In [175]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Among global

In [176]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [177]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [178]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + tfidf_series[keyword]

In [179]:
def sort_dictionary(dictionary):
    return [(k, dictionary[k]) for k in sorted(dictionary, key=dictionary.get, reverse=True)]

In [180]:
sort_dictionary(counter)

[('home', 111),
 ('love', 99),
 ('let', 93),
 ('sorry', 92),
 ('news', 87),
 ('life', 76),
 ('night', 73),
 ('say', 66),
 ('world', 63),
 ('heart', 61),
 ('think', 59),
 ('eyes', 57),
 ('dream', 54),
 ('only', 53),
 ('am', 52),
 ('take', 52),
 ('yes', 51),
 ('new', 51),
 ('back', 49),
 ('baby', 49),
 ('still', 48),
 ('light', 47),
 ('keep', 46),
 ('follow', 45),
 ('contact', 41),
 ('room', 40),
 ('end', 40),
 ('man', 39),
 ('leave', 38),
 ('tell', 37),
 ('feel', 36),
 ('way', 33),
 ('need', 33),
 ('hand', 32),
 ('girl', 32),
 ('fly', 31),
 ('away', 31),
 ('first', 30),
 ('morning', 30),
 ('top', 29),
 ('body', 29),
 ('dance', 28),
 ('true', 28),
 ('people', 28),
 ('start', 28),
 ('little', 28),
 ('star', 27),
 ('hands', 27),
 ('name', 27),
 ('give', 27),
 ('live', 27),
 ('shut', 27),
 ('god', 26),
 ('sky', 26),
 ('call', 26),
 ('care', 26),
 ('change', 26),
 ('okay', 26),
 ('hear', 26),
 ('money', 26),
 ('sleep', 26),
 ('high', 26),
 ('boy', 25),
 ('crazy', 25),
 ('looking', 25),
 ('br

In [181]:
sort_dictionary(summer)

[('sorry', 18.545721208654804),
 ('home', 17.067671100778245),
 ('love', 16.735009466826803),
 ('news', 12.674093693227155),
 ('contact', 9.22333997829706),
 ('한국', 8.222223505021587),
 ('dream', 8.030096895193802),
 ('universe', 7.9083924155994865),
 ('let', 7.608030589112444),
 ('light', 7.356065623697072),
 ('world', 7.223579992437025),
 ('night', 7.066279767766307),
 ('follow', 6.95590814499894),
 ('life', 6.71919442829168),
 ('new', 5.96204864337862),
 ('room', 5.867874875362694),
 ('best', 5.506462222532698),
 ('yes', 5.477761858632744),
 ('only', 5.458156198309174),
 ('shut', 5.449540846527193),
 ('baby', 5.403275060796169),
 ('still', 5.351539751321282),
 ('hands', 5.300967017098819),
 ('girl', 5.259574326668171),
 ('heart', 5.035193009201872),
 ('fly', 4.985401266614601),
 ('crazy', 4.7854674590072825),
 ('dynamite', 4.677174820436611),
 ('eyes', 4.604735445913241),
 ('sick', 4.51720668625965),
 ('need', 4.496714381468557),
 ('say', 4.48611354935942),
 ('end', 4.48320034049854

#### Among KPOP

In [182]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [183]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [184]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + tfidf_series[keyword]

In [185]:
sort_dictionary(counter)

[('home', 110),
 ('love', 99),
 ('let', 93),
 ('sorry', 92),
 ('news', 87),
 ('life', 76),
 ('night', 73),
 ('say', 66),
 ('world', 63),
 ('heart', 61),
 ('think', 59),
 ('eyes', 57),
 ('dream', 54),
 ('only', 53),
 ('am', 52),
 ('take', 52),
 ('yes', 51),
 ('new', 51),
 ('back', 49),
 ('baby', 49),
 ('still', 48),
 ('light', 47),
 ('keep', 46),
 ('follow', 45),
 ('contact', 41),
 ('room', 40),
 ('end', 40),
 ('man', 39),
 ('leave', 38),
 ('tell', 37),
 ('feel', 36),
 ('way', 33),
 ('price_varies', 33),
 ('need', 33),
 ('hand', 32),
 ('girl', 32),
 ('fly', 31),
 ('away', 31),
 ('first', 30),
 ('morning', 30),
 ('top', 29),
 ('body', 29),
 ('dance', 28),
 ('true', 28),
 ('people', 28),
 ('start', 28),
 ('little', 28),
 ('star', 27),
 ('hands', 27),
 ('name', 27),
 ('give', 27),
 ('live', 27),
 ('shut', 27),
 ('god', 26),
 ('sky', 26),
 ('call', 26),
 ('care', 26),
 ('change', 26),
 ('hear', 26),
 ('money', 26),
 ('sleep', 26),
 ('high', 26),
 ('boy', 25),
 ('okay', 25),
 ('crazy', 25),


In [186]:
sort_dictionary(summer)

[('love', 17.244766793140478),
 ('home', 13.976033105135537),
 ('sorry', 11.214499143805613),
 ('let', 8.365441239057917),
 ('life', 7.722852882428214),
 ('night', 7.388498725011722),
 ('universe', 6.86082255138718),
 ('world', 6.732148348413589),
 ('only', 6.243375393825241),
 ('dream', 6.188214244365974),
 ('contact', 6.101288404283434),
 ('still', 6.026430835348571),
 ('news', 6.025426808817155),
 ('light', 5.982532756808491),
 ('girl', 5.958198828692089),
 ('best', 5.659946319195746),
 ('say', 5.510279668081929),
 ('yes', 5.466931898775716),
 ('crazy', 5.3143404884223635),
 ('need', 5.192483267449859),
 ('hands', 5.011287717289467),
 ('new', 5.008184068024992),
 ('baby', 4.8137220314116655),
 ('fly', 4.748502194692486),
 ('follow', 4.6920595313623235),
 ('think', 4.567508302573726),
 ('heart', 4.433581534579057),
 ('stay', 4.4185032561937945),
 ('invalid', 4.404981493203845),
 ('sick', 4.40122956611715),
 ('back', 4.374582098380377),
 ('room', 4.325917759871038),
 ('money', 4.32569

### Topic analysis

In [187]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [188]:
def pre_process(df_lyrics):
    # tokenize lyrics
    lyrics_processed = df_lyrics.fillna("").str.split()

    for _list in lyrics_processed:
        for cnt in range(len(_list)):
            _list[cnt] = _list[cnt].replace(".", "").replace(",", "").replace("?", "").replace("\\", "").replace("/", "").replace(":", "").lower()
            
            if is_stopword(_list[cnt]):
                _list[cnt] = ""
                
    # remove empty string
    for cnt in range(len(lyrics_processed)):
        while(True):
            try:
                lyrics_processed[cnt].remove("")
            except:
                break

    return lyrics_processed

#### Among global

In [189]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [190]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

all topics

In [191]:
# observe topics
topics = lda.print_topics()

#sort
topics = sorted(topics, key=lambda x: x[0])

topics

[(2,
  '0.058*"shut" + 0.034*"dance" + 0.026*"summer" + 0.012*"best" + 0.010*"mind" + 0.010*"late" + 0.010*"shit" + 0.010*"say" + 0.008*"feel" + 0.008*"let"'),
 (4,
  '0.560*"baby" + 0.015*"high" + 0.008*"girl" + 0.006*"mine" + 0.006*"wood" + 0.006*"love" + 0.005*"give" + 0.005*"take" + 0.005*"boy" + 0.004*"ruin"'),
 (8,
  '0.035*"am" + 0.027*"found" + 0.022*"love" + 0.018*"place" + 0.012*"dem" + 0.010*"bulletproof" + 0.009*"party" + 0.009*"hope" + 0.009*"baby" + 0.009*"came"'),
 (14,
  '0.037*"lights" + 0.018*"need" + 0.016*"save" + 0.015*"girl" + 0.010*"beautiful" + 0.009*"love" + 0.009*"let" + 0.009*"can\'t" + 0.009*"meet" + 0.009*"call"'),
 (27,
  '0.056*"take" + 0.019*"money" + 0.015*"child" + 0.012*"feel" + 0.012*"only" + 0.012*"hand" + 0.010*"hero" + 0.009*"lovers" + 0.008*"yes" + 0.008*"need"'),
 (29,
  '0.284*"hands" + 0.018*"smack" + 0.016*"lose" + 0.014*"air" + 0.011*"bouncin" + 0.010*"feel" + 0.009*"keep" + 0.008*"can\'t" + 0.008*"say" + 0.007*"down!"'),
 (36,
  '0.038*"gay

what topics are in kpop?

In [192]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + topic[1]

In [193]:
sort_dictionary(counter)

[(18, 10),
 (0, 8),
 (51, 7),
 (72, 7),
 (26, 6),
 (65, 6),
 (37, 6),
 (19, 6),
 (82, 6),
 (52, 6),
 (35, 5),
 (43, 5),
 (2, 5),
 (9, 5),
 (73, 5),
 (16, 5),
 (14, 5),
 (64, 4),
 (70, 4),
 (36, 4),
 (66, 4),
 (31, 4),
 (3, 4),
 (91, 4),
 (54, 4),
 (79, 4),
 (57, 4),
 (20, 3),
 (10, 3),
 (99, 3),
 (28, 3),
 (44, 3),
 (90, 3),
 (80, 2),
 (75, 2),
 (67, 2),
 (56, 2),
 (87, 2),
 (63, 2),
 (60, 2),
 (13, 2),
 (30, 2),
 (42, 2),
 (83, 2),
 (38, 2),
 (12, 2),
 (69, 2),
 (15, 2),
 (33, 1),
 (7, 1),
 (58, 1),
 (46, 1),
 (6, 1),
 (96, 1),
 (40, 1),
 (89, 1),
 (29, 1),
 (47, 1),
 (86, 1),
 (11, 1),
 (59, 1),
 (48, 1),
 (62, 1),
 (88, 1),
 (4, 1),
 (81, 1),
 (41, 1),
 (53, 1)]

In [194]:
sums = sort_dictionary(summer)

sums

[(0, 10.539613003154955),
 (51, 9.590236410302168),
 (18, 9.436605761300598),
 (16, 9.25922489441291),
 (19, 9.117982150164607),
 (70, 9.051968690382637),
 (72, 8.032022000254074),
 (37, 7.453786909827613),
 (82, 7.340890043964464),
 (38, 5.97267580794869),
 (3, 5.9543520168699615),
 (44, 5.840236877422285),
 (10, 5.696689243082801),
 (64, 5.618328994689364),
 (65, 5.555313035310974),
 (14, 5.3920224760368),
 (26, 5.357744881504914),
 (91, 5.237682380498882),
 (66, 5.163769721999415),
 (43, 5.078138128359569),
 (28, 5.022421317153203),
 (90, 4.847784736302856),
 (79, 4.8425809773289075),
 (99, 4.778095318721171),
 (36, 4.747859088256519),
 (35, 4.628715866361745),
 (9, 4.620393818193406),
 (57, 4.594529316836997),
 (20, 4.590112025816779),
 (31, 4.540639660397574),
 (2, 4.5030286026558315),
 (53, 4.1456288579138345),
 (41, 4.062658484835993),
 (12, 4.0558196979509376),
 (42, 3.797881005841191),
 (30, 3.793509921502846),
 (87, 3.741728667209827),
 (52, 3.702256228225451),
 (5, 3.6302720

In [195]:
for _sum in sums[:10]:
    print("Topic ID: " + str(_sum[0]))
    print("Sum: " + str(_sum[1]))
    print("Count: " + str(counter[_sum[0]]))
    
    print("Keywords: ")
    print(lda.print_topic(int(_sum[0])))
    
    print("\n")

Topic ID: 0
Sum: 10.539613003154955
Count: 8
Keywords: 
0.123*"ка" + 0.113*"love" + 0.021*"let" + 0.020*"crazy" + 0.019*"free" + 0.019*"god" + 0.017*"sorry" + 0.013*"river" + 0.013*"baby" + 0.013*"feel"


Topic ID: 51
Sum: 9.590236410302168
Count: 7
Keywords: 
0.041*"man" + 0.025*"dynamite" + 0.024*"night" + 0.018*"tell" + 0.018*"life" + 0.017*"stars" + 0.017*"fun" + 0.015*"light" + 0.013*"soul" + 0.013*"shine"


Topic ID: 18
Sum: 9.436605761300598
Count: 10
Keywords: 
0.051*"home" + 0.025*"let" + 0.016*"only" + 0.015*"made" + 0.014*"take" + 0.012*"back" + 0.012*"feel" + 0.012*"miss" + 0.011*"world" + 0.009*"give"


Topic ID: 16
Sum: 9.25922489441291
Count: 5
Keywords: 
0.029*"sorry" + 0.011*"gonna" + 0.011*"say" + 0.009*"happy" + 0.009*"share" + 0.009*"call" + 0.008*"follow" + 0.008*"bitch" + 0.008*"ruins" + 0.008*"life"


Topic ID: 19
Sum: 9.117982150164607
Count: 6
Keywords: 
0.042*"mom" + 0.019*"life" + 0.017*"night" + 0.013*"can't" + 0.011*"read" + 0.011*"say" + 0.010*"need" + 0.0

#### Among KPOP

In [196]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [197]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

# observe topics
topics = lda.print_topics()

In [198]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + topic[1]


In [199]:
sort_dictionary(counter)

[(39, 11),
 (11, 9),
 (43, 9),
 (13, 9),
 (27, 7),
 (47, 7),
 (25, 7),
 (68, 6),
 (49, 6),
 (71, 6),
 (33, 6),
 (63, 6),
 (7, 5),
 (9, 5),
 (74, 5),
 (44, 5),
 (94, 5),
 (35, 5),
 (80, 4),
 (82, 4),
 (64, 4),
 (45, 4),
 (77, 4),
 (4, 4),
 (93, 4),
 (70, 4),
 (48, 4),
 (98, 3),
 (1, 3),
 (54, 3),
 (30, 3),
 (12, 3),
 (65, 3),
 (22, 3),
 (16, 3),
 (2, 3),
 (52, 3),
 (50, 3),
 (31, 3),
 (37, 3),
 (73, 3),
 (24, 3),
 (78, 3),
 (92, 3),
 (86, 3),
 (53, 3),
 (87, 2),
 (58, 2),
 (90, 2),
 (17, 2),
 (41, 2),
 (55, 2),
 (57, 2),
 (3, 2),
 (20, 2),
 (14, 2),
 (6, 2),
 (66, 2),
 (96, 2),
 (23, 2),
 (62, 2),
 (59, 2),
 (56, 2),
 (40, 2),
 (67, 2),
 (99, 2),
 (18, 2),
 (76, 2),
 (61, 2),
 (88, 2),
 (32, 1),
 (60, 1),
 (21, 1),
 (97, 1),
 (26, 1),
 (85, 1),
 (28, 1),
 (95, 1),
 (34, 1),
 (79, 1),
 (46, 1),
 (36, 1),
 (29, 1),
 (51, 1),
 (72, 1),
 (89, 1),
 (42, 1),
 (15, 1),
 (0, 1),
 (10, 1),
 (83, 1)]

In [200]:

sums = sort_dictionary(summer)

sums

[(39, 10.590388212396192),
 (43, 10.226062877667118),
 (13, 8.702815052015922),
 (11, 8.42180312973096),
 (47, 8.242370234616374),
 (27, 7.805400506811111),
 (33, 7.073885554611024),
 (25, 6.776861816786095),
 (63, 6.745957672806526),
 (68, 6.243004713858227),
 (71, 5.917397642253036),
 (74, 5.502150196436105),
 (64, 5.466802343500149),
 (7, 5.420621966058661),
 (9, 5.414081115522549),
 (44, 5.41121331385466),
 (49, 5.10561738293427),
 (82, 4.938799704622852),
 (4, 4.774629389070014),
 (93, 4.711782197260163),
 (80, 4.67599624704053),
 (35, 4.662511898917273),
 (48, 4.546309347536862),
 (65, 4.526477746843284),
 (78, 4.40763925953388),
 (15, 4.377807715041399),
 (45, 4.354294603693688),
 (70, 4.3279790359115395),
 (94, 4.269568286882986),
 (52, 4.140826935322821),
 (22, 4.1239170462426955),
 (54, 4.068138297352107),
 (24, 3.9609030125898244),
 (12, 3.941841478029801),
 (98, 3.9320084021014736),
 (53, 3.9233381063199886),
 (92, 3.687610355376364),
 (50, 3.6196921721234503),
 (16, 3.5573

In [201]:
#sort
topics = sorted(topics, key=lambda x: x[0])


for _sum in sums[:10]:
    print("Topic ID: " + str(_sum[0]))
    print("Sum: " + str(_sum[1]))
    print("Count: " + str(counter[_sum[0]]))
    
    print("Keywords: ")
    print(lda.print_topic(int(_sum[0])))
    
    print("\n")

Topic ID: 39
Sum: 10.590388212396192
Count: 11
Keywords: 
0.110*"ка" + 0.068*"home" + 0.052*"love" + 0.033*"segreta" + 0.016*"sorry" + 0.012*"contact" + 0.012*"crazy" + 0.010*"summer" + 0.009*"broom" + 0.008*"bang"


Topic ID: 43
Sum: 10.226062877667118
Count: 9
Keywords: 
0.032*"dynamite" + 0.024*"light" + 0.020*"night" + 0.018*"life" + 0.018*"soul" + 0.018*"city" + 0.017*"yes" + 0.016*"shine" + 0.016*"fun" + 0.015*"tonight"


Topic ID: 13
Sum: 8.702815052015922
Count: 9
Keywords: 
0.058*"love" + 0.015*"need" + 0.015*"girl" + 0.014*"baby" + 0.014*"crazy" + 0.012*"give" + 0.012*"say" + 0.011*"break" + 0.011*"eyes" + 0.010*"night"


Topic ID: 11
Sum: 8.42180312973096
Count: 9
Keywords: 
0.028*"live" + 0.019*"am" + 0.011*"mic" + 0.010*"hands" + 0.010*"let" + 0.009*"show" + 0.009*"side" + 0.007*"yes" + 0.007*"light" + 0.006*"say"


Topic ID: 47
Sum: 8.242370234616374
Count: 7
Keywords: 
0.040*"love" + 0.029*"baby" + 0.019*"mariachi" + 0.018*"el" + 0.011*"price_min}}" + 0.011*"compare_at_p

### Emotion analysis
Emotions:  
fear
anger
anticipation
trust
surprise
positive
negative
sadness
disgust
joy

In [202]:
import numpy as np

In [203]:
emotions = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "negative",	
    "positive",
    "sadness",
    "surprise",	
    "trust",
]

In [204]:
df_nrc = pd.read_csv("NRC.txt", sep="\t", names=["word", "emotion", "amount"])

words_to_emotions = {}

for _, row in df_nrc.iterrows():
    words_to_emotions[row["word"]] = words_to_emotions.get(row["word"], np.zeros((10, 1)))
    words_to_emotions[row["word"]][emotions.index(row["emotion"])] = row["amount"]

In [205]:
df_emotion = df.copy()
for emotion in emotions:
    df_emotion[emotion] = 0

In [206]:
for cnt in range(len(df)):
    lyrics = df["lyrics"][cnt]

    if type(lyrics) != str:
        continue

    emotion_this = np.zeros((10, 1))

    for word in lyrics.split():
        if word in words_to_emotions:
            emotion_this += words_to_emotions[word]
            
    if np.linalg.norm(emotion_this) != 0:
        emotion_this /= np.linalg.norm(emotion_this)
        
    for emotion in emotions:
        df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emoti

In [207]:
df_emotion_global = df_emotion[df_emotion["grouping"].isin(global_group)]
mean_emotion_global = df_emotion_global[emotions].mean()
normalized_mean_emotion_global = mean_emotion_global / np.linalg.norm(mean_emotion_global)

df_emotion_kpop = df_emotion[df_emotion["grouping"].isin(kpop_group)]
mean_emotion_kpop = df_emotion_kpop[emotions].mean()
normalized_mean_emotion_kpop = mean_emotion_kpop / np.linalg.norm(mean_emotion_kpop)

df_emotion_target = df_emotion[df_emotion["grouping"] == target_group]
mean_emotion_target = df_emotion_target[emotions].mean()
normalized_mean_emotion_target = mean_emotion_target / np.linalg.norm(mean_emotion_target)

In [210]:
normalized_mean_emotion_target

anger           0.154527
anticipation    0.313939
disgust         0.105527
fear            0.223917
joy             0.363832
negative        0.366232
positive        0.625078
sadness         0.246532
surprise        0.162838
trust           0.267905
dtype: float64

against global

In [208]:
emotion_percentage = (normalized_mean_emotion_target - normalized_mean_emotion_global) / normalized_mean_emotion_global * 100

emotion_percentage

anger          -26.862864
anticipation     6.185337
disgust        -22.754799
fear            -5.103215
joy              5.472281
negative       -21.122329
positive        16.974613
sadness         -4.229430
surprise        -4.397312
trust           -4.203301
dtype: float64

against KPOP

In [209]:
emotion_percentage = (normalized_mean_emotion_target - normalized_mean_emotion_kpop) / normalized_mean_emotion_kpop * 100

emotion_percentage

anger           -2.132820
anticipation    -2.317294
disgust          6.548154
fear             7.784481
joy             -6.442900
negative         2.080201
positive        -0.255459
sadness         13.599037
surprise        -2.330777
trust            0.370840
dtype: float64