# Lyrics Analysis

In [1]:
lyrics_directory = "Lyrics_Data/"
tracks_csv = "tracks_group.csv"
stopwords_file = "stopwords.txt"

In [2]:
target_group = "bts"
kpop_group = set(['kpop', 'itzy', 'seventeen', 'bts', 'twice'])
global_group = set(['global', 'billboard'])

target_and_kpop = kpop_group.union(set([target_group]))
target_and_global = global_group.union(set([target_group]))

## Import data

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv(tracks_csv)

df.head(3)

Unnamed: 0,track_name,track_id,grouping
0,Ice Cream (with Selena Gomez),2J4P46vCFm1rPkNkp9pZWX,kpop
1,Dynamite,0v1x6rN6JHRapa03JElljE,kpop
2,THE BADDEST,2V4Fx72svQRxrFvNT1eq5f,kpop


In [5]:
df.shape

(4618, 3)

In [6]:
df["grouping"].unique()

array(['kpop', 'itzy', 'seventeen', 'bts', 'twice', 'global', 'billboard'],
      dtype=object)

In [7]:
def get_lyrics(track_id):
    try:
        lyrics = ""
        with open(lyrics_directory + track_id + ".txt", "r") as f:
            lyrics = f.read()
            
        return lyrics            
    except:
        return None
    
df["lyrics"] = df["track_id"].apply(get_lyrics)

df.shape

(4618, 4)

In [8]:
# stop words
# https://www.kaggle.com/datasets/rowhitswami/stopwords/
stopwords = set()
with open(stopwords_file, "r") as f:
    stopwords = set(f.read().split("\n"))
    
stopwords

{',',
 '>',
 '?',
 'Just',
 'Yes',
 'a',
 'abaft',
 'abafter',
 'abaftest',
 'about',
 'abouter',
 'aboutest',
 'above',
 'abover',
 'abovest',
 'accordingly',
 'aer',
 'aest',
 'afore',
 'after',
 'afterer',
 'afterest',
 'afterward',
 'afterwards',
 'again',
 'against',
 'aid',
 'ain',
 'albeit',
 'all',
 'aller',
 'allest',
 'alls',
 'allyou',
 'almost',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'andor',
 'anear',
 'anent',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anywhere',
 'apart',
 'aparter',
 'apartest',
 'appear',
 'appeared',
 'appearing',
 'appears',
 'appropriate',
 'appropriated',
 'appropriater',
 'appropriates',
 'appropriatest',
 'appropriating',
 'are',
 'ares',
 'around',
 'as',
 'ases',
 'aside',
 'asides',
 'aslant',
 'astraddle',
 'astraddler',
 'astraddlest',
 'astride',
 'astrider',
 'astridest',
 'at',
 'athwart',
 'atop',
 'atween',
 'aught',
 'aught

## Analyze

In [9]:
def is_stopword(word):
    word = word.lower()
    
    if len(word) <= 1:
        return True
    
    # in the list
    if word in stopwords:
        return True
    
    # is Korean
    if ord("가") <= ord(word[0]) <= ord("힣"):
        return True
    
    return False

### Characteristic Keyword
Using TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Among global

In [11]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [12]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [13]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + float(tfidf_series[keyword])

In [14]:
def sort_dictionary(dictionary):
    return [(k, dictionary[k]) for k in sorted(dictionary, key=dictionary.get, reverse=True)]

In [15]:
tfidf_counter_global = sort_dictionary(counter)
tfidf_counter_global

[('home', 111),
 ('love', 99),
 ('let', 93),
 ('sorry', 92),
 ('news', 87),
 ('life', 76),
 ('night', 73),
 ('say', 66),
 ('world', 63),
 ('heart', 61),
 ('think', 59),
 ('eyes', 57),
 ('dream', 54),
 ('only', 53),
 ('am', 52),
 ('take', 52),
 ('yes', 51),
 ('new', 51),
 ('back', 49),
 ('baby', 49),
 ('still', 48),
 ('light', 47),
 ('keep', 46),
 ('follow', 45),
 ('contact', 41),
 ('room', 40),
 ('end', 40),
 ('man', 39),
 ('leave', 38),
 ('tell', 37),
 ('feel', 36),
 ('way', 33),
 ('need', 33),
 ('hand', 32),
 ('girl', 32),
 ('fly', 31),
 ('away', 31),
 ('first', 30),
 ('morning', 30),
 ('top', 29),
 ('body', 29),
 ('dance', 28),
 ('true', 28),
 ('people', 28),
 ('start', 28),
 ('little', 28),
 ('star', 27),
 ('hands', 27),
 ('name', 27),
 ('give', 27),
 ('live', 27),
 ('shut', 27),
 ('god', 26),
 ('sky', 26),
 ('call', 26),
 ('care', 26),
 ('change', 26),
 ('okay', 26),
 ('hear', 26),
 ('money', 26),
 ('sleep', 26),
 ('high', 26),
 ('boy', 25),
 ('crazy', 25),
 ('looking', 25),
 ('br

In [16]:
tfidf_summer_global = sort_dictionary(summer)
tfidf_summer_global

[('sorry', 18.545721208654804),
 ('home', 17.067671100778245),
 ('love', 16.735009466826803),
 ('news', 12.674093693227155),
 ('contact', 9.22333997829706),
 ('한국', 8.222223505021587),
 ('dream', 8.030096895193802),
 ('universe', 7.9083924155994865),
 ('let', 7.608030589112444),
 ('light', 7.356065623697072),
 ('world', 7.223579992437025),
 ('night', 7.066279767766307),
 ('follow', 6.95590814499894),
 ('life', 6.71919442829168),
 ('new', 5.96204864337862),
 ('room', 5.867874875362694),
 ('best', 5.506462222532698),
 ('yes', 5.477761858632744),
 ('only', 5.458156198309174),
 ('shut', 5.449540846527193),
 ('baby', 5.403275060796169),
 ('still', 5.351539751321282),
 ('hands', 5.300967017098819),
 ('girl', 5.259574326668171),
 ('heart', 5.035193009201872),
 ('fly', 4.985401266614601),
 ('crazy', 4.7854674590072825),
 ('dynamite', 4.677174820436611),
 ('eyes', 4.604735445913241),
 ('sick', 4.51720668625965),
 ('need', 4.496714381468557),
 ('say', 4.48611354935942),
 ('end', 4.48320034049854

#### Among KPOP

In [17]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [18]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [19]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + float(tfidf_series[keyword])

In [20]:
tfidf_counter_kpop = sort_dictionary(counter)
tfidf_counter_kpop

[('home', 110),
 ('love', 99),
 ('let', 93),
 ('sorry', 92),
 ('news', 87),
 ('life', 76),
 ('night', 73),
 ('say', 66),
 ('world', 63),
 ('heart', 61),
 ('think', 59),
 ('eyes', 57),
 ('dream', 54),
 ('only', 53),
 ('am', 52),
 ('take', 52),
 ('yes', 51),
 ('new', 51),
 ('back', 49),
 ('baby', 49),
 ('still', 48),
 ('light', 47),
 ('keep', 46),
 ('follow', 45),
 ('contact', 41),
 ('room', 40),
 ('end', 40),
 ('man', 39),
 ('leave', 38),
 ('tell', 37),
 ('feel', 36),
 ('way', 33),
 ('price_varies', 33),
 ('need', 33),
 ('hand', 32),
 ('girl', 32),
 ('fly', 31),
 ('away', 31),
 ('first', 30),
 ('morning', 30),
 ('top', 29),
 ('body', 29),
 ('dance', 28),
 ('true', 28),
 ('people', 28),
 ('start', 28),
 ('little', 28),
 ('star', 27),
 ('hands', 27),
 ('name', 27),
 ('give', 27),
 ('live', 27),
 ('shut', 27),
 ('god', 26),
 ('sky', 26),
 ('call', 26),
 ('care', 26),
 ('change', 26),
 ('hear', 26),
 ('money', 26),
 ('sleep', 26),
 ('high', 26),
 ('boy', 25),
 ('okay', 25),
 ('crazy', 25),


In [21]:
tfidf_summer_kpop = sort_dictionary(summer)
tfidf_summer_kpop

[('love', 17.244766793140478),
 ('home', 13.976033105135537),
 ('sorry', 11.214499143805613),
 ('let', 8.365441239057917),
 ('life', 7.722852882428214),
 ('night', 7.388498725011722),
 ('universe', 6.86082255138718),
 ('world', 6.732148348413589),
 ('only', 6.243375393825241),
 ('dream', 6.188214244365974),
 ('contact', 6.101288404283434),
 ('still', 6.026430835348571),
 ('news', 6.025426808817155),
 ('light', 5.982532756808491),
 ('girl', 5.958198828692089),
 ('best', 5.659946319195746),
 ('say', 5.510279668081929),
 ('yes', 5.466931898775716),
 ('crazy', 5.3143404884223635),
 ('need', 5.192483267449859),
 ('hands', 5.011287717289467),
 ('new', 5.008184068024992),
 ('baby', 4.8137220314116655),
 ('fly', 4.748502194692486),
 ('follow', 4.6920595313623235),
 ('think', 4.567508302573726),
 ('heart', 4.433581534579057),
 ('stay', 4.4185032561937945),
 ('invalid', 4.404981493203845),
 ('sick', 4.40122956611715),
 ('back', 4.374582098380377),
 ('room', 4.325917759871038),
 ('money', 4.32569

### Topic analysis

In [22]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [23]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
def save_lda_vis(lda, corpus, dictionary, filename):
    vis = gensimvis.prepare(lda, corpus, dictionary)
    pyLDAvis.save_html(vis, filename)
    
    with open(filename, "r") as f:
        html = f.read()
        
    return html, vis

In [24]:
def pre_process(df_lyrics):
    # tokenize lyrics
    lyrics_processed = df_lyrics.fillna("").str.split()

    for _list in lyrics_processed:
        for cnt in range(len(_list)):
            _list[cnt] = _list[cnt].replace(".", "").replace(",", "").replace("?", "").replace("\\", "").replace("/", "").replace(":", "").lower()
            
            if is_stopword(_list[cnt]):
                _list[cnt] = ""
                
    # remove empty string
    for cnt in range(len(lyrics_processed)):
        while(True):
            try:
                lyrics_processed[cnt].remove("")
            except:
                break

    return lyrics_processed

#### Among global

In [25]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [26]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

all topics

In [27]:
lda_html_global, vis = save_lda_vis(lda, corpus, dictionary, "lda.html")

vis

PreparedData(topic_coordinates=              x         y  topics  cluster      Freq
topic                                               
53     0.063595 -0.002480       1        1  4.475778
11     0.106444 -0.016141       2        1  3.586331
30     0.041679 -0.003085       3        1  3.117786
94     0.078308 -0.011432       4        1  2.804937
39     0.087175 -0.019344       5        1  2.748802
...         ...       ...     ...      ...       ...
80    -0.121646  0.026426      96        1  0.308542
68    -0.068283  0.027250      97        1  0.292647
63    -0.109372  0.004994      98        1  0.266035
20    -0.093669  0.031602      99        1  0.261396
93    -0.079813  0.026090     100        1  0.256449

[100 rows x 5 columns], topic_info=        Term         Freq        Total  Category  logprob  loglift
4889  labour  4755.000000  4755.000000   Default  30.0000  30.0000
580    drunk  2579.000000  2579.000000   Default  29.0000  29.0000
49      baby  4869.000000  4869.000000   De

what topics are in kpop?

In [28]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + float(topic[1])

In [29]:
_lda_counter_global = sort_dictionary(counter)
lda_counter_global = []
for tup in _lda_counter_global:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_counter_global.append(new_tup)
    
lda_counter_global

[[39,
  12,
  [('love', 0.123812795),
   ('girl', 0.015177706),
   ('house', 0.012945831),
   ('found', 0.011447903),
   ('let', 0.011130232),
   ('place', 0.010730769),
   ('way', 0.00803331),
   ('need', 0.007980324),
   ('think', 0.007934016),
   ('yes', 0.007901667)]],
 [11,
  9,
  [('love', 0.0671956),
   ('say', 0.017090758),
   ("can't", 0.014592582),
   ('give', 0.013192182),
   ('need', 0.013008415),
   ('heart', 0.010491185),
   ('home', 0.010278336),
   ('leave', 0.009304878),
   ('alone', 0.00852356),
   ('find', 0.0072796983)]],
 [49,
  9,
  [('home', 0.10985859),
   ('matter', 0.016954737),
   ('say', 0.016389411),
   ('due', 0.013163471),
   ('sorry', 0.012800555),
   ("doesn't", 0.0116091035),
   ('friday', 0.009734606),
   ('pump', 0.009410147),
   ('back', 0.0089087235),
   ('walk', 0.008548106)]],
 [82,
  8,
  [('best', 0.023363052),
   ('stay', 0.018019058),
   ("let's", 0.015872207),
   ('crazy', 0.014617654),
   ('give', 0.011174909),
   ('fine', 0.010864386),
   

In [30]:
_lda_summer_global = sort_dictionary(summer)
lda_summer_global = []
for tup in _lda_summer_global:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_summer_global.append(new_tup)
    
lda_summer_global

[[49,
  9.98669076754868,
  [('home', 0.10985859),
   ('matter', 0.016954737),
   ('say', 0.016389411),
   ('due', 0.013163471),
   ('sorry', 0.012800555),
   ("doesn't", 0.0116091035),
   ('friday', 0.009734606),
   ('pump', 0.009410147),
   ('back', 0.0089087235),
   ('walk', 0.008548106)]],
 [39,
  9.635773781686112,
  [('love', 0.123812795),
   ('girl', 0.015177706),
   ('house', 0.012945831),
   ('found', 0.011447903),
   ('let', 0.011130232),
   ('place', 0.010730769),
   ('way', 0.00803331),
   ('need', 0.007980324),
   ('think', 0.007934016),
   ('yes', 0.007901667)]],
 [11,
  9.190596861587437,
  [('love', 0.0671956),
   ('say', 0.017090758),
   ("can't", 0.014592582),
   ('give', 0.013192182),
   ('need', 0.013008415),
   ('heart', 0.010491185),
   ('home', 0.010278336),
   ('leave', 0.009304878),
   ('alone', 0.00852356),
   ('find', 0.0072796983)]],
 [55,
  8.648864292149483,
  [('light', 0.044404957),
   ('life', 0.024890583),
   ('little', 0.02236362),
   ('city', 0.01954

#### Among KPOP

In [31]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [32]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

# observe topics
topics = lda.print_topics()

In [33]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + float(topic[1])


In [34]:
lda_html_kpop, vis = save_lda_vis(lda, corpus, dictionary, "lda.html")

vis

PreparedData(topic_coordinates=              x         y  topics  cluster      Freq
topic                                               
11     0.356192  0.065595       1        1  3.679636
77     0.055111  0.115737       2        1  2.789065
76     0.120788 -0.073097       3        1  2.687125
38    -0.015716 -0.107596       4        1  2.198534
60    -0.004587 -0.083669       5        1  1.963256
...         ...       ...     ...      ...       ...
2      0.144183  0.083538      96        1  0.386976
7     -0.025693  0.001662      97        1  0.348756
85    -0.056021  0.111750      98        1  0.310712
49    -0.037139  0.160335      99        1  0.254533
61    -0.050064  0.047589     100        1  0.210311

[100 rows x 5 columns], topic_info=         Term         Freq        Total  Category  logprob  loglift
280     child  4807.000000  4807.000000   Default  30.0000  30.0000
425     drunk  1936.000000  1936.000000   Default  29.0000  29.0000
494      cake  1443.000000  1443.000000 

In [35]:
_lda_counter_kpop = sort_dictionary(counter)
lda_counter_kpop = []
for tup in _lda_counter_kpop:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_counter_kpop.append(new_tup)
    
lda_counter_kpop

[[23,
  15,
  [('night', 0.018480774),
   ('home', 0.013538982),
   ('dynamite', 0.012081783),
   ('swing', 0.010209303),
   ('life', 0.009867142),
   ('afraid', 0.009812433),
   ('fire', 0.009171158),
   ('light', 0.008321327),
   ('follow', 0.007884144),
   ('let', 0.007590286)]],
 [44,
  9,
  [('bum', 0.021381103),
   ('night', 0.018964747),
   ('light', 0.015977364),
   ('life', 0.01370759),
   ('dynamite', 0.012611669),
   ('heart', 0.010979367),
   ('luv', 0.01061994),
   ('fun', 0.008559908),
   ('sick', 0.008394083),
   ('love', 0.0076597794)]],
 [26,
  8,
  [('child', 0.16972177),
   ('singing', 0.10604896),
   ('love', 0.074914135),
   ('sì', 0.018147986),
   ('universe', 0.011307437),
   ('sorry', 0.007300804),
   ('world', 0.0067121997),
   ('take', 0.0059307422),
   ('news', 0.005660807),
   ('baby', 0.005256103)]],
 [60,
  7,
  [('woo', 0.055317182),
   ('follow', 0.033040598),
   ('sorry', 0.016212681),
   ('dad', 0.012180163),
   ('best', 0.011419232),
   ('honey', 0.01

In [36]:
_lda_summer_kpop = sort_dictionary(summer)
lda_summer_kpop = []
for tup in _lda_summer_kpop:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_summer_kpop.append(new_tup)
    
lda_summer_kpop

[[23,
  12.080285786385957,
  [('night', 0.018480774),
   ('home', 0.013538982),
   ('dynamite', 0.012081783),
   ('swing', 0.010209303),
   ('life', 0.009867142),
   ('afraid', 0.009812433),
   ('fire', 0.009171158),
   ('light', 0.008321327),
   ('follow', 0.007884144),
   ('let', 0.007590286)]],
 [26,
  7.95085066247384,
  [('child', 0.16972177),
   ('singing', 0.10604896),
   ('love', 0.074914135),
   ('sì', 0.018147986),
   ('universe', 0.011307437),
   ('sorry', 0.007300804),
   ('world', 0.0067121997),
   ('take', 0.0059307422),
   ('news', 0.005660807),
   ('baby', 0.005256103)]],
 [93,
  7.903547587481171,
  [('invalid', 0.15472318),
   ('home', 0.024356404),
   ('love', 0.017617239),
   ('universe', 0.016581727),
   ('switch', 0.012536719),
   ('world', 0.011256077),
   ('(you', 0.011119875),
   ("can't", 0.010566707),
   ('(i', 0.010497292),
   ('are)', 0.008906201)]],
 [39,
  7.725504138263659,
  [('ка', 0.19221221),
   ('love', 0.05465291),
   ('crazy', 0.02048545),
   ('o

### Emotion analysis
Emotions:  
fear
anger
anticipation
trust
surprise
positive
negative
sadness
disgust
joy

In [37]:
import numpy as np

In [38]:
emotions = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "negative",	
    "positive",
    "sadness",
    "surprise",	
    "trust",
]

In [39]:
df_nrc = pd.read_csv("NRC.txt", sep="\t", names=["word", "emotion", "amount"])

words_to_emotions = {}

for _, row in df_nrc.iterrows():
    words_to_emotions[row["word"]] = words_to_emotions.get(row["word"], np.zeros((10, 1)))
    words_to_emotions[row["word"]][emotions.index(row["emotion"])] = row["amount"]

In [40]:
df_emotion = df.copy()
for emotion in emotions:
    df_emotion[emotion] = 0

In [41]:
for cnt in range(len(df)):
    lyrics = df["lyrics"][cnt]

    if type(lyrics) != str:
        continue

    emotion_this = np.zeros((10, 1))

    for word in lyrics.split():
        if word in words_to_emotions:
            emotion_this += words_to_emotions[word]
            
    if np.linalg.norm(emotion_this) != 0:
        emotion_this /= np.linalg.norm(emotion_this)
        
    for emotion in emotions:
        df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emoti

In [42]:
df_emotion_global = df_emotion[df_emotion["grouping"].isin(global_group)]
mean_emotion_global = df_emotion_global[emotions].mean()
normalized_mean_emotion_global = mean_emotion_global / np.linalg.norm(mean_emotion_global)

df_emotion_kpop = df_emotion[df_emotion["grouping"].isin(kpop_group)]
mean_emotion_kpop = df_emotion_kpop[emotions].mean()
normalized_mean_emotion_kpop = mean_emotion_kpop / np.linalg.norm(mean_emotion_kpop)

df_emotion_target = df_emotion[df_emotion["grouping"] == target_group]
mean_emotion_target = df_emotion_target[emotions].mean()
normalized_mean_emotion_target = mean_emotion_target / np.linalg.norm(mean_emotion_target)

In [43]:
normalized_mean_emotion_target

anger           0.154527
anticipation    0.313939
disgust         0.105527
fear            0.223917
joy             0.363832
negative        0.366232
positive        0.625078
sadness         0.246532
surprise        0.162838
trust           0.267905
dtype: float64

against global

In [44]:
emotion_percentage_global = (normalized_mean_emotion_target - normalized_mean_emotion_global) / normalized_mean_emotion_global * 100

emotion_percentage_global

anger          -26.862864
anticipation     6.185337
disgust        -22.754799
fear            -5.103215
joy              5.472281
negative       -21.122329
positive        16.974613
sadness         -4.229430
surprise        -4.397312
trust           -4.203301
dtype: float64

against KPOP

In [45]:
emotion_percentage_kpop = (normalized_mean_emotion_target - normalized_mean_emotion_kpop) / normalized_mean_emotion_kpop * 100

emotion_percentage_kpop

anger           -2.132820
anticipation    -2.317294
disgust          6.548154
fear             7.784481
joy             -6.442900
negative         2.080201
positive        -0.255459
sadness         13.599037
surprise        -2.330777
trust            0.370840
dtype: float64

## Save Data

In [46]:
# to float
for _sum in lda_summer_global:
    _sum[2] = [(tup[0], float(tup[1])) for tup in _sum[2]]
    
for _sum in lda_summer_kpop:
    _sum[2] = [(tup[0], float(tup[1])) for tup in _sum[2]]
    
for count in lda_counter_global:
    count[2] = [(tup[0], float(tup[1])) for tup in count[2]]

for count in lda_counter_kpop:
    count[2] = [(tup[0], float(tup[1])) for tup in count[2]]

In [47]:
json_data = {
    "tfidf_counter_global": tfidf_counter_global,
    "tfidf_summer_global": tfidf_summer_global,
    "tfidf_counter_kpop": tfidf_counter_kpop,
    "tfidf_summer_kpop": tfidf_summer_kpop,
    "lda_counter_global": lda_counter_global,
    "lda_summer_global": lda_summer_global,
    "lda_counter_kpop": lda_counter_kpop,
    "lda_summer_kpop": lda_summer_kpop,
    "lda_html_global": lda_html_global,
    "lda_html_kpop": lda_html_kpop,
    "normalized_mean_emotion_target": normalized_mean_emotion_target.astype(float).tolist(),
    "emotion_percentage_global": emotion_percentage_global.astype(float).tolist(),
    "emotion_percentage_kpop": emotion_percentage_kpop.astype(float).tolist(),
}

In [48]:
import json
with open("data_"+ target_group +".json", "w") as f:
    json.dump(json_data, f)