# Lyrics Analysis

In [1]:
lyrics_directory = "Lyrics_Data/"
tracks_csv = "tracks_group.csv"
stopwords_file = "stopwords.txt"

In [2]:
target_group = "seventeen"
kpop_group = set(['kpop', 'itzy', 'seventeen', 'bts', 'twice'])
global_group = set(['global', 'billboard'])

target_and_kpop = kpop_group.union(set([target_group]))
target_and_global = global_group.union(set([target_group]))

## Import data

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv(tracks_csv)

df.head(3)

Unnamed: 0,track_name,track_id,grouping
0,Ice Cream (with Selena Gomez),2J4P46vCFm1rPkNkp9pZWX,kpop
1,Dynamite,0v1x6rN6JHRapa03JElljE,kpop
2,THE BADDEST,2V4Fx72svQRxrFvNT1eq5f,kpop


In [5]:
df.shape

(4618, 3)

In [6]:
df["grouping"].unique()

array(['kpop', 'itzy', 'seventeen', 'bts', 'twice', 'global', 'billboard'],
      dtype=object)

In [7]:
def get_lyrics(track_id):
    try:
        lyrics = ""
        with open(lyrics_directory + track_id + ".txt", "r") as f:
            lyrics = f.read()
            
        return lyrics            
    except:
        return None
    
df["lyrics"] = df["track_id"].apply(get_lyrics)

df.shape

(4618, 4)

In [8]:
# stop words
# https://www.kaggle.com/datasets/rowhitswami/stopwords/
stopwords = set()
with open(stopwords_file, "r") as f:
    stopwords = set(f.read().split("\n"))
    
stopwords

{',',
 '>',
 '?',
 'Just',
 'Yes',
 'a',
 'abaft',
 'abafter',
 'abaftest',
 'about',
 'abouter',
 'aboutest',
 'above',
 'abover',
 'abovest',
 'accordingly',
 'aer',
 'aest',
 'afore',
 'after',
 'afterer',
 'afterest',
 'afterward',
 'afterwards',
 'again',
 'against',
 'aid',
 'ain',
 'albeit',
 'all',
 'aller',
 'allest',
 'alls',
 'allyou',
 'almost',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'andor',
 'anear',
 'anent',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anywhere',
 'apart',
 'aparter',
 'apartest',
 'appear',
 'appeared',
 'appearing',
 'appears',
 'appropriate',
 'appropriated',
 'appropriater',
 'appropriates',
 'appropriatest',
 'appropriating',
 'are',
 'ares',
 'around',
 'as',
 'ases',
 'aside',
 'asides',
 'aslant',
 'astraddle',
 'astraddler',
 'astraddlest',
 'astride',
 'astrider',
 'astridest',
 'at',
 'athwart',
 'atop',
 'atween',
 'aught',
 'aught

## Analyze

In [9]:
def is_stopword(word):
    word = word.lower()
    
    if len(word) <= 1:
        return True
    
    # in the list
    if word in stopwords:
        return True
    
    # is Korean
    if ord("가") <= ord(word[0]) <= ord("힣"):
        return True
    
    return False

### Characteristic Keyword
Using TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Among global

In [11]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [12]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [13]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + float(tfidf_series[keyword])

In [14]:
def sort_dictionary(dictionary):
    return [(k, dictionary[k]) for k in sorted(dictionary, key=dictionary.get, reverse=True)]

In [15]:
tfidf_counter_global = sort_dictionary(counter)
tfidf_counter_global

[('home', 81),
 ('sorry', 80),
 ('news', 74),
 ('let', 72),
 ('heart', 61),
 ('love', 60),
 ('think', 48),
 ('baby', 42),
 ('new', 39),
 ('night', 39),
 ('only', 39),
 ('follow', 38),
 ('feel', 38),
 ('world', 38),
 ('end', 34),
 ('eyes', 34),
 ('back', 33),
 ('contact', 33),
 ('keep', 32),
 ('yes', 31),
 ('say', 31),
 ('looking', 30),
 ('leave', 30),
 ('dream', 30),
 ('mind', 29),
 ('happy', 29),
 ('sleep', 28),
 ('hands', 27),
 ('need', 25),
 ('yeh', 25),
 ('give', 25),
 ('morning', 25),
 ('find', 24),
 ('take', 24),
 ('am', 24),
 ('hand', 24),
 ('sky', 24),
 ('wait', 23),
 ('worry', 23),
 ('smile', 22),
 ('room', 21),
 ('tell', 21),
 ('ah', 20),
 ('sun', 20),
 ('dance', 20),
 ('meet', 20),
 ('today', 19),
 ('life', 19),
 ('music', 19),
 ('away', 18),
 ('way', 18),
 ('play', 18),
 ('light', 18),
 ('hear', 18),
 ('wind', 18),
 ('little', 17),
 ('change', 17),
 ('still', 17),
 ('child', 17),
 ('easy', 17),
 ('tomorrow', 17),
 ('place', 17),
 ('remember', 17),
 ('share', 16),
 ('call', 

In [16]:
tfidf_summer_global = sort_dictionary(summer)
tfidf_summer_global

[('home', 15.736966679808186),
 ('news', 14.518524685914194),
 ('sorry', 11.91577797479948),
 ('heart', 8.811270094419466),
 ('love', 8.688800538611188),
 ('baby', 7.788746881942375),
 ('contact', 6.832942029185571),
 ('yeh', 6.542506996848126),
 ('world', 6.220133391429427),
 ('let', 6.0279484609790055),
 ('follow', 5.612129997871336),
 ('dream', 5.371939112373928),
 ('only', 5.096528380955112),
 ('new', 5.071695046562155),
 ('end', 4.9540229578548205),
 ('think', 4.712041506435446),
 ('happy', 4.496023423359318),
 ('back', 4.005746254043052),
 ('give', 3.7335500790827494),
 ('looking', 3.6827297732633477),
 ('wind', 3.670400931134151),
 ('worry', 3.6425433783146657),
 ('smile', 3.589368585436858),
 ('shut', 3.571071355579521),
 ('leave', 3.5368330698902164),
 ('ay', 3.4872861784621976),
 ('night', 3.483549123653218),
 ('yes', 3.44780622637595),
 ('run', 3.4310940206210545),
 ('song', 3.418346823020923),
 ('life', 3.364894648261042),
 ('um', 3.3641824991900555),
 ('music', 3.347872862

#### Among KPOP

In [17]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [18]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [19]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + float(tfidf_series[keyword])

In [20]:
tfidf_counter_kpop = sort_dictionary(counter)
tfidf_counter_kpop

[('home', 80),
 ('sorry', 80),
 ('news', 74),
 ('let', 72),
 ('heart', 61),
 ('love', 60),
 ('think', 48),
 ('baby', 42),
 ('night', 40),
 ('only', 40),
 ('new', 39),
 ('follow', 38),
 ('feel', 38),
 ('world', 38),
 ('end', 34),
 ('eyes', 34),
 ('back', 33),
 ('contact', 33),
 ('keep', 32),
 ('yes', 31),
 ('say', 31),
 ('looking', 30),
 ('leave', 30),
 ('dream', 30),
 ('mind', 29),
 ('happy', 29),
 ('sleep', 28),
 ('hands', 27),
 ('need', 25),
 ('yeh', 25),
 ('give', 25),
 ('morning', 25),
 ('find', 24),
 ('take', 24),
 ('am', 24),
 ('hand', 24),
 ('sky', 24),
 ('wait', 23),
 ('worry', 23),
 ('tell', 22),
 ('smile', 22),
 ('room', 21),
 ('ah', 20),
 ('sun', 20),
 ('dance', 20),
 ('comment', 20),
 ('meet', 20),
 ('today', 19),
 ('life', 19),
 ('music', 19),
 ('away', 18),
 ('price_varies', 18),
 ('way', 18),
 ('play', 18),
 ('light', 18),
 ('hear', 18),
 ('wind', 18),
 ('little', 17),
 ('change', 17),
 ('still', 17),
 ('child', 17),
 ('easy', 17),
 ('song', 17),
 ('questions', 17),
 ('t

In [21]:
tfidf_summer_kpop = sort_dictionary(summer)
tfidf_summer_kpop

[('home', 12.15820554016299),
 ('love', 9.029950266202746),
 ('heart', 7.8499695759031205),
 ('baby', 7.383031344719721),
 ('news', 7.112408166336561),
 ('sorry', 6.890558195969502),
 ('only', 5.749848884568152),
 ('yeh', 5.713494252203872),
 ('let', 5.712994427725181),
 ('think', 5.069674096544697),
 ('world', 4.969372652295935),
 ('back', 4.683235562496428),
 ('give', 4.351780180473527),
 ('end', 4.047276678189264),
 ('dream', 4.043967495151701),
 ('new', 4.031170116408636),
 ('happy', 4.013403552645574),
 ('contact', 3.9751484423575043),
 ('life', 3.706154365069785),
 ('run', 3.6648673415278665),
 ('follow', 3.6406844515529726),
 ('looking', 3.619639156326462),
 ('yes', 3.5779200142749383),
 ('night', 3.481554551838795),
 ('say', 3.4765591544016083),
 ('feel', 3.3922187540080064),
 ('song', 3.3899439264015068),
 ('call', 3.38277431976287),
 ('am', 3.371554583814393),
 ('music', 3.370028153834897),
 ('find', 3.316858735067583),
 ('leave', 3.3125686232572074),
 ('ay', 3.24221179522855

### Topic analysis

In [22]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [23]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
def save_lda_vis(lda, corpus, dictionary, filename):
    vis = gensimvis.prepare(lda, corpus, dictionary)
    pyLDAvis.save_html(vis, filename)
    
    with open(filename, "r") as f:
        html = f.read()
        
    return html, vis

In [24]:
def pre_process(df_lyrics):
    # tokenize lyrics
    lyrics_processed = df_lyrics.fillna("").str.split()

    for _list in lyrics_processed:
        for cnt in range(len(_list)):
            _list[cnt] = _list[cnt].replace(".", "").replace(",", "").replace("?", "").replace("\\", "").replace("/", "").replace(":", "").lower()
            
            if is_stopword(_list[cnt]):
                _list[cnt] = ""
                
    # remove empty string
    for cnt in range(len(lyrics_processed)):
        while(True):
            try:
                lyrics_processed[cnt].remove("")
            except:
                break

    return lyrics_processed

#### Among global

In [25]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [26]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

all topics

In [27]:
lda_html_global, vis = save_lda_vis(lda, corpus, dictionary, "lda.html")

vis

PreparedData(topic_coordinates=              x         y  topics  cluster      Freq
topic                                               
62    -0.057313  0.042433       1        1  4.563433
19    -0.014083  0.003265       2        1  3.277836
56    -0.085206  0.067758       3        1  2.951812
86    -0.063604  0.044635       4        1  2.722717
71    -0.072591  0.055415       5        1  2.611585
...         ...       ...     ...      ...       ...
14     0.080884 -0.115962      96        1  0.341691
53     0.043529 -0.047816      97        1  0.333478
81     0.107920 -0.133010      98        1  0.307824
21     0.036596 -0.069161      99        1  0.299024
70     0.074080 -0.105898     100        1  0.290616

[100 rows x 5 columns], topic_info=         Term         Freq        Total  Category  logprob  loglift
4827   labour  6024.000000  6024.000000   Default  30.0000  30.0000
55       baby  4923.000000  4923.000000   Default  29.0000  29.0000
653      work  1671.000000  1671.000000 

what topics are in kpop?

In [28]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + float(topic[1])

In [29]:
_lda_counter_global = sort_dictionary(counter)
lda_counter_global = []
for tup in _lda_counter_global:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_counter_global.append(new_tup)
    
lda_counter_global

[[71,
  21,
  [('say', 0.027936634),
   ('let', 0.021746522),
   ('love', 0.016613767),
   ('tell', 0.016350273),
   ('take', 0.013763184),
   ('feel', 0.010993037),
   ('think', 0.009842661),
   ('heart', 0.009502152),
   ('[bleep', 0.008637071),
   ('sorry', 0.0078080376)]],
 [94,
  7,
  [('only', 0.04889053),
   ('love', 0.0342403),
   ('let', 0.033705782),
   ('life', 0.028026769),
   ('take', 0.013016682),
   ('road', 0.011545542),
   ('world', 0.0104672015),
   ('need', 0.010349542),
   ('home', 0.009937878),
   ('honey', 0.009827302)]],
 [5,
  6,
  [('lights', 0.051437806),
   ('change', 0.023998477),
   ('coming', 0.023311432),
   ('back', 0.022159914),
   ('home', 0.019869225),
   ("we'll", 0.019450102),
   ('hours', 0.018060025),
   ('old', 0.014699207),
   ('thought', 0.013580619),
   ('feel', 0.012194799)]],
 [88,
  6,
  [('let', 0.018881734),
   ('girl', 0.014824895),
   ('down', 0.013481156),
   ('feel', 0.011988978),
   ('little', 0.011663624),
   ('life', 0.011524334),


In [30]:
_lda_summer_global = sort_dictionary(summer)
lda_summer_global = []
for tup in _lda_summer_global:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_summer_global.append(new_tup)
    
lda_summer_global

[[71,
  16.475424695587208,
  [('say', 0.027936634),
   ('let', 0.021746522),
   ('love', 0.016613767),
   ('tell', 0.016350273),
   ('take', 0.013763184),
   ('feel', 0.010993037),
   ('think', 0.009842661),
   ('heart', 0.009502152),
   ('[bleep', 0.008637071),
   ('sorry', 0.0078080376)]],
 [18,
  8.6783787356635,
  [('home', 0.065012895),
   ('world', 0.018244155),
   ('news', 0.016244957),
   ('sta', 0.015528149),
   ('{{if', 0.013820188),
   ('price_varies}', 0.013603687),
   ('heart', 0.013253995),
   ('coming', 0.012789944),
   ("what's", 0.012126695),
   ('love', 0.011987572)]],
 [94,
  6.1642158746926725,
  [('only', 0.04889053),
   ('love', 0.0342403),
   ('let', 0.033705782),
   ('life', 0.028026769),
   ('take', 0.013016682),
   ('road', 0.011545542),
   ('world', 0.0104672015),
   ('need', 0.010349542),
   ('home', 0.009937878),
   ('honey', 0.009827302)]],
 [79,
  6.144023636743441,
  [('love', 0.0887733),
   ('heart', 0.023739712),
   ('let', 0.017095268),
   ('baby', 0

#### Among KPOP

In [31]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [32]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

# observe topics
topics = lda.print_topics()

In [33]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + float(topic[1])


In [34]:
lda_html_kpop, vis = save_lda_vis(lda, corpus, dictionary, "lda.html")

vis

PreparedData(topic_coordinates=              x         y  topics  cluster      Freq
topic                                               
64     0.030320  0.051068       1        1  3.291955
11    -0.007764 -0.140043       2        1  2.916772
84    -0.341036  0.038025       3        1  2.892462
26     0.010186 -0.048333       4        1  2.092292
67     0.018847 -0.074226       5        1  1.915824
...         ...       ...     ...      ...       ...
1      0.021250  0.096912      96        1  0.332242
24     0.011844  0.102449      97        1  0.318615
15     0.025994  0.074360      98        1  0.280643
77     0.034383  0.187227      99        1  0.274195
4      0.018675  0.050750     100        1  0.267863

[100 rows x 5 columns], topic_info=         Term         Freq        Total  Category  logprob  loglift
280     child  4336.000000  4336.000000   Default  30.0000  30.0000
494      cake  1785.000000  1785.000000   Default  29.0000  29.0000
425     drunk  2364.000000  2364.000000 

In [35]:
_lda_counter_kpop = sort_dictionary(counter)
lda_counter_kpop = []
for tup in _lda_counter_kpop:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_counter_kpop.append(new_tup)
    
lda_counter_kpop

[[42,
  8,
  [('rock', 0.032839376),
   ('dream', 0.018969184),
   ('heart', 0.01539206),
   ('home', 0.015245652),
   ('head', 0.012651826),
   ('news', 0.011988615),
   ('end', 0.011499451),
   ('hip', 0.011357877),
   ('anpanman', 0.010756616),
   ('let', 0.010557997)]],
 [33,
  7,
  [('love', 0.09256143),
   ('la-la', 0.021700809),
   ('baby', 0.018469667),
   ('left', 0.012676071),
   ('sorry', 0.012492001),
   ("can't", 0.01159381),
   ('black', 0.011067055),
   ('boom', 0.010604462),
   ('hold', 0.010264342),
   ('news', 0.00917961)]],
 [67,
  7,
  [('invalid', 0.19587377),
   ('love', 0.03820416),
   ('man', 0.037112445),
   ('whatta', 0.016827086),
   ('summer', 0.01631289),
   ('girl', 0.014338953),
   ('new', 0.010366004),
   ('leave', 0.008250743),
   ("can't", 0.008139248),
   ('back', 0.0077227075)]],
 [58,
  7,
  [('give', 0.027036207),
   ('baby', 0.024882667),
   ('break', 0.020048298),
   ('sorry', 0.019625232),
   ('say', 0.019234981),
   ('up!', 0.017721532),
   ('n

In [36]:
_lda_summer_kpop = sort_dictionary(summer)
lda_summer_kpop = []
for tup in _lda_summer_kpop:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_summer_kpop.append(new_tup)
    
lda_summer_kpop

[[60,
  7.503566237565792,
  [("can't", 0.026725706),
   ('well', 0.015791943),
   ('ti', 0.014480049),
   ('home', 0.014199515),
   ('baby', 0.014057288),
   ('ah!', 0.011639583),
   ('sorry', 0.010969393),
   ('play', 0.010414769),
   ('reale', 0.010411201),
   ('give', 0.009555414)]],
 [67,
  7.231550633142433,
  [('invalid', 0.19587377),
   ('love', 0.03820416),
   ('man', 0.037112445),
   ('whatta', 0.016827086),
   ('summer', 0.01631289),
   ('girl', 0.014338953),
   ('new', 0.010366004),
   ('leave', 0.008250743),
   ("can't", 0.008139248),
   ('back', 0.0077227075)]],
 [58,
  7.069865941192802,
  [('give', 0.027036207),
   ('baby', 0.024882667),
   ('break', 0.020048298),
   ('sorry', 0.019625232),
   ('say', 0.019234981),
   ('up!', 0.017721532),
   ('news', 0.011278215),
   ('breakage', 0.010024683),
   ('bambi', 0.009596664),
   ('meet', 0.008784716)]],
 [33,
  6.9847264625705066,
  [('love', 0.09256143),
   ('la-la', 0.021700809),
   ('baby', 0.018469667),
   ('left', 0.012

### Emotion analysis
Emotions:  
fear
anger
anticipation
trust
surprise
positive
negative
sadness
disgust
joy

In [37]:
import numpy as np

In [38]:
emotions = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "negative",	
    "positive",
    "sadness",
    "surprise",	
    "trust",
]

In [39]:
df_nrc = pd.read_csv("NRC.txt", sep="\t", names=["word", "emotion", "amount"])

words_to_emotions = {}

for _, row in df_nrc.iterrows():
    words_to_emotions[row["word"]] = words_to_emotions.get(row["word"], np.zeros((10, 1)))
    words_to_emotions[row["word"]][emotions.index(row["emotion"])] = row["amount"]

In [40]:
df_emotion = df.copy()
for emotion in emotions:
    df_emotion[emotion] = 0

In [41]:
for cnt in range(len(df)):
    lyrics = df["lyrics"][cnt]

    if type(lyrics) != str:
        continue

    emotion_this = np.zeros((10, 1))

    for word in lyrics.split():
        if word in words_to_emotions:
            emotion_this += words_to_emotions[word]
            
    if np.linalg.norm(emotion_this) != 0:
        emotion_this /= np.linalg.norm(emotion_this)
        
    for emotion in emotions:
        df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion[emotion][cnt] = emotion_this[emotions.index(emotion)]
  df_emotion[emotion][cnt] = emotion_this[emoti

In [42]:
df_emotion_global = df_emotion[df_emotion["grouping"].isin(global_group)]
mean_emotion_global = df_emotion_global[emotions].mean()
normalized_mean_emotion_global = mean_emotion_global / np.linalg.norm(mean_emotion_global)

df_emotion_kpop = df_emotion[df_emotion["grouping"].isin(kpop_group)]
mean_emotion_kpop = df_emotion_kpop[emotions].mean()
normalized_mean_emotion_kpop = mean_emotion_kpop / np.linalg.norm(mean_emotion_kpop)

df_emotion_target = df_emotion[df_emotion["grouping"] == target_group]
mean_emotion_target = df_emotion_target[emotions].mean()
normalized_mean_emotion_target = mean_emotion_target / np.linalg.norm(mean_emotion_target)

In [43]:
normalized_mean_emotion_target

anger           0.096464
anticipation    0.368778
disgust         0.078079
fear            0.195727
joy             0.373302
negative        0.322254
positive        0.649261
sadness         0.185454
surprise        0.145876
trust           0.299796
dtype: float64

against global

In [44]:
emotion_percentage_global = (normalized_mean_emotion_target - normalized_mean_emotion_global) / normalized_mean_emotion_global * 100

emotion_percentage_global

anger          -54.343950
anticipation    24.733574
disgust        -42.846661
fear           -17.049931
joy              8.217464
negative       -30.594136
positive        21.500227
sadness        -27.956571
surprise       -14.356268
trust            7.200270
dtype: float64

against KPOP

In [45]:
emotion_percentage_kpop = (normalized_mean_emotion_target - normalized_mean_emotion_kpop) / normalized_mean_emotion_kpop * 100

emotion_percentage_kpop

anger          -38.906155
anticipation    14.745721
disgust        -21.165552
fear            -5.784689
joy             -4.007840
negative       -10.177817
positive         3.603543
sadness        -14.545104
surprise       -12.505004
trust           12.318912
dtype: float64

## Save Data

In [46]:
# to float
for _sum in lda_summer_global:
    _sum[2] = [(tup[0], float(tup[1])) for tup in _sum[2]]
    
for _sum in lda_summer_kpop:
    _sum[2] = [(tup[0], float(tup[1])) for tup in _sum[2]]
    
for count in lda_counter_global:
    count[2] = [(tup[0], float(tup[1])) for tup in count[2]]

for count in lda_counter_kpop:
    count[2] = [(tup[0], float(tup[1])) for tup in count[2]]

In [47]:
json_data = {
    "tfidf_counter_global": tfidf_counter_global,
    "tfidf_summer_global": tfidf_summer_global,
    "tfidf_counter_kpop": tfidf_counter_kpop,
    "tfidf_summer_kpop": tfidf_summer_kpop,
    "lda_counter_global": lda_counter_global,
    "lda_summer_global": lda_summer_global,
    "lda_counter_kpop": lda_counter_kpop,
    "lda_summer_kpop": lda_summer_kpop,
    "lda_html_global": lda_html_global,
    "lda_html_kpop": lda_html_kpop,
    "normalized_mean_emotion_target": normalized_mean_emotion_target.astype(float).tolist(),
    "emotion_percentage_global": emotion_percentage_global.astype(float).tolist(),
    "emotion_percentage_kpop": emotion_percentage_kpop.astype(float).tolist(),
}

In [48]:
import json
with open("data_"+ target_group +".json", "w") as f:
    json.dump(json_data, f)