# Lyrics Analysis

In [1]:
lyrics_directory = "Lyrics_Data/"
tracks_csv = "tracks_group.csv"
stopwords_file = "stopwords.txt"

In [2]:
target_group = "itzy"
kpop_group = set(['kpop', 'itzy', 'seventeen', 'bts', 'twice'])
global_group = set(['global', 'billboard'])

target_and_kpop = kpop_group.union(set([target_group]))
target_and_global = global_group.union(set([target_group]))

## Import data

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv(tracks_csv)

df.head(3)

Unnamed: 0,track_name,track_id,grouping
0,Ice Cream (with Selena Gomez),2J4P46vCFm1rPkNkp9pZWX,kpop
1,Dynamite,0v1x6rN6JHRapa03JElljE,kpop
2,THE BADDEST,2V4Fx72svQRxrFvNT1eq5f,kpop


In [5]:
df.shape

(4618, 3)

In [6]:
df["grouping"].unique()

array(['kpop', 'itzy', 'seventeen', 'bts', 'twice', 'global', 'billboard'],
      dtype=object)

In [7]:
def get_lyrics(track_id):
    try:
        lyrics = ""
        with open(lyrics_directory + track_id + ".txt", "r") as f:
            lyrics = f.read()
            
        return lyrics            
    except:
        return None
    
df["lyrics"] = df["track_id"].apply(get_lyrics)

df.shape

(4618, 4)

In [8]:
# stop words
# https://www.kaggle.com/datasets/rowhitswami/stopwords/
stopwords = set()
with open(stopwords_file, "r") as f:
    stopwords = set(f.read().split("\n"))
    
stopwords

{',',
 '>',
 '?',
 'Just',
 'Yes',
 'a',
 'abaft',
 'abafter',
 'abaftest',
 'about',
 'abouter',
 'aboutest',
 'above',
 'abover',
 'abovest',
 'accordingly',
 'aer',
 'aest',
 'afore',
 'after',
 'afterer',
 'afterest',
 'afterward',
 'afterwards',
 'again',
 'against',
 'aid',
 'ain',
 'albeit',
 'all',
 'aller',
 'allest',
 'alls',
 'allyou',
 'almost',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'andor',
 'anear',
 'anent',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anywhere',
 'apart',
 'aparter',
 'apartest',
 'appear',
 'appeared',
 'appearing',
 'appears',
 'appropriate',
 'appropriated',
 'appropriater',
 'appropriates',
 'appropriatest',
 'appropriating',
 'are',
 'ares',
 'around',
 'as',
 'ases',
 'aside',
 'asides',
 'aslant',
 'astraddle',
 'astraddler',
 'astraddlest',
 'astride',
 'astrider',
 'astridest',
 'at',
 'athwart',
 'atop',
 'atween',
 'aught',
 'aught

## Analyze

In [9]:
def is_stopword(word):
    word = word.lower()
    
    if len(word) <= 1:
        return True
    
    # in the list
    if word in stopwords:
        return True
    
    # is Korean
    if ord("가") <= ord(word[0]) <= ord("힣"):
        return True
    
    return False

### Characteristic Keyword
Using TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Among global

In [11]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [12]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [13]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + float(tfidf_series[keyword])

In [14]:
def sort_dictionary(dictionary):
    return [(k, dictionary[k]) for k in sorted(dictionary, key=dictionary.get, reverse=True)]

In [15]:
tfidf_counter_global = sort_dictionary(counter)
tfidf_counter_global

[('sorry', 23),
 ('love', 22),
 ('mind', 22),
 ('yes', 21),
 ('let', 20),
 ('say', 19),
 ('tell', 14),
 ('feel', 14),
 ('baby', 13),
 ('down', 13),
 ('heart', 13),
 ('keep', 13),
 ('take', 12),
 ('care', 12),
 ('back', 12),
 ('bad', 11),
 ('eyes', 11),
 ('matter', 11),
 ('crazy', 11),
 ('think', 11),
 ('show', 10),
 ('control', 10),
 ('need', 10),
 ('room', 10),
 ('ah', 10),
 ('am', 9),
 ('turn', 9),
 ('put', 9),
 ('little', 9),
 ('forget', 9),
 ('way', 9),
 ('home', 9),
 ('only', 8),
 ('hear', 8),
 ('coming', 8),
 ('girl', 8),
 ('game', 8),
 ('doesn', 7),
 ('face', 7),
 ('honey', 7),
 ('ready', 7),
 ('break', 7),
 ('dance', 7),
 ('worry', 7),
 ('okay', 7),
 ('stay', 7),
 ('follow', 7),
 ('play', 7),
 ('head', 7),
 ('high', 7),
 ('hit', 7),
 ('light', 7),
 ('world', 7),
 ('cuz', 6),
 ('give', 6),
 ('child', 6),
 ('fight', 6),
 ('kill', 6),
 ('feeling', 6),
 ('uh', 6),
 ('boy', 6),
 ('didn', 6),
 ('start', 6),
 ('still', 6),
 ('top', 6),
 ('new', 6),
 ('free', 6),
 ('end', 6),
 ('lost',

In [16]:
tfidf_summer_global = sort_dictionary(summer)
tfidf_summer_global

[('love', 3.2111590137486092),
 ('sorry', 3.1624247485449186),
 ('trust', 2.3862535651729093),
 ('hot', 2.352824882534737),
 ('woo', 2.1142086197441174),
 ('mind', 2.1046574503957345),
 ('put', 2.020884450575801),
 ('flower', 1.980310452818118),
 ('em', 1.9626333700933452),
 ('shy', 1.9226858042564654),
 ('ready', 1.9188426805354042),
 ('worry', 1.837092103431801),
 ('yes', 1.8106564463544794),
 ('business', 1.8082325752768063),
 ('fall', 1.8011038824731416),
 ('heart', 1.7928345825331622),
 ('care', 1.7165829596424889),
 ('chi', 1.7042387770576761),
 ('keep', 1.6902832699626618),
 ('taking', 1.608198525916642),
 ('matter', 1.5818529324911066),
 ('home', 1.5356698179913768),
 ('finally', 1.5236500520493004),
 ('crazy', 1.510161642126581),
 ('think', 1.505804405173682),
 ('tomorrow', 1.4769442397116785),
 ('ah', 1.4542463159967527),
 ('words', 1.4389624091552111),
 ('control', 1.411335330255775),
 ('hit', 1.366162842085624),
 ('break', 1.3357711651378272),
 ('stronger', 1.28558964433042

#### Among KPOP

In [17]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [18]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [19]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + float(tfidf_series[keyword])

In [20]:
tfidf_counter_kpop = sort_dictionary(counter)
tfidf_counter_kpop

[('sorry', 23),
 ('love', 22),
 ('mind', 22),
 ('yes', 21),
 ('let', 20),
 ('say', 19),
 ('tell', 14),
 ('feel', 14),
 ('baby', 13),
 ('down', 13),
 ('heart', 13),
 ('keep', 13),
 ('take', 12),
 ('care', 12),
 ('back', 12),
 ('bad', 11),
 ('eyes', 11),
 ('matter', 11),
 ('crazy', 11),
 ('think', 11),
 ('show', 10),
 ('control', 10),
 ('need', 10),
 ('room', 10),
 ('ah', 10),
 ('am', 9),
 ('turn', 9),
 ('put', 9),
 ('little', 9),
 ('forget', 9),
 ('way', 9),
 ('home', 9),
 ('only', 8),
 ('hear', 8),
 ('coming', 8),
 ('girl', 8),
 ('game', 8),
 ('doesn', 7),
 ('face', 7),
 ('honey', 7),
 ('ready', 7),
 ('한국', 7),
 ('break', 7),
 ('dance', 7),
 ('worry', 7),
 ('okay', 7),
 ('stay', 7),
 ('follow', 7),
 ('play', 7),
 ('head', 7),
 ('high', 7),
 ('informazioni', 7),
 ('hit', 7),
 ('light', 7),
 ('world', 7),
 ('cuz', 6),
 ('give', 6),
 ('territory', 6),
 ('child', 6),
 ('fight', 6),
 ('kill', 6),
 ('feeling', 6),
 ('uh', 6),
 ('boy', 6),
 ('didn', 6),
 ('start', 6),
 ('still', 6),
 ('top', 

In [21]:
tfidf_summer_kpop = sort_dictionary(summer)
tfidf_summer_kpop

[('love', 3.7295123013654305),
 ('put', 2.4826616351462447),
 ('trust', 2.4324877675268377),
 ('hot', 2.3126303347836705),
 ('keep', 2.0253074558606508),
 ('woo', 1.9559731310647535),
 ('flower', 1.9553951154721705),
 ('crazy', 1.9351914543271769),
 ('fall', 1.896361950718139),
 ('worry', 1.889137009235887),
 ('mind', 1.8725577824369803),
 ('shy', 1.8625770823631649),
 ('business', 1.858239331562063),
 ('blah', 1.7916503324920754),
 ('한국', 1.7395776915919268),
 ('yes', 1.6725149283019323),
 ('em', 1.6580317150045465),
 ('control', 1.6275816088166775),
 ('sorry', 1.6181170837152725),
 ('think', 1.6029053675806249),
 ('hit', 1.5946636358702568),
 ('heart', 1.4564988011678022),
 ('down', 1.4448309833777575),
 ('ah', 1.4297295710406557),
 ('ready', 1.4238208197606785),
 ('say', 1.3873513044732313),
 ('matter', 1.3653434369426067),
 ('bad', 1.348649525140968),
 ('only', 1.3440981678289643),
 ('home', 1.3414986661490813),
 ('break', 1.3410441116279574),
 ('tell', 1.3296895882414261),
 ('care

### Topic analysis

In [22]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [23]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
def save_lda_vis(lda, corpus, dictionary, filename):
    vis = gensimvis.prepare(lda, corpus, dictionary)
    pyLDAvis.save_html(vis, filename)
    
    with open(filename, "r") as f:
        html = f.read()
        
    return html, vis

In [24]:
def pre_process(df_lyrics):
    # tokenize lyrics
    lyrics_processed = df_lyrics.fillna("").str.split()

    for _list in lyrics_processed:
        for cnt in range(len(_list)):
            _list[cnt] = _list[cnt].replace(".", "").replace(",", "").replace("?", "").replace("\\", "").replace("/", "").replace(":", "").lower()
            
            if is_stopword(_list[cnt]):
                _list[cnt] = ""
                
    # remove empty string
    for cnt in range(len(lyrics_processed)):
        while(True):
            try:
                lyrics_processed[cnt].remove("")
            except:
                break

    return lyrics_processed

#### Among global

In [25]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [26]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

all topics

In [27]:
lda_html_global, vis = save_lda_vis(lda, corpus, dictionary, "lda.html")

vis

PreparedData(topic_coordinates=              x         y  topics  cluster      Freq
topic                                               
57     0.092387 -0.048408       1        1  3.283710
60     0.089717 -0.039305       2        1  2.776482
30     0.111992 -0.065036       3        1  2.741017
44     0.069789 -0.014851       4        1  2.568751
21    -0.279776 -0.310737       5        1  2.527943
...         ...       ...     ...      ...       ...
34    -0.084809  0.053001      96        1  0.303823
17    -0.173009  0.076600      97        1  0.255255
27    -0.079329  0.056883      98        1  0.232348
75    -0.067037  0.058842      99        1  0.224647
0     -0.149979 -0.110891     100        1  0.211946

[100 rows x 5 columns], topic_info=        Term         Freq        Total  Category  logprob  loglift
4853  labour  6806.000000  6806.000000   Default  30.0000  30.0000
862     cake  1829.000000  1829.000000   Default  29.0000  29.0000
88      baby  4967.000000  4967.000000   De

what topics are in kpop?

In [28]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + float(topic[1])

In [29]:
_lda_counter_global = sort_dictionary(counter)
lda_counter_global = []
for tup in _lda_counter_global:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_counter_global.append(new_tup)
    
lda_counter_global

[[60,
  5,
  [('night', 0.032860707),
   ('baby', 0.01748109),
   ('awake', 0.015840512),
   ('put', 0.015284688),
   ('let', 0.013912941),
   ('sorry', 0.011289967),
   ('need', 0.011113763),
   ("let's", 0.010872562),
   ("we're", 0.010855437),
   ('only', 0.010647571)]],
 [3,
  4,
  [('dance', 0.15623459),
   ('take', 0.021970028),
   ('back', 0.014065792),
   ('only', 0.011694177),
   ('feel', 0.011261701),
   ('love', 0.011122922),
   ("let's", 0.01012209),
   ('once', 0.009636374),
   ('bottom', 0.00954894),
   ("can't", 0.009354201)]],
 [80,
  4,
  [('think', 0.030435959),
   ('lose', 0.015799498),
   ('hard', 0.0127345715),
   ('take', 0.012161051),
   ('new', 0.010120002),
   ('give', 0.009799788),
   ('bye', 0.009224499),
   ('let', 0.009114276),
   ('am', 0.009088236),
   ('only', 0.008822472)]],
 [55,
  4,
  [('feel', 0.026136218),
   ('mind', 0.022587433),
   ('coming', 0.013278444),
   ('girl', 0.0127936965),
   ('love', 0.011802135),
   ('say', 0.011079406),
   ("can't",

In [30]:
_lda_summer_global = sort_dictionary(summer)
lda_summer_global = []
for tup in _lda_summer_global:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_summer_global.append(new_tup)
    
lda_summer_global

[[60,
  4.627612700844111,
  [('night', 0.032860707),
   ('baby', 0.01748109),
   ('awake', 0.015840512),
   ('put', 0.015284688),
   ('let', 0.013912941),
   ('sorry', 0.011289967),
   ('need', 0.011113763),
   ("let's", 0.010872562),
   ("we're", 0.010855437),
   ('only', 0.010647571)]],
 [3,
  3.4884902131584568,
  [('dance', 0.15623459),
   ('take', 0.021970028),
   ('back', 0.014065792),
   ('only', 0.011694177),
   ('feel', 0.011261701),
   ('love', 0.011122922),
   ("let's", 0.01012209),
   ('once', 0.009636374),
   ('bottom', 0.00954894),
   ("can't", 0.009354201)]],
 [55,
  3.4550811987996894,
  [('feel', 0.026136218),
   ('mind', 0.022587433),
   ('coming', 0.013278444),
   ('girl', 0.0127936965),
   ('love', 0.011802135),
   ('say', 0.011079406),
   ("can't", 0.010525564),
   ('night', 0.0096738655),
   ('let', 0.009664745),
   ('woo', 0.007209554)]],
 [36,
  3.029872133140543,
  [('worry', 0.022394314),
   ('baby', 0.009861896),
   ("chi't", 0.009071086),
   ('night', 0.008

#### Among KPOP

In [31]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [32]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

# observe topics
topics = lda.print_topics()

In [33]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + float(topic[1])


In [34]:
lda_html_kpop, vis = save_lda_vis(lda, corpus, dictionary, "lda.html")

vis

PreparedData(topic_coordinates=              x         y  topics  cluster      Freq
topic                                               
79    -0.007053 -0.080083       1        1  3.699065
13     0.339152  0.059346       2        1  3.400827
11     0.045854  0.025310       3        1  2.486130
77    -0.027755 -0.051006       4        1  2.234973
71    -0.014243 -0.062178       5        1  2.130160
...         ...       ...     ...      ...       ...
98     0.038073  0.017323      96        1  0.392615
15    -0.031665 -0.010091      97        1  0.376427
81    -0.026065  0.050204      98        1  0.346985
43     0.254848  0.054518      99        1  0.250941
5     -0.057932  0.213255     100        1  0.244404

[100 rows x 5 columns], topic_info=         Term         Freq        Total  Category  logprob  loglift
280     child  3973.000000  3973.000000   Default  30.0000  30.0000
425     drunk  2097.000000  2097.000000   Default  29.0000  29.0000
494      cake  1674.000000  1674.000000 

In [35]:
_lda_counter_kpop = sort_dictionary(counter)
lda_counter_kpop = []
for tup in _lda_counter_kpop:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_counter_kpop.append(new_tup)
    
lda_counter_kpop

[[15,
  4,
  [('sorry', 0.019177983),
   ('man', 0.015505934),
   ('love', 0.01410066),
   ('top', 0.012325257),
   ('yes', 0.012061623),
   ('mob', 0.011231353),
   ('eyes', 0.009781362),
   ('cherry', 0.009393422),
   ('call', 0.008776228),
   ('cold', 0.0075873663)]],
 [35,
  4,
  [('put', 0.048931114),
   ("'em", 0.027571443),
   ('gucci', 0.02247288),
   ('sneakers', 0.021779431),
   ('ready', 0.018953884),
   ('19th', 0.015438336),
   ('century', 0.015371317),
   ('goat', 0.0150678065),
   ('brave', 0.014465645),
   ('call', 0.013949169)]],
 [11,
  3,
  [('drunk', 0.48934472),
   ('only', 0.018974105),
   ('love', 0.01258739),
   ('help', 0.006337784),
   ('bomb', 0.0061882157),
   ('take', 0.0052208225),
   ('loco', 0.004951508),
   ('night', 0.004660736),
   ('me)', 0.004635133),
   ('dan', 0.0045377)]],
 [95,
  3,
  [('love', 0.10150945),
   ('crazy', 0.017002089),
   ('girl', 0.016112667),
   ('baby', 0.013138875),
   ('fun', 0.010652551),
   ('whip', 0.010499436),
   ('suck'

In [36]:
_lda_summer_kpop = sort_dictionary(summer)
lda_summer_kpop = []
for tup in _lda_summer_kpop:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_summer_kpop.append(new_tup)
    
lda_summer_kpop

[[35,
  4.108234476310827,
  [('put', 0.048931114),
   ("'em", 0.027571443),
   ('gucci', 0.02247288),
   ('sneakers', 0.021779431),
   ('ready', 0.018953884),
   ('19th', 0.015438336),
   ('century', 0.015371317),
   ('goat', 0.0150678065),
   ('brave', 0.014465645),
   ('call', 0.013949169)]],
 [15,
  4.049603166467932,
  [('sorry', 0.019177983),
   ('man', 0.015505934),
   ('love', 0.01410066),
   ('top', 0.012325257),
   ('yes', 0.012061623),
   ('mob', 0.011231353),
   ('eyes', 0.009781362),
   ('cherry', 0.009393422),
   ('call', 0.008776228),
   ('cold', 0.0075873663)]],
 [17,
  3.2546780708671577,
  [('ay', 0.033522647),
   ('ya', 0.028017163),
   ('cuz', 0.02162898),
   ('business', 0.02102332),
   ('feel', 0.020784901),
   ('shimmy', 0.017886044),
   ('down', 0.014459645),
   ('new', 0.013645619),
   ('bruce', 0.013619435),
   ('lee', 0.012469022)]],
 [67,
  3.0953026868683082,
  [('tell', 0.022238554),
   ('name', 0.018631121),
   ('heart', 0.016828267),
   ('sorry', 0.01383

### Emotion analysis
Emotions:  
fear
anger
anticipation
trust
surprise
positive
negative
sadness
disgust
joy

In [37]:
import numpy as np

In [38]:
labels = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "sadness",
    "surprise",	
    "trust",
    "negative",	
    "positive",
]

emotions = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "sadness",
    "surprise",	
    "trust",
]

sentiments = [
    "negative",	
    "positive",
]

In [39]:
df_nrc = pd.read_csv("NRC.txt", sep="\t", names=["word", "label", "amount"])

words_to_labels = {}

for _, row in df_nrc.iterrows():
    words_to_labels[row["word"]] = words_to_labels.get(row["word"], np.zeros((10, 1)))
    words_to_labels[row["word"]][labels.index(row["label"])] = row["amount"]

In [40]:
df_label = df.copy()
for label in labels:
    df_label[label] = 0

In [41]:
for cnt in range(len(df)):
    lyrics = df["lyrics"][cnt]

    if type(lyrics) != str:
        continue

    label_this = np.zeros((10, 1))

    for word in lyrics.split():
        if word in words_to_labels:
            label_this += words_to_labels[word]
            
    emotion_this = label_this[:8]
    sentiment_this = label_this[8:]
            
    if np.linalg.norm(emotion_this) != 0:
        emotion_this /= np.linalg.norm(emotion_this)
        
    if np.linalg.norm(sentiment_this) != 0:
        sentiment_this /= np.linalg.norm(sentiment_this)
        
    label_this = np.concatenate((emotion_this, sentiment_this), axis=None)
    
    for label in labels:
        df_label[label][cnt] = label_this[labels.index(label)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label[label][cnt] = label_this[labels.index(label)]
  df_label[label][cnt] = label_this[labels.index(label)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label[label][cnt] = label_this[labels.index(label)]
  df_label[label][cnt] = label_this[labels.index(label)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label[label][cnt] = label_this[labels.index(label)]
  df_label[label][cnt] = label_this[labels.index(label)]
A value is trying to be set on a copy of

In [42]:
df_emotion_global = df_label[df_label["grouping"].isin(global_group)]
mean_emotion_global = df_emotion_global[emotions].mean()
normalized_mean_emotion_global = mean_emotion_global / np.linalg.norm(mean_emotion_global)
mean_sentiment_global = df_emotion_global[sentiments].mean()
normalized_mean_sentiment_global = mean_sentiment_global / np.linalg.norm(mean_sentiment_global)

df_emotion_kpop = df_label[df_label["grouping"].isin(kpop_group)]
mean_emotion_kpop = df_emotion_kpop[emotions].mean()
normalized_mean_emotion_kpop = mean_emotion_kpop / np.linalg.norm(mean_emotion_kpop)
mean_sentiment_kpop = df_emotion_kpop[sentiments].mean()
normalized_mean_sentiment_kpop = mean_sentiment_kpop / np.linalg.norm(mean_sentiment_kpop)

df_emotion_target = df_label[df_label["grouping"] == target_group]
mean_emotion_target = df_emotion_target[emotions].mean()
normalized_mean_emotion_target = mean_emotion_target / np.linalg.norm(mean_emotion_target)
mean_sentiment_target = df_emotion_target[sentiments].mean()
normalized_mean_sentiment_target = mean_sentiment_target / np.linalg.norm(mean_sentiment_target)

In [43]:
normalized_mean_emotion_target

anger           0.438379
anticipation    0.375822
disgust         0.170637
fear            0.330805
joy             0.493340
sadness         0.322054
surprise        0.200253
trust           0.375272
dtype: float64

In [44]:
normalized_mean_sentiment_target

negative    0.688952
positive    0.724807
dtype: float64

against global

In [45]:
emotion_percentage_global = (normalized_mean_emotion_target - normalized_mean_emotion_global) / normalized_mean_emotion_global * 100

emotion_percentage_global

anger           45.648572
anticipation    -8.772393
disgust        -12.205216
fear            -0.224858
joy             -0.273445
sadness        -12.341296
surprise       -15.363607
trust           -5.082505
dtype: float64

In [46]:
sentiment_percentage_global = (normalized_mean_sentiment_target - normalized_mean_sentiment_global) / normalized_mean_sentiment_global * 100

sentiment_percentage_global

negative    5.049715
positive   -3.986985
dtype: float64

against KPOP

In [47]:
emotion_percentage_kpop = (normalized_mean_emotion_target - normalized_mean_emotion_kpop) / normalized_mean_emotion_kpop * 100

emotion_percentage_kpop

anger           97.407605
anticipation   -17.561263
disgust         22.265617
fear            11.910441
joy            -14.256188
sadness          1.878356
surprise       -15.187418
trust           -2.871784
dtype: float64

In [48]:
sentiment_percentage_kpop = (normalized_mean_sentiment_target - normalized_mean_sentiment_kpop) / normalized_mean_sentiment_kpop * 100

sentiment_percentage_kpop

negative    35.117273
positive   -15.743482
dtype: float64

## Save Data

In [49]:
# to float
for _sum in lda_summer_global:
    _sum[2] = [(tup[0], float(tup[1])) for tup in _sum[2]]
    
for _sum in lda_summer_kpop:
    _sum[2] = [(tup[0], float(tup[1])) for tup in _sum[2]]
    
for count in lda_counter_global:
    count[2] = [(tup[0], float(tup[1])) for tup in count[2]]

for count in lda_counter_kpop:
    count[2] = [(tup[0], float(tup[1])) for tup in count[2]]

In [50]:
json_data = {
    "tfidf_counter_global": tfidf_counter_global,
    "tfidf_summer_global": tfidf_summer_global,
    "tfidf_counter_kpop": tfidf_counter_kpop,
    "tfidf_summer_kpop": tfidf_summer_kpop,
    "lda_counter_global": lda_counter_global,
    "lda_summer_global": lda_summer_global,
    "lda_counter_kpop": lda_counter_kpop,
    "lda_summer_kpop": lda_summer_kpop,
    "lda_html_global": lda_html_global,
    "lda_html_kpop": lda_html_kpop,
    "normalized_mean_emotion_target": normalized_mean_emotion_target.astype(float).tolist(),
    "sentiment_percentage_global": sentiment_percentage_global.astype(float).tolist(),
    "emotion_percentage_global": emotion_percentage_global.astype(float).tolist(),
    "sentiment_percentage_global": sentiment_percentage_global.astype(float).tolist(),
    "emotion_percentage_kpop": emotion_percentage_kpop.astype(float).tolist(),
    "sentiment_percentage_kpop": sentiment_percentage_kpop.astype(float).tolist(),
}

In [51]:
import json
with open("data_"+ target_group +".json", "w") as f:
    json.dump(json_data, f)