# Lyrics Analysis

In [52]:
lyrics_directory = "Lyrics_Data/"
tracks_csv = "tracks_group.csv"
stopwords_file = "stopwords.txt"

In [53]:
target_group = "itzy"
kpop_group = set(['kpop', 'itzy', 'seventeen', 'bts', 'twice'])
global_group = set(['global', 'billboard'])

target_and_kpop = kpop_group.union(set([target_group]))
target_and_global = global_group.union(set([target_group]))

## Import data

In [54]:
import pandas as pd

In [55]:
df = pd.read_csv(tracks_csv)

df.head(3)

Unnamed: 0,track_name,track_id,grouping
0,Ice Cream (with Selena Gomez),2J4P46vCFm1rPkNkp9pZWX,kpop
1,Dynamite,0v1x6rN6JHRapa03JElljE,kpop
2,THE BADDEST,2V4Fx72svQRxrFvNT1eq5f,kpop


In [56]:
df.shape

(4618, 3)

In [57]:
df["grouping"].unique()

array(['kpop', 'itzy', 'seventeen', 'bts', 'twice', 'global', 'billboard'],
      dtype=object)

In [58]:
def get_lyrics(track_id):
    try:
        lyrics = ""
        with open(lyrics_directory + track_id + ".txt", "r") as f:
            lyrics = f.read()
            
        return lyrics            
    except:
        return None
    
df["lyrics"] = df["track_id"].apply(get_lyrics)

df.shape

(4618, 4)

In [59]:
# stop words
# https://www.kaggle.com/datasets/rowhitswami/stopwords/
stopwords = set()
with open(stopwords_file, "r") as f:
    stopwords = set(f.read().split("\n"))
    
stopwords

{',',
 '>',
 '?',
 'Just',
 'Yes',
 'a',
 'abaft',
 'abafter',
 'abaftest',
 'about',
 'abouter',
 'aboutest',
 'above',
 'abover',
 'abovest',
 'accordingly',
 'aer',
 'aest',
 'afore',
 'after',
 'afterer',
 'afterest',
 'afterward',
 'afterwards',
 'again',
 'against',
 'aid',
 'ain',
 'albeit',
 'all',
 'aller',
 'allest',
 'alls',
 'allyou',
 'almost',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'andor',
 'anear',
 'anent',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anywhere',
 'apart',
 'aparter',
 'apartest',
 'appear',
 'appeared',
 'appearing',
 'appears',
 'appropriate',
 'appropriated',
 'appropriater',
 'appropriates',
 'appropriatest',
 'appropriating',
 'are',
 'ares',
 'around',
 'as',
 'ases',
 'aside',
 'asides',
 'aslant',
 'astraddle',
 'astraddler',
 'astraddlest',
 'astride',
 'astrider',
 'astridest',
 'at',
 'athwart',
 'atop',
 'atween',
 'aught',
 'aught

## Analyze

In [60]:
def is_stopword(word):
    word = word.lower()
    
    if len(word) <= 1:
        return True
    
    # in the list
    if word in stopwords:
        return True
    
    # is Korean
    if ord("가") <= ord(word[0]) <= ord("힣"):
        return True
    
    return False

### Characteristic Keyword
Using TF-IDF

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Among global

In [62]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [63]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [64]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + float(tfidf_series[keyword])

In [65]:
def sort_dictionary(dictionary):
    return [(k, dictionary[k]) for k in sorted(dictionary, key=dictionary.get, reverse=True)]

In [66]:
tfidf_counter_global = sort_dictionary(counter)
tfidf_counter_global

[('sorry', 23),
 ('love', 22),
 ('mind', 22),
 ('yes', 21),
 ('let', 20),
 ('say', 19),
 ('tell', 14),
 ('feel', 14),
 ('baby', 13),
 ('down', 13),
 ('heart', 13),
 ('keep', 13),
 ('take', 12),
 ('care', 12),
 ('back', 12),
 ('bad', 11),
 ('eyes', 11),
 ('matter', 11),
 ('crazy', 11),
 ('think', 11),
 ('show', 10),
 ('control', 10),
 ('need', 10),
 ('room', 10),
 ('ah', 10),
 ('am', 9),
 ('turn', 9),
 ('put', 9),
 ('little', 9),
 ('forget', 9),
 ('way', 9),
 ('home', 9),
 ('only', 8),
 ('hear', 8),
 ('coming', 8),
 ('girl', 8),
 ('game', 8),
 ('doesn', 7),
 ('face', 7),
 ('honey', 7),
 ('ready', 7),
 ('break', 7),
 ('dance', 7),
 ('worry', 7),
 ('okay', 7),
 ('stay', 7),
 ('follow', 7),
 ('play', 7),
 ('head', 7),
 ('high', 7),
 ('hit', 7),
 ('light', 7),
 ('world', 7),
 ('cuz', 6),
 ('give', 6),
 ('child', 6),
 ('fight', 6),
 ('kill', 6),
 ('feeling', 6),
 ('uh', 6),
 ('boy', 6),
 ('didn', 6),
 ('start', 6),
 ('still', 6),
 ('top', 6),
 ('new', 6),
 ('free', 6),
 ('end', 6),
 ('lost',

In [67]:
tfidf_summer_global = sort_dictionary(summer)
tfidf_summer_global

[('love', 3.2111590137486092),
 ('sorry', 3.1624247485449186),
 ('trust', 2.3862535651729093),
 ('hot', 2.352824882534737),
 ('woo', 2.1142086197441174),
 ('mind', 2.1046574503957345),
 ('put', 2.020884450575801),
 ('flower', 1.980310452818118),
 ('em', 1.9626333700933452),
 ('shy', 1.9226858042564654),
 ('ready', 1.9188426805354042),
 ('worry', 1.837092103431801),
 ('yes', 1.8106564463544794),
 ('business', 1.8082325752768063),
 ('fall', 1.8011038824731416),
 ('heart', 1.7928345825331622),
 ('care', 1.7165829596424889),
 ('chi', 1.7042387770576761),
 ('keep', 1.6902832699626618),
 ('taking', 1.608198525916642),
 ('matter', 1.5818529324911066),
 ('home', 1.5356698179913768),
 ('finally', 1.5236500520493004),
 ('crazy', 1.510161642126581),
 ('think', 1.505804405173682),
 ('tomorrow', 1.4769442397116785),
 ('ah', 1.4542463159967527),
 ('words', 1.4389624091552111),
 ('control', 1.411335330255775),
 ('hit', 1.366162842085624),
 ('break', 1.3357711651378272),
 ('stronger', 1.28558964433042

#### Among KPOP

In [68]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [69]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [70]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + float(tfidf_series[keyword])

In [71]:
tfidf_counter_kpop = sort_dictionary(counter)
tfidf_counter_kpop

[('sorry', 23),
 ('love', 22),
 ('mind', 22),
 ('yes', 21),
 ('let', 20),
 ('say', 19),
 ('tell', 14),
 ('feel', 14),
 ('baby', 13),
 ('down', 13),
 ('heart', 13),
 ('keep', 13),
 ('take', 12),
 ('care', 12),
 ('back', 12),
 ('bad', 11),
 ('eyes', 11),
 ('matter', 11),
 ('crazy', 11),
 ('think', 11),
 ('show', 10),
 ('control', 10),
 ('need', 10),
 ('room', 10),
 ('ah', 10),
 ('am', 9),
 ('turn', 9),
 ('put', 9),
 ('little', 9),
 ('forget', 9),
 ('way', 9),
 ('home', 9),
 ('only', 8),
 ('hear', 8),
 ('coming', 8),
 ('girl', 8),
 ('game', 8),
 ('doesn', 7),
 ('face', 7),
 ('honey', 7),
 ('ready', 7),
 ('한국', 7),
 ('break', 7),
 ('dance', 7),
 ('worry', 7),
 ('okay', 7),
 ('stay', 7),
 ('follow', 7),
 ('play', 7),
 ('head', 7),
 ('high', 7),
 ('informazioni', 7),
 ('hit', 7),
 ('light', 7),
 ('world', 7),
 ('cuz', 6),
 ('give', 6),
 ('territory', 6),
 ('child', 6),
 ('fight', 6),
 ('kill', 6),
 ('feeling', 6),
 ('uh', 6),
 ('boy', 6),
 ('didn', 6),
 ('start', 6),
 ('still', 6),
 ('top', 

In [72]:
tfidf_summer_kpop = sort_dictionary(summer)
tfidf_summer_kpop

[('love', 3.7295123013654305),
 ('put', 2.4826616351462447),
 ('trust', 2.4324877675268377),
 ('hot', 2.3126303347836705),
 ('keep', 2.0253074558606508),
 ('woo', 1.9559731310647535),
 ('flower', 1.9553951154721705),
 ('crazy', 1.9351914543271769),
 ('fall', 1.896361950718139),
 ('worry', 1.889137009235887),
 ('mind', 1.8725577824369803),
 ('shy', 1.8625770823631649),
 ('business', 1.858239331562063),
 ('blah', 1.7916503324920754),
 ('한국', 1.7395776915919268),
 ('yes', 1.6725149283019323),
 ('em', 1.6580317150045465),
 ('control', 1.6275816088166775),
 ('sorry', 1.6181170837152725),
 ('think', 1.6029053675806249),
 ('hit', 1.5946636358702568),
 ('heart', 1.4564988011678022),
 ('down', 1.4448309833777575),
 ('ah', 1.4297295710406557),
 ('ready', 1.4238208197606785),
 ('say', 1.3873513044732313),
 ('matter', 1.3653434369426067),
 ('bad', 1.348649525140968),
 ('only', 1.3440981678289643),
 ('home', 1.3414986661490813),
 ('break', 1.3410441116279574),
 ('tell', 1.3296895882414261),
 ('care

### Topic analysis

In [73]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [74]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
def save_lda_vis(lda, corpus, dictionary, filename):
    vis = gensimvis.prepare(lda, corpus, dictionary)
    pyLDAvis.save_html(vis, filename)
    
    with open(filename, "r") as f:
        html = f.read()
        
    return html, vis

In [75]:
def pre_process(df_lyrics):
    # tokenize lyrics
    lyrics_processed = df_lyrics.fillna("").str.split()

    for _list in lyrics_processed:
        for cnt in range(len(_list)):
            _list[cnt] = _list[cnt].replace(".", "").replace(",", "").replace("?", "").replace("\\", "").replace("/", "").replace(":", "").lower()
            
            if is_stopword(_list[cnt]):
                _list[cnt] = ""
                
    # remove empty string
    for cnt in range(len(lyrics_processed)):
        while(True):
            try:
                lyrics_processed[cnt].remove("")
            except:
                break

    return lyrics_processed

#### Among global

In [76]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [77]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

all topics

In [78]:
lda_html_global, vis = save_lda_vis(lda, corpus, dictionary, "lda.html")

vis

PreparedData(topic_coordinates=              x         y  topics  cluster      Freq
topic                                               
50    -0.106252  0.010924       1        1  3.447687
52    -0.079181  0.031054       2        1  3.211360
38    -0.056493  0.052581       3        1  2.660447
96    -0.037085  0.049647       4        1  2.611651
16    -0.065287  0.031986       5        1  2.608120
...         ...       ...     ...      ...       ...
40     0.038524 -0.035555      96        1  0.302896
91     0.123180 -0.069258      97        1  0.286579
97     0.076043 -0.024657      98        1  0.277863
36     0.111926 -0.059627      99        1  0.192615
57     0.089308 -0.057801     100        1  0.188692

[100 rows x 5 columns], topic_info=         Term         Freq        Total  Category  logprob  loglift
4853   labour  5770.000000  5770.000000   Default  30.0000  30.0000
88       baby  4774.000000  4774.000000   Default  29.0000  29.0000
862      cake  1771.000000  1771.000000 

what topics are in kpop?

In [79]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + float(topic[1])

In [80]:
_lda_counter_global = sort_dictionary(counter)
lda_counter_global = []
for tup in _lda_counter_global:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_counter_global.append(new_tup)
    
lda_counter_global

[[44,
  5,
  [('nana', 0.03782504),
   ('think', 0.034090456),
   ('put', 0.033308737),
   ('need', 0.015609178),
   ('cut', 0.015175978),
   ("'em", 0.011700819),
   ('love', 0.011430544),
   ('baby', 0.010039039),
   ('club', 0.009904735),
   ('tell', 0.009518084)]],
 [97,
  3,
  [('piccola', 0.036875468),
   ('shy', 0.030599197),
   ('questo', 0.022342665),
   ('cha', 0.020276053),
   ('sta', 0.01774291),
   ('di', 0.016283667),
   ('grande', 0.014236953),
   ('labour', 0.011173741),
   ('know!', 0.010785065),
   ('tutto', 0.009487417)]],
 [7,
  3,
  [("let's", 0.09923697),
   ('fun', 0.030544406),
   ('mob', 0.02171706),
   ('sex', 0.020838248),
   ('dance', 0.015594882),
   ('safe', 0.01374731),
   ('talk', 0.013193416),
   ('girls', 0.013166148),
   ('healthy', 0.013075118),
   ('back', 0.011342449)]],
 [33,
  3,
  [('child', 0.05005783),
   ('take', 0.020907449),
   ('baby', 0.0135773765),
   ('fly', 0.011886623),
   ('let', 0.010044382),
   ('son', 0.008777405),
   ('girl', 0.0

In [81]:
_lda_summer_global = sort_dictionary(summer)
lda_summer_global = []
for tup in _lda_summer_global:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_summer_global.append(new_tup)
    
lda_summer_global

[[44,
  3.9489017710161534,
  [('nana', 0.03782504),
   ('think', 0.034090456),
   ('put', 0.033308737),
   ('need', 0.015609178),
   ('cut', 0.015175978),
   ("'em", 0.011700819),
   ('love', 0.011430544),
   ('baby', 0.010039039),
   ('club', 0.009904735),
   ('tell', 0.009518084)]],
 [52,
  3.0216233902115164,
  [('take', 0.024875551),
   ('way', 0.021792656),
   ("can't", 0.019393234),
   ('girl', 0.018301029),
   ('love', 0.017485708),
   ('away', 0.011751763),
   ('mom', 0.010795083),
   ('let', 0.009961639),
   ('mind', 0.009499806),
   ('night', 0.009058922)]],
 [24,
  2.873195454857978,
  [('mine', 0.04640762),
   ('worry', 0.04266631),
   ('body', 0.03187231),
   ('yes', 0.028897451),
   ('innocence', 0.025072314),
   ('love', 0.022386098),
   ('yell', 0.019982588),
   ("chi't", 0.01627605),
   ('hell', 0.014309908),
   ("'cence", 0.013501894)]],
 [7,
  2.794071640529637,
  [("let's", 0.09923697),
   ('fun', 0.030544406),
   ('mob', 0.02171706),
   ('sex', 0.020838248),
   ('

#### Among KPOP

In [82]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [83]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

# observe topics
topics = lda.print_topics()

In [84]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + float(topic[1])


In [85]:
lda_html_kpop, vis = save_lda_vis(lda, corpus, dictionary, "lda.html")

vis

PreparedData(topic_coordinates=              x         y  topics  cluster      Freq
topic                                               
32    -0.139833  0.089456       1        1  3.549436
63    -0.306511  0.010223       2        1  2.464509
22     0.023461  0.076368       3        1  2.350275
16    -0.090061  0.056019       4        1  2.217348
58     0.048124  0.097781       5        1  1.962754
...         ...       ...     ...      ...       ...
79     0.022952 -0.068418      96        1  0.409604
95     0.022548 -0.086543      97        1  0.409210
94    -0.218029 -0.013233      98        1  0.368333
45     0.018956 -0.041477      99        1  0.366943
15     0.031840 -0.117753     100        1  0.337412

[100 rows x 5 columns], topic_info=        Term         Freq        Total  Category  logprob  loglift
280    child  3969.000000  3969.000000   Default  30.0000  30.0000
425    drunk  2394.000000  2394.000000   Default  29.0000  29.0000
494     cake  1146.000000  1146.000000   De

In [86]:
_lda_counter_kpop = sort_dictionary(counter)
lda_counter_kpop = []
for tup in _lda_counter_kpop:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_counter_kpop.append(new_tup)
    
lda_counter_kpop

[[53,
  4,
  [('put', 0.025768993),
   ("we're", 0.02337015),
   ('bang', 0.01986733),
   ('tough', 0.013178399),
   ('sneakers', 0.012986793),
   ('boy', 0.012752493),
   ('boyfriend', 0.011893028),
   ('home', 0.011206723),
   ('baby', 0.009959479),
   ("'em", 0.009826529)]],
 [73,
  4,
  [('prova', 0.062297884),
   ('baby', 0.02765523),
   ('hit', 0.012464166),
   ('love', 0.010909128),
   ('grande', 0.010776444),
   ('yes', 0.009619778),
   ('home', 0.0089818705),
   ('fire', 0.008293454),
   ('di', 0.007978254),
   ('say', 0.0079698665)]],
 [31,
  3,
  [('oriente', 0.037215024),
   ('power', 0.033337314),
   ('grande', 0.027297042),
   ('love', 0.020322705),
   ('shy', 0.017341837),
   ('think', 0.014016291),
   ('id', 0.013897702),
   ('heart', 0.013233092),
   ('turn', 0.012878959),
   ('leave', 0.012847771)]],
 [33,
  3,
  [('yah', 0.03115656),
   ('cuse', 0.021487612),
   ('take', 0.014342201),
   ('mob', 0.01302476),
   ('honey', 0.0127610415),
   ('excuse', 0.0127327945),
  

In [87]:
_lda_summer_kpop = sort_dictionary(summer)
lda_summer_kpop = []
for tup in _lda_summer_kpop:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_summer_kpop.append(new_tup)
    
lda_summer_kpop

[[73,
  3.5748723389451698,
  [('prova', 0.062297884),
   ('baby', 0.02765523),
   ('hit', 0.012464166),
   ('love', 0.010909128),
   ('grande', 0.010776444),
   ('yes', 0.009619778),
   ('home', 0.0089818705),
   ('fire', 0.008293454),
   ('di', 0.007978254),
   ('say', 0.0079698665)]],
 [53,
  3.526087677572832,
  [('put', 0.025768993),
   ("we're", 0.02337015),
   ('bang', 0.01986733),
   ('tough', 0.013178399),
   ('sneakers', 0.012986793),
   ('boy', 0.012752493),
   ('boyfriend', 0.011893028),
   ('home', 0.011206723),
   ('baby', 0.009959479),
   ("'em", 0.009826529)]],
 [31,
  3.352862980090322,
  [('oriente', 0.037215024),
   ('power', 0.033337314),
   ('grande', 0.027297042),
   ('love', 0.020322705),
   ('shy', 0.017341837),
   ('think', 0.014016291),
   ('id', 0.013897702),
   ('heart', 0.013233092),
   ('turn', 0.012878959),
   ('leave', 0.012847771)]],
 [54,
  3.1510283107745636,
  [('cake', 0.41132516),
   ('top', 0.008413903),
   ('spicy', 0.0075659393),
   ('take', 0.0

### Emotion analysis
Emotions:  
fear
anger
anticipation
trust
surprise
positive
negative
sadness
disgust
joy

In [88]:
import numpy as np

In [89]:
labels = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "sadness",
    "surprise",	
    "trust",
    "negative",	
    "positive",
]

emotions = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "sadness",
    "surprise",	
    "trust",
]

sentiments = [
    "negative",	
    "positive",
]

In [90]:
df_nrc = pd.read_csv("NRC.txt", sep="\t", names=["word", "label", "amount"])

words_to_labels = {}

for _, row in df_nrc.iterrows():
    words_to_labels[row["word"]] = words_to_labels.get(row["word"], np.zeros((10, 1)))
    words_to_labels[row["word"]][labels.index(row["label"])] = row["amount"]

In [91]:
df_label = df.copy()
for label in labels:
    df_label[label] = 0

In [92]:
for cnt in range(len(df)):
    lyrics = df["lyrics"][cnt]

    if type(lyrics) != str:
        continue

    label_this = np.zeros((10, 1))

    for word in lyrics.split():
        if word in words_to_labels:
            label_this += words_to_labels[word]
            
    emotion_this = label_this[:8]
    sentiment_this = label_this[8:]
            
    if np.linalg.norm(emotion_this) != 0:
        emotion_this /= np.linalg.norm(emotion_this)
        
    if np.linalg.norm(sentiment_this) != 0:
        sentiment_this /= np.linalg.norm(sentiment_this)
        
    label_this = np.concatenate((emotion_this, sentiment_this), axis=None)
    
    for label in labels:
        df_label[label][cnt] = label_this[labels.index(label)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label[label][cnt] = label_this[labels.index(label)]
  df_label[label][cnt] = label_this[labels.index(label)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label[label][cnt] = label_this[labels.index(label)]
  df_label[label][cnt] = label_this[labels.index(label)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label[label][cnt] = label_this[labels.index(label)]
  df_label[label][cnt] = label_this[labels.index(label)]
A value is trying to be set on a copy of

In [93]:
df_emotion_global = df_label[df_label["grouping"].isin(global_group)]
mean_emotion_global = df_emotion_global[emotions].mean()
normalized_mean_emotion_global = mean_emotion_global / np.linalg.norm(mean_emotion_global)
mean_sentiment_global = df_emotion_global[sentiments].mean()
normalized_mean_sentiment_global = mean_sentiment_global / np.linalg.norm(mean_sentiment_global)

df_emotion_kpop = df_label[df_label["grouping"].isin(kpop_group)]
mean_emotion_kpop = df_emotion_kpop[emotions].mean()
normalized_mean_emotion_kpop = mean_emotion_kpop / np.linalg.norm(mean_emotion_kpop)
mean_sentiment_kpop = df_emotion_kpop[sentiments].mean()
normalized_mean_sentiment_kpop = mean_sentiment_kpop / np.linalg.norm(mean_sentiment_kpop)

df_emotion_target = df_label[df_label["grouping"] == target_group]
mean_emotion_target = df_emotion_target[emotions].mean()
normalized_mean_emotion_target = mean_emotion_target / np.linalg.norm(mean_emotion_target)
mean_sentiment_target = df_emotion_target[sentiments].mean()
normalized_mean_sentiment_target = mean_sentiment_target / np.linalg.norm(mean_sentiment_target)

In [94]:
normalized_mean_emotion_target

anger           0.438379
anticipation    0.375822
disgust         0.170637
fear            0.330805
joy             0.493340
sadness         0.322054
surprise        0.200253
trust           0.375272
dtype: float64

In [95]:
normalized_mean_sentiment_target

negative    0.688952
positive    0.724807
dtype: float64

against global

In [96]:
emotion_percentage_global = (normalized_mean_emotion_target - normalized_mean_emotion_global) / normalized_mean_emotion_global * 100

emotion_percentage_global

anger           45.648572
anticipation    -8.772393
disgust        -12.205216
fear            -0.224858
joy             -0.273445
sadness        -12.341296
surprise       -15.363607
trust           -5.082505
dtype: float64

In [97]:
sentiment_percentage_global = (normalized_mean_sentiment_target - normalized_mean_sentiment_global) / normalized_mean_sentiment_global * 100

sentiment_percentage_global

negative    5.049715
positive   -3.986985
dtype: float64

against KPOP

In [98]:
emotion_percentage_kpop = (normalized_mean_emotion_target - normalized_mean_emotion_kpop) / normalized_mean_emotion_kpop * 100

emotion_percentage_kpop

anger           97.407605
anticipation   -17.561263
disgust         22.265617
fear            11.910441
joy            -14.256188
sadness          1.878356
surprise       -15.187418
trust           -2.871784
dtype: float64

In [99]:
sentiment_percentage_kpop = (normalized_mean_sentiment_target - normalized_mean_sentiment_kpop) / normalized_mean_sentiment_kpop * 100

sentiment_percentage_kpop

negative    35.117273
positive   -15.743482
dtype: float64

## Save Data

In [100]:
# to float
for _sum in lda_summer_global:
    _sum[2] = [(tup[0], float(tup[1])) for tup in _sum[2]]
    
for _sum in lda_summer_kpop:
    _sum[2] = [(tup[0], float(tup[1])) for tup in _sum[2]]
    
for count in lda_counter_global:
    count[2] = [(tup[0], float(tup[1])) for tup in count[2]]

for count in lda_counter_kpop:
    count[2] = [(tup[0], float(tup[1])) for tup in count[2]]

In [101]:
json_data = {
    "tfidf_counter_global": tfidf_counter_global,
    "tfidf_summer_global": tfidf_summer_global,
    "tfidf_counter_kpop": tfidf_counter_kpop,
    "tfidf_summer_kpop": tfidf_summer_kpop,
    "lda_counter_global": lda_counter_global,
    "lda_summer_global": lda_summer_global,
    "lda_counter_kpop": lda_counter_kpop,
    "lda_summer_kpop": lda_summer_kpop,
    "lda_html_global": lda_html_global,
    "lda_html_kpop": lda_html_kpop,
    "normalized_mean_emotion_target": normalized_mean_emotion_target.astype(float).tolist(),
    "normalized_mean_sentiment_target": normalized_mean_sentiment_target.astype(float).tolist(),
    "emotion_percentage_global": emotion_percentage_global.astype(float).tolist(),
    "sentiment_percentage_global": sentiment_percentage_global.astype(float).tolist(),
    "emotion_percentage_kpop": emotion_percentage_kpop.astype(float).tolist(),
    "sentiment_percentage_kpop": sentiment_percentage_kpop.astype(float).tolist(),
}

In [102]:
import json
with open("data_"+ target_group +".json", "w") as f:
    json.dump(json_data, f)