# Lyrics Analysis

In [52]:
lyrics_directory = "Lyrics_Data/"
tracks_csv = "tracks_group.csv"
stopwords_file = "stopwords.txt"

In [53]:
target_group = "bts"
kpop_group = set(['kpop', 'itzy', 'seventeen', 'bts', 'twice'])
global_group = set(['global', 'billboard'])

target_and_kpop = kpop_group.union(set([target_group]))
target_and_global = global_group.union(set([target_group]))

## Import data

In [54]:
import pandas as pd

In [55]:
df = pd.read_csv(tracks_csv)

df.head(3)

Unnamed: 0,track_name,track_id,grouping
0,Ice Cream (with Selena Gomez),2J4P46vCFm1rPkNkp9pZWX,kpop
1,Dynamite,0v1x6rN6JHRapa03JElljE,kpop
2,THE BADDEST,2V4Fx72svQRxrFvNT1eq5f,kpop


In [56]:
df.shape

(4618, 3)

In [57]:
df["grouping"].unique()

array(['kpop', 'itzy', 'seventeen', 'bts', 'twice', 'global', 'billboard'],
      dtype=object)

In [58]:
def get_lyrics(track_id):
    try:
        lyrics = ""
        with open(lyrics_directory + track_id + ".txt", "r") as f:
            lyrics = f.read()
            
        return lyrics            
    except:
        return None
    
df["lyrics"] = df["track_id"].apply(get_lyrics)

df.shape

(4618, 4)

In [59]:
# stop words
# https://www.kaggle.com/datasets/rowhitswami/stopwords/
stopwords = set()
with open(stopwords_file, "r") as f:
    stopwords = set(f.read().split("\n"))
    
stopwords

{',',
 '>',
 '?',
 'Just',
 'Yes',
 'a',
 'abaft',
 'abafter',
 'abaftest',
 'about',
 'abouter',
 'aboutest',
 'above',
 'abover',
 'abovest',
 'accordingly',
 'aer',
 'aest',
 'afore',
 'after',
 'afterer',
 'afterest',
 'afterward',
 'afterwards',
 'again',
 'against',
 'aid',
 'ain',
 'albeit',
 'all',
 'aller',
 'allest',
 'alls',
 'allyou',
 'almost',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'andor',
 'anear',
 'anent',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anywhere',
 'apart',
 'aparter',
 'apartest',
 'appear',
 'appeared',
 'appearing',
 'appears',
 'appropriate',
 'appropriated',
 'appropriater',
 'appropriates',
 'appropriatest',
 'appropriating',
 'are',
 'ares',
 'around',
 'as',
 'ases',
 'aside',
 'asides',
 'aslant',
 'astraddle',
 'astraddler',
 'astraddlest',
 'astride',
 'astrider',
 'astridest',
 'at',
 'athwart',
 'atop',
 'atween',
 'aught',
 'aught

## Analyze

In [60]:
def is_stopword(word):
    word = word.lower()
    
    if len(word) <= 1:
        return True
    
    # in the list
    if word in stopwords:
        return True
    
    # is Korean
    if ord("가") <= ord(word[0]) <= ord("힣"):
        return True
    
    return False

### Characteristic Keyword
Using TF-IDF

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Among global

In [62]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [63]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [64]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + float(tfidf_series[keyword])

In [65]:
def sort_dictionary(dictionary):
    return [(k, dictionary[k]) for k in sorted(dictionary, key=dictionary.get, reverse=True)]

In [66]:
tfidf_counter_global = sort_dictionary(counter)
tfidf_counter_global

[('home', 111),
 ('love', 99),
 ('let', 93),
 ('sorry', 92),
 ('news', 87),
 ('life', 76),
 ('night', 73),
 ('say', 66),
 ('world', 63),
 ('heart', 61),
 ('think', 59),
 ('eyes', 57),
 ('dream', 54),
 ('only', 53),
 ('am', 52),
 ('take', 52),
 ('yes', 51),
 ('new', 51),
 ('back', 49),
 ('baby', 49),
 ('still', 48),
 ('light', 47),
 ('keep', 46),
 ('follow', 45),
 ('contact', 41),
 ('room', 40),
 ('end', 40),
 ('man', 39),
 ('leave', 38),
 ('tell', 37),
 ('feel', 36),
 ('way', 33),
 ('need', 33),
 ('hand', 32),
 ('girl', 32),
 ('fly', 31),
 ('away', 31),
 ('first', 30),
 ('morning', 30),
 ('top', 29),
 ('body', 29),
 ('dance', 28),
 ('true', 28),
 ('people', 28),
 ('start', 28),
 ('little', 28),
 ('star', 27),
 ('hands', 27),
 ('name', 27),
 ('give', 27),
 ('live', 27),
 ('shut', 27),
 ('god', 26),
 ('sky', 26),
 ('call', 26),
 ('care', 26),
 ('change', 26),
 ('okay', 26),
 ('hear', 26),
 ('money', 26),
 ('sleep', 26),
 ('high', 26),
 ('boy', 25),
 ('crazy', 25),
 ('looking', 25),
 ('br

In [67]:
tfidf_summer_global = sort_dictionary(summer)
tfidf_summer_global

[('sorry', 18.545721208654804),
 ('home', 17.067671100778245),
 ('love', 16.735009466826803),
 ('news', 12.674093693227155),
 ('contact', 9.22333997829706),
 ('한국', 8.222223505021587),
 ('dream', 8.030096895193802),
 ('universe', 7.9083924155994865),
 ('let', 7.608030589112444),
 ('light', 7.356065623697072),
 ('world', 7.223579992437025),
 ('night', 7.066279767766307),
 ('follow', 6.95590814499894),
 ('life', 6.71919442829168),
 ('new', 5.96204864337862),
 ('room', 5.867874875362694),
 ('best', 5.506462222532698),
 ('yes', 5.477761858632744),
 ('only', 5.458156198309174),
 ('shut', 5.449540846527193),
 ('baby', 5.403275060796169),
 ('still', 5.351539751321282),
 ('hands', 5.300967017098819),
 ('girl', 5.259574326668171),
 ('heart', 5.035193009201872),
 ('fly', 4.985401266614601),
 ('crazy', 4.7854674590072825),
 ('dynamite', 4.677174820436611),
 ('eyes', 4.604735445913241),
 ('sick', 4.51720668625965),
 ('need', 4.496714381468557),
 ('say', 4.48611354935942),
 ('end', 4.48320034049854

#### Among KPOP

In [68]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [69]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [70]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + float(tfidf_series[keyword])

In [71]:
tfidf_counter_kpop = sort_dictionary(counter)
tfidf_counter_kpop

[('home', 110),
 ('love', 99),
 ('let', 93),
 ('sorry', 92),
 ('news', 87),
 ('life', 76),
 ('night', 73),
 ('say', 66),
 ('world', 63),
 ('heart', 61),
 ('think', 59),
 ('eyes', 57),
 ('dream', 54),
 ('only', 53),
 ('am', 52),
 ('take', 52),
 ('yes', 51),
 ('new', 51),
 ('back', 49),
 ('baby', 49),
 ('still', 48),
 ('light', 47),
 ('keep', 46),
 ('follow', 45),
 ('contact', 41),
 ('room', 40),
 ('end', 40),
 ('man', 39),
 ('leave', 38),
 ('tell', 37),
 ('feel', 36),
 ('way', 33),
 ('price_varies', 33),
 ('need', 33),
 ('hand', 32),
 ('girl', 32),
 ('fly', 31),
 ('away', 31),
 ('first', 30),
 ('morning', 30),
 ('top', 29),
 ('body', 29),
 ('dance', 28),
 ('true', 28),
 ('people', 28),
 ('start', 28),
 ('little', 28),
 ('star', 27),
 ('hands', 27),
 ('name', 27),
 ('give', 27),
 ('live', 27),
 ('shut', 27),
 ('god', 26),
 ('sky', 26),
 ('call', 26),
 ('care', 26),
 ('change', 26),
 ('hear', 26),
 ('money', 26),
 ('sleep', 26),
 ('high', 26),
 ('boy', 25),
 ('okay', 25),
 ('crazy', 25),


In [72]:
tfidf_summer_kpop = sort_dictionary(summer)
tfidf_summer_kpop

[('love', 17.244766793140478),
 ('home', 13.976033105135537),
 ('sorry', 11.214499143805613),
 ('let', 8.365441239057917),
 ('life', 7.722852882428214),
 ('night', 7.388498725011722),
 ('universe', 6.86082255138718),
 ('world', 6.732148348413589),
 ('only', 6.243375393825241),
 ('dream', 6.188214244365974),
 ('contact', 6.101288404283434),
 ('still', 6.026430835348571),
 ('news', 6.025426808817155),
 ('light', 5.982532756808491),
 ('girl', 5.958198828692089),
 ('best', 5.659946319195746),
 ('say', 5.510279668081929),
 ('yes', 5.466931898775716),
 ('crazy', 5.3143404884223635),
 ('need', 5.192483267449859),
 ('hands', 5.011287717289467),
 ('new', 5.008184068024992),
 ('baby', 4.8137220314116655),
 ('fly', 4.748502194692486),
 ('follow', 4.6920595313623235),
 ('think', 4.567508302573726),
 ('heart', 4.433581534579057),
 ('stay', 4.4185032561937945),
 ('invalid', 4.404981493203845),
 ('sick', 4.40122956611715),
 ('back', 4.374582098380377),
 ('room', 4.325917759871038),
 ('money', 4.32569

### Topic analysis

In [73]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [74]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
def save_lda_vis(lda, corpus, dictionary, filename):
    vis = gensimvis.prepare(lda, corpus, dictionary)
    pyLDAvis.save_html(vis, filename)
    
    with open(filename, "r") as f:
        html = f.read()
        
    return html, vis

In [75]:
def pre_process(df_lyrics):
    # tokenize lyrics
    lyrics_processed = df_lyrics.fillna("").str.split()

    for _list in lyrics_processed:
        for cnt in range(len(_list)):
            _list[cnt] = _list[cnt].replace(".", "").replace(",", "").replace("?", "").replace("\\", "").replace("/", "").replace(":", "").lower()
            
            if is_stopword(_list[cnt]):
                _list[cnt] = ""
                
    # remove empty string
    for cnt in range(len(lyrics_processed)):
        while(True):
            try:
                lyrics_processed[cnt].remove("")
            except:
                break

    return lyrics_processed

#### Among global

In [76]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [77]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

all topics

In [78]:
lda_html_global, vis = save_lda_vis(lda, corpus, dictionary, "lda.html")

vis

PreparedData(topic_coordinates=              x         y  topics  cluster      Freq
topic                                               
47     0.077786  0.045898       1        1  3.309526
17     0.067432  0.040155       2        1  2.737718
56     0.052112  0.037631       3        1  2.645697
34     0.061447  0.040578       4        1  2.559457
61     0.060948  0.033737       5        1  2.257683
...         ...       ...     ...      ...       ...
2     -0.077358 -0.094450      96        1  0.299697
43    -0.079379 -0.076079      97        1  0.287282
87    -0.073824 -0.136478      98        1  0.283655
46    -0.114523 -0.021936      99        1  0.190818
8     -0.036035 -0.041739     100        1  0.171265

[100 rows x 5 columns], topic_info=        Term         Freq        Total  Category  logprob  loglift
4889  labour  5093.000000  5093.000000   Default  30.0000  30.0000
580    drunk  2687.000000  2687.000000   Default  29.0000  29.0000
49      baby  4765.000000  4765.000000   De

what topics are in kpop?

In [79]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + float(topic[1])

In [80]:
_lda_counter_global = sort_dictionary(counter)
lda_counter_global = []
for tup in _lda_counter_global:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_counter_global.append(new_tup)
    
lda_counter_global

[[78,
  10,
  [('hands', 0.13331361),
   ('invalid', 0.062418796),
   ('fire', 0.017524254),
   ('burn', 0.01602136),
   ('dance', 0.014433545),
   ('sorry', 0.013666346),
   ('days', 0.0117965285),
   ('world', 0.010566919),
   ('和만', 0.007988357),
   ('keep', 0.007956879)]],
 [3,
  9,
  [('ка', 0.12332662),
   ('love', 0.028707948),
   ('crazy', 0.020973472),
   ('sorry', 0.018914089),
   ('ho', 0.01710507),
   ('say', 0.012831919),
   ('only', 0.010828125),
   ('let', 0.008338018),
   ('sick', 0.008315871),
   ('bad', 0.008203264)]],
 [47,
  9,
  [('love', 0.17926116),
   ('let', 0.013879992),
   ('take', 0.009048104),
   ('leave', 0.0083952565),
   ('give', 0.008002782),
   ('baby', 0.007735279),
   ('tell', 0.0076135662),
   ('fight', 0.0073377416),
   ('need', 0.007330261),
   ('heart', 0.0072404398)]],
 [53,
  8,
  [('power', 0.014439465),
   ('honey', 0.014200584),
   ('light', 0.01390929),
   ('pain', 0.012620509),
   ('baby', 0.012530698),
   ('drive', 0.012521985),
   ('dyna

In [81]:
_lda_summer_global = sort_dictionary(summer)
lda_summer_global = []
for tup in _lda_summer_global:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_summer_global.append(new_tup)
    
lda_summer_global

[[78,
  13.167603064514878,
  [('hands', 0.13331361),
   ('invalid', 0.062418796),
   ('fire', 0.017524254),
   ('burn', 0.01602136),
   ('dance', 0.014433545),
   ('sorry', 0.013666346),
   ('days', 0.0117965285),
   ('world', 0.010566919),
   ('和만', 0.007988357),
   ('keep', 0.007956879)]],
 [47,
  11.935401925272345,
  [('love', 0.17926116),
   ('let', 0.013879992),
   ('take', 0.009048104),
   ('leave', 0.0083952565),
   ('give', 0.008002782),
   ('baby', 0.007735279),
   ('tell', 0.0076135662),
   ('fight', 0.0073377416),
   ('need', 0.007330261),
   ('heart', 0.0072404398)]],
 [3,
  10.34552857109611,
  [('ка', 0.12332662),
   ('love', 0.028707948),
   ('crazy', 0.020973472),
   ('sorry', 0.018914089),
   ('ho', 0.01710507),
   ('say', 0.012831919),
   ('only', 0.010828125),
   ('let', 0.008338018),
   ('sick', 0.008315871),
   ('bad', 0.008203264)]],
 [1,
  8.312309294143233,
  [('love', 0.038036104),
   ('yes', 0.026745727),
   ('feel', 0.013842689),
   ('need', 0.013514227),
 

#### Among KPOP

In [82]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [83]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

# observe topics
topics = lda.print_topics()

In [84]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + float(topic[1])


In [85]:
lda_html_kpop, vis = save_lda_vis(lda, corpus, dictionary, "lda.html")

vis

PreparedData(topic_coordinates=              x         y  topics  cluster      Freq
topic                                               
86    -0.374451  0.058917       1        1  3.625849
43    -0.014065  0.056756       2        1  2.993361
34     0.019674  0.063271       3        1  2.065253
85     0.002504  0.072254       4        1  1.775905
3     -0.015456 -0.068692       5        1  1.764869
...         ...       ...     ...      ...       ...
64    -0.009344 -0.037455      96        1  0.441981
27     0.016508 -0.005981      97        1  0.431228
31     0.004707 -0.056980      98        1  0.403712
89    -0.024589 -0.112450      99        1  0.381802
45    -0.021842 -0.116410     100        1  0.355709

[100 rows x 5 columns], topic_info=         Term         Freq        Total  Category  logprob  loglift
280     child  4231.000000  4231.000000   Default  30.0000  30.0000
425     drunk  2145.000000  2145.000000   Default  29.0000  29.0000
494      cake  1313.000000  1313.000000 

In [86]:
_lda_counter_kpop = sort_dictionary(counter)
lda_counter_kpop = []
for tup in _lda_counter_kpop:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_counter_kpop.append(new_tup)
    
lda_counter_kpop

[[70,
  10,
  [('yes', 0.028797416),
   ('night', 0.025968993),
   ('light', 0.019443495),
   ("let's", 0.017990965),
   ('let', 0.011282281),
   ('stars', 0.010864326),
   ('dynamite', 0.0102211125),
   ('real', 0.008132629),
   ('star', 0.0074728583),
   ('fun', 0.00732734)]],
 [66,
  10,
  [('ba', 0.047951892),
   ('feel', 0.027755052),
   ('only', 0.013790023),
   ('let', 0.013447949),
   ('step', 0.012046887),
   ('love', 0.011653859),
   ('side', 0.011518793),
   ('back', 0.01120281),
   ('girl', 0.010852658),
   ('take', 0.010486417)]],
 [2,
  10,
  [('home', 0.03234648),
   ('dan', 0.030676097),
   ('uh', 0.015109619),
   ('non', 0.012953646),
   ('sorry', 0.012107906),
   ('bam', 0.009205314),
   ('bag', 0.008486446),
   ('give', 0.007985962),
   ('mic', 0.0077129933),
   ('love', 0.007567671)]],
 [74,
  9,
  [('grande', 0.031720906),
   ('need', 0.02381972),
   ('shy', 0.021873636),
   ('love', 0.020699138),
   ('sorry', 0.0170239),
   ('ba', 0.01572025),
   ('hate', 0.015555

In [87]:
_lda_summer_kpop = sort_dictionary(summer)
lda_summer_kpop = []
for tup in _lda_summer_kpop:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_summer_kpop.append(new_tup)
    
lda_summer_kpop

[[70,
  10.955019012004868,
  [('yes', 0.028797416),
   ('night', 0.025968993),
   ('light', 0.019443495),
   ("let's", 0.017990965),
   ('let', 0.011282281),
   ('stars', 0.010864326),
   ('dynamite', 0.0102211125),
   ('real', 0.008132629),
   ('star', 0.0074728583),
   ('fun', 0.00732734)]],
 [2,
  9.724748864073717,
  [('home', 0.03234648),
   ('dan', 0.030676097),
   ('uh', 0.015109619),
   ('non', 0.012953646),
   ('sorry', 0.012107906),
   ('bam', 0.009205314),
   ('bag', 0.008486446),
   ('give', 0.007985962),
   ('mic', 0.0077129933),
   ('love', 0.007567671)]],
 [66,
  8.720727312959298,
  [('ba', 0.047951892),
   ('feel', 0.027755052),
   ('only', 0.013790023),
   ('let', 0.013447949),
   ('step', 0.012046887),
   ('love', 0.011653859),
   ('side', 0.011518793),
   ('back', 0.01120281),
   ('girl', 0.010852658),
   ('take', 0.010486417)]],
 [75,
  7.8514211804344995,
  [('invalid', 0.24193916),
   ('love', 0.028704055),
   ('flower', 0.018517949),
   ('sorry', 0.012326145),


### Emotion analysis
Emotions:  
fear
anger
anticipation
trust
surprise
positive
negative
sadness
disgust
joy

In [88]:
import numpy as np

In [89]:
labels = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "sadness",
    "surprise",	
    "trust",
    "negative",	
    "positive",
]

emotions = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "sadness",
    "surprise",	
    "trust",
]

sentiments = [
    "negative",	
    "positive",
]

In [90]:
df_nrc = pd.read_csv("NRC.txt", sep="\t", names=["word", "label", "amount"])

words_to_labels = {}

for _, row in df_nrc.iterrows():
    words_to_labels[row["word"]] = words_to_labels.get(row["word"], np.zeros((10, 1)))
    words_to_labels[row["word"]][labels.index(row["label"])] = row["amount"]

In [91]:
df_label = df.copy()
for label in labels:
    df_label[label] = 0

In [92]:
for cnt in range(len(df)):
    lyrics = df["lyrics"][cnt]

    if type(lyrics) != str:
        continue

    label_this = np.zeros((10, 1))

    for word in lyrics.split():
        if word in words_to_labels:
            label_this += words_to_labels[word]
            
    emotion_this = label_this[:8]
    sentiment_this = label_this[8:]
            
    if np.linalg.norm(emotion_this) != 0:
        emotion_this /= np.linalg.norm(emotion_this)
        
    if np.linalg.norm(sentiment_this) != 0:
        sentiment_this /= np.linalg.norm(sentiment_this)
        
    label_this = np.concatenate((emotion_this, sentiment_this), axis=None)
    
    for label in labels:
        df_label[label][cnt] = label_this[labels.index(label)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label[label][cnt] = label_this[labels.index(label)]
  df_label[label][cnt] = label_this[labels.index(label)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label[label][cnt] = label_this[labels.index(label)]
  df_label[label][cnt] = label_this[labels.index(label)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label[label][cnt] = label_this[labels.index(label)]
  df_label[label][cnt] = label_this[labels.index(label)]
A value is trying to be set on a copy of

In [93]:
df_emotion_global = df_label[df_label["grouping"].isin(global_group)]
mean_emotion_global = df_emotion_global[emotions].mean()
normalized_mean_emotion_global = mean_emotion_global / np.linalg.norm(mean_emotion_global)
mean_sentiment_global = df_emotion_global[sentiments].mean()
normalized_mean_sentiment_global = mean_sentiment_global / np.linalg.norm(mean_sentiment_global)

df_emotion_kpop = df_label[df_label["grouping"].isin(kpop_group)]
mean_emotion_kpop = df_emotion_kpop[emotions].mean()
normalized_mean_emotion_kpop = mean_emotion_kpop / np.linalg.norm(mean_emotion_kpop)
mean_sentiment_kpop = df_emotion_kpop[sentiments].mean()
normalized_mean_sentiment_kpop = mean_sentiment_kpop / np.linalg.norm(mean_sentiment_kpop)

df_emotion_target = df_label[df_label["grouping"] == target_group]
mean_emotion_target = df_emotion_target[emotions].mean()
normalized_mean_emotion_target = mean_emotion_target / np.linalg.norm(mean_emotion_target)
mean_sentiment_target = df_emotion_target[sentiments].mean()
normalized_mean_sentiment_target = mean_sentiment_target / np.linalg.norm(mean_sentiment_target)

In [94]:
normalized_mean_emotion_target

anger           0.223975
anticipation    0.448027
disgust         0.150058
fear            0.327253
joy             0.538476
sadness         0.356632
surprise        0.226047
trust           0.388915
dtype: float64

In [95]:
normalized_mean_sentiment_target

negative    0.520464
positive    0.853884
dtype: float64

against global

In [96]:
emotion_percentage_global = (normalized_mean_emotion_target - normalized_mean_emotion_global) / normalized_mean_emotion_global * 100

emotion_percentage_global

anger          -25.585826
anticipation     8.754721
disgust        -22.792924
fear            -1.296164
joy              8.850710
sadness         -2.929591
surprise        -4.462042
trust           -1.631757
dtype: float64

In [97]:
sentiment_percentage_global = (normalized_mean_sentiment_target - normalized_mean_sentiment_global) / normalized_mean_sentiment_global * 100

sentiment_percentage_global

negative   -20.640925
positive    13.111409
dtype: float64

against KPOP

In [98]:
emotion_percentage_kpop = (normalized_mean_emotion_target - normalized_mean_emotion_kpop) / normalized_mean_emotion_kpop * 100

emotion_percentage_kpop

anger            0.858687
anticipation    -1.722712
disgust          7.520861
fear            10.708836
joy             -6.411338
sadness         12.816792
surprise        -4.263159
trust            0.659336
dtype: float64

In [99]:
sentiment_percentage_kpop = (normalized_mean_sentiment_target - normalized_mean_sentiment_kpop) / normalized_mean_sentiment_kpop * 100

sentiment_percentage_kpop

negative    2.073402
positive   -0.738734
dtype: float64

## Save Data

In [100]:
# to float
for _sum in lda_summer_global:
    _sum[2] = [(tup[0], float(tup[1])) for tup in _sum[2]]
    
for _sum in lda_summer_kpop:
    _sum[2] = [(tup[0], float(tup[1])) for tup in _sum[2]]
    
for count in lda_counter_global:
    count[2] = [(tup[0], float(tup[1])) for tup in count[2]]

for count in lda_counter_kpop:
    count[2] = [(tup[0], float(tup[1])) for tup in count[2]]

In [101]:
json_data = {
    "tfidf_counter_global": tfidf_counter_global,
    "tfidf_summer_global": tfidf_summer_global,
    "tfidf_counter_kpop": tfidf_counter_kpop,
    "tfidf_summer_kpop": tfidf_summer_kpop,
    "lda_counter_global": lda_counter_global,
    "lda_summer_global": lda_summer_global,
    "lda_counter_kpop": lda_counter_kpop,
    "lda_summer_kpop": lda_summer_kpop,
    "lda_html_global": lda_html_global,
    "lda_html_kpop": lda_html_kpop,
    "normalized_mean_emotion_target": normalized_mean_emotion_target.astype(float).tolist(),
    "normalized_mean_sentiment_target": normalized_mean_sentiment_target.astype(float).tolist(),
    "emotion_percentage_global": emotion_percentage_global.astype(float).tolist(),
    "sentiment_percentage_global": sentiment_percentage_global.astype(float).tolist(),
    "emotion_percentage_kpop": emotion_percentage_kpop.astype(float).tolist(),
    "sentiment_percentage_kpop": sentiment_percentage_kpop.astype(float).tolist(),
}

In [102]:
import json
with open("data_"+ target_group +".json", "w") as f:
    json.dump(json_data, f)