# Lyrics Analysis

In [64]:
lyrics_directory = "Lyrics_Data/"
tracks_csv = "tracks_group.csv"
stopwords_file = "stopwords.txt"

In [65]:
target_group = "kpop"
kpop_group = set(['kpop', 'itzy', 'seventeen', 'bts', 'twice'])
global_group = set(['global', 'billboard'])

target_and_kpop = kpop_group.union(set([target_group]))
target_and_global = global_group.union(set([target_group]))

## Import data

In [66]:
import pandas as pd

In [67]:
df = pd.read_csv(tracks_csv)

df.head(3)

Unnamed: 0,track_name,track_id,grouping
0,Ice Cream (with Selena Gomez),2J4P46vCFm1rPkNkp9pZWX,kpop
1,Dynamite,0v1x6rN6JHRapa03JElljE,kpop
2,THE BADDEST,2V4Fx72svQRxrFvNT1eq5f,kpop


In [68]:
#all to "kpop"
df.loc[df['grouping'].isin(target_and_kpop), 'grouping'] = "kpop"

In [69]:
df.shape

(4618, 3)

In [70]:
df["grouping"].unique()

array(['kpop', 'global', 'billboard'], dtype=object)

In [71]:
def get_lyrics(track_id):
    try:
        lyrics = ""
        with open(lyrics_directory + track_id + ".txt", "r") as f:
            lyrics = f.read()
            
        return lyrics            
    except:
        return None
    
df["lyrics"] = df["track_id"].apply(get_lyrics)

df.shape

(4618, 4)

In [72]:
# stop words
# https://www.kaggle.com/datasets/rowhitswami/stopwords/
stopwords = set()
with open(stopwords_file, "r") as f:
    stopwords = set(f.read().split("\n"))
    
stopwords

{',',
 '>',
 '?',
 'Just',
 'Yes',
 'a',
 'abaft',
 'abafter',
 'abaftest',
 'about',
 'abouter',
 'aboutest',
 'above',
 'abover',
 'abovest',
 'accordingly',
 'aer',
 'aest',
 'afore',
 'after',
 'afterer',
 'afterest',
 'afterward',
 'afterwards',
 'again',
 'against',
 'aid',
 'ain',
 'albeit',
 'all',
 'aller',
 'allest',
 'alls',
 'allyou',
 'almost',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'andor',
 'anear',
 'anent',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anywhere',
 'apart',
 'aparter',
 'apartest',
 'appear',
 'appeared',
 'appearing',
 'appears',
 'appropriate',
 'appropriated',
 'appropriater',
 'appropriates',
 'appropriatest',
 'appropriating',
 'are',
 'ares',
 'around',
 'as',
 'ases',
 'aside',
 'asides',
 'aslant',
 'astraddle',
 'astraddler',
 'astraddlest',
 'astride',
 'astrider',
 'astridest',
 'at',
 'athwart',
 'atop',
 'atween',
 'aught',
 'aught

## Analyze

In [73]:
def is_stopword(word):
    word = word.lower()
    
    if len(word) <= 1:
        return True
    
    # in the list
    if word in stopwords:
        return True
    
    # is Korean
    if ord("가") <= ord(word[0]) <= ord("힣"):
        return True
    
    return False

### Characteristic Keyword
Using TF-IDF

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Among global

In [75]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [76]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [77]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + float(tfidf_series[keyword])

In [78]:
def sort_dictionary(dictionary):
    return [(k, dictionary[k]) for k in sorted(dictionary, key=dictionary.get, reverse=True)]

In [79]:
tfidf_counter_global = sort_dictionary(counter)
tfidf_counter_global

[('love', 521),
 ('let', 507),
 ('sorry', 497),
 ('news', 412),
 ('home', 410),
 ('heart', 362),
 ('yes', 345),
 ('eyes', 313),
 ('baby', 311),
 ('feel', 301),
 ('say', 299),
 ('think', 292),
 ('back', 279),
 ('night', 270),
 ('take', 267),
 ('world', 239),
 ('new', 226),
 ('follow', 224),
 ('only', 220),
 ('am', 217),
 ('need', 212),
 ('tell', 206),
 ('life', 202),
 ('leave', 202),
 ('light', 199),
 ('dream', 199),
 ('keep', 195),
 ('room', 190),
 ('give', 179),
 ('way', 177),
 ('end', 171),
 ('mind', 170),
 ('down', 162),
 ('wait', 149),
 ('still', 146),
 ('hands', 143),
 ('child', 142),
 ('show', 140),
 ('dance', 139),
 ('girl', 139),
 ('looking', 136),
 ('care', 135),
 ('price_varies', 133),
 ('contact', 133),
 ('little', 130),
 ('hear', 128),
 ('ah', 128),
 ('okay', 126),
 ('away', 125),
 ('honey', 122),
 ('boy', 121),
 ('beautiful', 121),
 ('sleep', 119),
 ('find', 119),
 ('start', 119),
 ('ready', 118),
 ('feeling', 118),
 ('bad', 117),
 ('stay', 116),
 ('call', 115),
 ('sky', 1

In [80]:
tfidf_summer_global = sort_dictionary(summer)
tfidf_summer_global

[('love', 90.37153562331105),
 ('home', 76.21004040264718),
 ('sorry', 69.1397655218261),
 ('news', 48.08603122812252),
 ('let', 45.65971006638431),
 ('heart', 43.53378986841094),
 ('yes', 39.32657988099398),
 ('baby', 37.152868190550656),
 ('한국', 32.36819854379235),
 ('follow', 32.046654207752745),
 ('eyes', 30.337788713241792),
 ('night', 30.09200698624242),
 ('feel', 28.39620085138629),
 ('dream', 27.485997323149512),
 ('back', 27.444226966689936),
 ('world', 27.138972303382236),
 ('new', 25.668929887759134),
 ('say', 25.19307183864961),
 ('ah', 24.989323963741565),
 ('light', 24.884317238554694),
 ('only', 24.528770918077953),
 ('think', 24.049286804765085),
 ('woo', 23.311984021567397),
 ('room', 22.957403813727545),
 ('am', 22.89032814755495),
 ('dance', 22.81597776759301),
 ('need', 22.554843859626192),
 ('life', 21.359150404595603),
 ('contact', 20.8388989921961),
 ('hot', 20.835045446552304),
 ('give', 20.800300456215705),
 ('boy', 20.55275934228878),
 ('end', 19.7053305786462

#### Among KPOP

In [81]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [82]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [83]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + float(tfidf_series[keyword])

In [84]:
tfidf_counter_kpop = sort_dictionary(counter)
tfidf_counter_kpop

[('love', 521),
 ('let', 507),
 ('sorry', 497),
 ('news', 413),
 ('home', 408),
 ('heart', 363),
 ('yes', 345),
 ('eyes', 313),
 ('baby', 312),
 ('feel', 301),
 ('say', 299),
 ('think', 292),
 ('back', 279),
 ('night', 271),
 ('take', 267),
 ('world', 239),
 ('new', 226),
 ('follow', 224),
 ('only', 221),
 ('am', 217),
 ('need', 212),
 ('tell', 207),
 ('life', 202),
 ('leave', 202),
 ('light', 199),
 ('dream', 199),
 ('keep', 195),
 ('room', 190),
 ('give', 179),
 ('way', 177),
 ('end', 171),
 ('mind', 170),
 ('down', 162),
 ('wait', 149),
 ('still', 146),
 ('hands', 144),
 ('child', 142),
 ('show', 140),
 ('dance', 139),
 ('girl', 139),
 ('looking', 136),
 ('care', 135),
 ('price_varies', 133),
 ('contact', 133),
 ('little', 130),
 ('hear', 128),
 ('ah', 128),
 ('away', 125),
 ('okay', 125),
 ('honey', 122),
 ('boy', 121),
 ('beautiful', 121),
 ('sleep', 119),
 ('find', 119),
 ('start', 119),
 ('feeling', 118),
 ('bad', 117),
 ('ready', 117),
 ('stay', 116),
 ('call', 115),
 ('sky', 1

In [85]:
tfidf_summer_kpop = sort_dictionary(summer)
tfidf_summer_kpop

[('love', 93.38607915430586),
 ('home', 67.60988950474074),
 ('sorry', 50.59637894638318),
 ('let', 47.93110110384121),
 ('heart', 41.116791964066365),
 ('yes', 41.10677536323856),
 ('baby', 38.186684222047816),
 ('news', 31.517989144550324),
 ('feel', 31.44896588575648),
 ('night', 31.329232070269644),
 ('say', 29.465943981617627),
 ('back', 28.917235907807306),
 ('only', 27.346261757010232),
 ('eyes', 27.17744214823235),
 ('think', 25.90023701555005),
 ('follow', 25.761256945043133),
 ('need', 25.761205628558344),
 ('world', 25.658269008485153),
 ('한국', 25.315827692846227),
 ('am', 24.95654988991367),
 ('take', 24.1156828940417),
 ('life', 23.881421538952736),
 ('give', 23.697773262598623),
 ('new', 23.122089100039577),
 ('ah', 22.84938734799792),
 ('dream', 22.769037842367304),
 ('light', 22.613825747417657),
 ('dance', 21.8695620276186),
 ('tell', 21.632868387047996),
 ('woo', 21.403899689771514),
 ('boy', 20.841865832124057),
 ('leave', 20.144998389279486),
 ('room', 19.6380575782

### Topic analysis

In [86]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [87]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
def save_lda_vis(lda, corpus, dictionary, filename):
    vis = gensimvis.prepare(lda, corpus, dictionary)
    pyLDAvis.save_html(vis, filename)
    
    with open(filename, "r") as f:
        html = f.read()
        
    return html, vis

In [88]:
def pre_process(df_lyrics):
    # tokenize lyrics
    lyrics_processed = df_lyrics.fillna("").str.split()

    for _list in lyrics_processed:
        for cnt in range(len(_list)):
            _list[cnt] = _list[cnt].replace(".", "").replace(",", "").replace("?", "").replace("\\", "").replace("/", "").replace(":", "").lower()
            
            if is_stopword(_list[cnt]):
                _list[cnt] = ""
                
    # remove empty string
    for cnt in range(len(lyrics_processed)):
        while(True):
            try:
                lyrics_processed[cnt].remove("")
            except:
                break

    return lyrics_processed

#### Among global

In [89]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [90]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

all topics

In [91]:
lda_html_global, vis = save_lda_vis(lda, corpus, dictionary, "lda.html")

vis

PreparedData(topic_coordinates=              x         y  topics  cluster      Freq
topic                                               
49     0.097738 -0.045292       1        1  5.544577
55     0.127943  0.032959       2        1  3.134141
85     0.098773  0.000078       3        1  2.875993
12     0.088042 -0.003466       4        1  2.165881
10     0.075653 -0.012632       5        1  2.030444
...         ...       ...     ...      ...       ...
72    -0.080508 -0.071964      96        1  0.289373
6     -0.149071 -0.072596      97        1  0.271226
70    -0.044351 -0.069961      98        1  0.263486
8     -0.171477 -0.071043      99        1  0.225784
21    -0.089682 -0.050406     100        1  0.152273

[100 rows x 5 columns], topic_info=        Term         Freq        Total  Category  logprob  loglift
280    child  5643.000000  5643.000000   Default  30.0000  30.0000
7603  labour  5031.000000  5031.000000   Default  29.0000  29.0000
120     home  4570.000000  4570.000000   De

what topics are in kpop?

In [92]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + float(topic[1])

In [93]:
_lda_counter_global = sort_dictionary(counter)
lda_counter_global = []
for tup in _lda_counter_global:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_counter_global.append(new_tup)
    
lda_counter_global

[[25,
  48,
  [('home', 0.19895926),
   ('contact', 0.055756584),
   ('shut', 0.053006604),
   ('sorry', 0.048653252),
   ('news', 0.026734227),
   ('comment', 0.025891567),
   ('follow', 0.017419903),
   ('questions', 0.012008961),
   ('heart', 0.011360186),
   ('today', 0.011045514)]],
 [55,
  41,
  [('only', 0.04016433),
   ('love', 0.030677944),
   ('heart', 0.02519656),
   ('let', 0.02446092),
   ('way', 0.021316916),
   ('mine', 0.013390415),
   ('tell', 0.0131320665),
   ('miss', 0.013013426),
   ("can't", 0.009383079),
   ('snow', 0.009055935)]],
 [30,
  39,
  [('love', 0.34617913),
   ('mean', 0.022285886),
   ('happy', 0.02022777),
   ('need', 0.0151625965),
   ('ah', 0.014091778),
   ("can't", 0.013451934),
   ('you)', 0.011442575),
   ('labour', 0.0107748015),
   ('bad', 0.010251835),
   ('heart', 0.010216874)]],
 [11,
  38,
  [('eyes', 0.046342023),
   ('world', 0.018914998),
   ('news', 0.015251838),
   ('love', 0.013297308),
   ('{{if', 0.0130068585),
   ('end', 0.012363

In [94]:
_lda_summer_global = sort_dictionary(summer)
lda_summer_global = []
for tup in _lda_summer_global:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_summer_global.append(new_tup)
    
lda_summer_global

[[25,
  73.60478117740195,
  [('home', 0.19895926),
   ('contact', 0.055756584),
   ('shut', 0.053006604),
   ('sorry', 0.048653252),
   ('news', 0.026734227),
   ('comment', 0.025891567),
   ('follow', 0.017419903),
   ('questions', 0.012008961),
   ('heart', 0.011360186),
   ('today', 0.011045514)]],
 [11,
  56.46132238308701,
  [('eyes', 0.046342023),
   ('world', 0.018914998),
   ('news', 0.015251838),
   ('love', 0.013297308),
   ('{{if', 0.0130068585),
   ('end', 0.012363636),
   ('afraid', 0.011569055),
   ('price_varies}', 0.010235928),
   ('sky', 0.009793846),
   ("let's", 0.009372898)]],
 [55,
  48.20537126318459,
  [('only', 0.04016433),
   ('love', 0.030677944),
   ('heart', 0.02519656),
   ('let', 0.02446092),
   ('way', 0.021316916),
   ('mine', 0.013390415),
   ('tell', 0.0131320665),
   ('miss', 0.013013426),
   ("can't", 0.009383079),
   ('snow', 0.009055935)]],
 [30,
  45.253874326370806,
  [('love', 0.34617913),
   ('mean', 0.022285886),
   ('happy', 0.02022777),
   

#### Among KPOP

In [95]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [96]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

# observe topics
topics = lda.print_topics()

In [97]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + float(topic[1])


In [98]:
lda_html_kpop, vis = save_lda_vis(lda, corpus, dictionary, "lda.html")

vis

PreparedData(topic_coordinates=              x         y  topics  cluster      Freq
topic                                               
42    -0.329163  0.072097       1        1  3.691575
94     0.014687 -0.001615       2        1  3.033688
46     0.084148  0.292050       3        1  2.618133
97     0.015496 -0.030331       4        1  2.493525
70    -0.005393 -0.012400       5        1  1.911457
...         ...       ...     ...      ...       ...
99     0.011053 -0.012487      96        1  0.452940
18     0.014994 -0.023869      97        1  0.443615
49     0.015575 -0.018848      98        1  0.344424
61    -0.314936  0.064302      99        1  0.331858
14     0.027791 -0.031690     100        1  0.233532

[100 rows x 5 columns], topic_info=        Term         Freq        Total  Category  logprob  loglift
280    child  4067.000000  4067.000000   Default  30.0000  30.0000
425    drunk  2238.000000  2238.000000   Default  29.0000  29.0000
494     cake  1278.000000  1278.000000   De

In [99]:
_lda_counter_kpop = sort_dictionary(counter)
lda_counter_kpop = []
for tup in _lda_counter_kpop:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_counter_kpop.append(new_tup)
    
lda_counter_kpop

[[97,
  50,
  [('love', 0.16675065),
   ('top', 0.023729648),
   ('home', 0.016499445),
   ('crazy', 0.01575896),
   ("can't", 0.012429796),
   ('sorry', 0.010583185),
   ('ay', 0.010504948),
   ('night', 0.009325708),
   ('baby', 0.009185769),
   ('heart', 0.007038093)]],
 [94,
  47,
  [('home', 0.12782477),
   ('love', 0.01474947),
   ('let', 0.010766711),
   ('sorry', 0.01061506),
   ('heart', 0.009993936),
   ("let's", 0.00993831),
   ('shut', 0.009267275),
   ('say', 0.008660195),
   ("can't", 0.0063858465),
   ('baby', 0.0060902163)]],
 [57,
  30,
  [('baby', 0.016240522),
   ('back', 0.0155747505),
   ('home', 0.015504847),
   ('moon', 0.012761522),
   ('dawn', 0.012319786),
   ('star', 0.012195145),
   ("can't", 0.01165075),
   ('run', 0.010799802),
   ('night', 0.010342201),
   ('next', 0.0096074855)]],
 [10,
  28,
  [('home', 0.15137497),
   ('love', 0.023484707),
   ('shoot', 0.016338661),
   ('sorry', 0.01571999),
   ('give', 0.0153519055),
   ('shut', 0.012703807),
   ('dr

In [100]:
_lda_summer_kpop = sort_dictionary(summer)
lda_summer_kpop = []
for tup in _lda_summer_kpop:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_summer_kpop.append(new_tup)
    
lda_summer_kpop

[[97,
  47.682332408456205,
  [('love', 0.16675065),
   ('top', 0.023729648),
   ('home', 0.016499445),
   ('crazy', 0.01575896),
   ("can't", 0.012429796),
   ('sorry', 0.010583185),
   ('ay', 0.010504948),
   ('night', 0.009325708),
   ('baby', 0.009185769),
   ('heart', 0.007038093)]],
 [94,
  45.566633995336815,
  [('home', 0.12782477),
   ('love', 0.01474947),
   ('let', 0.010766711),
   ('sorry', 0.01061506),
   ('heart', 0.009993936),
   ("let's", 0.00993831),
   ('shut', 0.009267275),
   ('say', 0.008660195),
   ("can't", 0.0063858465),
   ('baby', 0.0060902163)]],
 [57,
  30.085410467845577,
  [('baby', 0.016240522),
   ('back', 0.0155747505),
   ('home', 0.015504847),
   ('moon', 0.012761522),
   ('dawn', 0.012319786),
   ('star', 0.012195145),
   ("can't", 0.01165075),
   ('run', 0.010799802),
   ('night', 0.010342201),
   ('next', 0.0096074855)]],
 [10,
  28.65112317668718,
  [('home', 0.15137497),
   ('love', 0.023484707),
   ('shoot', 0.016338661),
   ('sorry', 0.01571999

### Emotion analysis
Emotions:  
fear
anger
anticipation
trust
surprise
positive
negative
sadness
disgust
joy

In [101]:
import numpy as np

In [102]:
labels = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "sadness",
    "surprise",	
    "trust",
    "negative",	
    "positive",
]

emotions = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "sadness",
    "surprise",	
    "trust",
]

sentiments = [
    "negative",	
    "positive",
]

In [103]:
df_nrc = pd.read_csv("NRC.txt", sep="\t", names=["word", "label", "amount"])

words_to_labels = {}

for _, row in df_nrc.iterrows():
    words_to_labels[row["word"]] = words_to_labels.get(row["word"], np.zeros((10, 1)))
    words_to_labels[row["word"]][labels.index(row["label"])] = row["amount"]

In [104]:
df_label = df.copy()
for label in labels:
    df_label[label] = 0

In [105]:
for cnt in range(len(df)):
    lyrics = df["lyrics"][cnt]

    if type(lyrics) != str:
        continue

    label_this = np.zeros((10, 1))

    for word in lyrics.split():
        if word in words_to_labels:
            label_this += words_to_labels[word]
            
    emotion_this = label_this[:8]
    sentiment_this = label_this[8:]
            
    if np.linalg.norm(emotion_this) != 0:
        emotion_this /= np.linalg.norm(emotion_this)
        
    if np.linalg.norm(sentiment_this) != 0:
        sentiment_this /= np.linalg.norm(sentiment_this)
        
    label_this = np.concatenate((emotion_this, sentiment_this), axis=None)
    
    for label in labels:
        df_label[label][cnt] = label_this[labels.index(label)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label[label][cnt] = label_this[labels.index(label)]
  df_label[label][cnt] = label_this[labels.index(label)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label[label][cnt] = label_this[labels.index(label)]
  df_label[label][cnt] = label_this[labels.index(label)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label[label][cnt] = label_this[labels.index(label)]
  df_label[label][cnt] = label_this[labels.index(label)]
A value is trying to be set on a copy of

In [106]:
df_emotion_global = df_label[df_label["grouping"].isin(global_group)]
mean_emotion_global = df_emotion_global[emotions].mean()
normalized_mean_emotion_global = mean_emotion_global / np.linalg.norm(mean_emotion_global)
mean_sentiment_global = df_emotion_global[sentiments].mean()
normalized_mean_sentiment_global = mean_sentiment_global / np.linalg.norm(mean_sentiment_global)

df_emotion_kpop = df_label[df_label["grouping"].isin(kpop_group)]
mean_emotion_kpop = df_emotion_kpop[emotions].mean()
normalized_mean_emotion_kpop = mean_emotion_kpop / np.linalg.norm(mean_emotion_kpop)
mean_sentiment_kpop = df_emotion_kpop[sentiments].mean()
normalized_mean_sentiment_kpop = mean_sentiment_kpop / np.linalg.norm(mean_sentiment_kpop)

df_emotion_target = df_label[df_label["grouping"] == target_group]
mean_emotion_target = df_emotion_target[emotions].mean()
normalized_mean_emotion_target = mean_emotion_target / np.linalg.norm(mean_emotion_target)
mean_sentiment_target = df_emotion_target[sentiments].mean()
normalized_mean_sentiment_target = mean_sentiment_target / np.linalg.norm(mean_sentiment_target)

In [107]:
normalized_mean_emotion_target

anger           0.222068
anticipation    0.455881
disgust         0.139562
fear            0.295598
joy             0.575365
sadness         0.316116
surprise        0.236113
trust           0.386367
dtype: float64

In [108]:
normalized_mean_sentiment_target

negative    0.509892
positive    0.860239
dtype: float64

against global

In [109]:
emotion_percentage_global = (normalized_mean_emotion_target - normalized_mean_emotion_global) / normalized_mean_emotion_global * 100

emotion_percentage_global

anger          -26.219371
anticipation    10.661093
disgust        -28.193399
fear           -10.843759
joy             16.307583
sadness        -13.957481
surprise        -0.207739
trust           -2.276086
dtype: float64

In [110]:
sentiment_percentage_global = (normalized_mean_sentiment_target - normalized_mean_sentiment_global) / normalized_mean_sentiment_global * 100

sentiment_percentage_global

negative   -22.252934
positive    13.953220
dtype: float64

against KPOP

In [111]:
emotion_percentage_kpop = (normalized_mean_emotion_target - normalized_mean_emotion_kpop) / normalized_mean_emotion_kpop * 100

emotion_percentage_kpop

anger           0.0
anticipation    0.0
disgust         0.0
fear            0.0
joy             0.0
sadness         0.0
surprise        0.0
trust           0.0
dtype: float64

In [112]:
sentiment_percentage_kpop = (normalized_mean_sentiment_target - normalized_mean_sentiment_kpop) / normalized_mean_sentiment_kpop * 100

sentiment_percentage_kpop

negative    0.0
positive    0.0
dtype: float64

## Save Data

In [113]:
# to float
for _sum in lda_summer_global:
    _sum[2] = [(tup[0], float(tup[1])) for tup in _sum[2]]
    
for _sum in lda_summer_kpop:
    _sum[2] = [(tup[0], float(tup[1])) for tup in _sum[2]]
    
for count in lda_counter_global:
    count[2] = [(tup[0], float(tup[1])) for tup in count[2]]

for count in lda_counter_kpop:
    count[2] = [(tup[0], float(tup[1])) for tup in count[2]]

In [114]:
json_data = {
    "tfidf_counter_global": tfidf_counter_global,
    "tfidf_summer_global": tfidf_summer_global,
    "tfidf_counter_kpop": tfidf_counter_kpop,
    "tfidf_summer_kpop": tfidf_summer_kpop,
    "lda_counter_global": lda_counter_global,
    "lda_summer_global": lda_summer_global,
    "lda_counter_kpop": lda_counter_kpop,
    "lda_summer_kpop": lda_summer_kpop,
    "lda_html_global": lda_html_global,
    "lda_html_kpop": lda_html_kpop,
    "normalized_mean_emotion_target": normalized_mean_emotion_target.astype(float).tolist(),
    "normalized_mean_sentiment_target": normalized_mean_sentiment_target.astype(float).tolist(),
    "emotion_percentage_global": emotion_percentage_global.astype(float).tolist(),
    "sentiment_percentage_global": sentiment_percentage_global.astype(float).tolist(),
    "emotion_percentage_kpop": emotion_percentage_kpop.astype(float).tolist(),
    "sentiment_percentage_kpop": sentiment_percentage_kpop.astype(float).tolist(),
}

In [115]:
import json
with open("data_"+ target_group +".json", "w") as f:
    json.dump(json_data, f)