# Lyrics Analysis

In [53]:
lyrics_directory = "Lyrics_Data/"
tracks_csv = "tracks_group.csv"
stopwords_file = "stopwords.txt"

In [54]:
target_group = "twice"
kpop_group = set(['kpop', 'itzy', 'seventeen', 'bts', 'twice'])
global_group = set(['global', 'billboard'])

target_and_kpop = kpop_group.union(set([target_group]))
target_and_global = global_group.union(set([target_group]))

## Import data

In [55]:
import pandas as pd

In [56]:
df = pd.read_csv(tracks_csv)

df.head(3)

Unnamed: 0,track_name,track_id,grouping
0,Ice Cream (with Selena Gomez),2J4P46vCFm1rPkNkp9pZWX,kpop
1,Dynamite,0v1x6rN6JHRapa03JElljE,kpop
2,THE BADDEST,2V4Fx72svQRxrFvNT1eq5f,kpop


In [57]:
df.shape

(4618, 3)

In [58]:
df["grouping"].unique()

array(['kpop', 'itzy', 'seventeen', 'bts', 'twice', 'global', 'billboard'],
      dtype=object)

In [59]:
def get_lyrics(track_id):
    try:
        lyrics = ""
        with open(lyrics_directory + track_id + ".txt", "r") as f:
            lyrics = f.read()
            
        return lyrics            
    except:
        return None
    
df["lyrics"] = df["track_id"].apply(get_lyrics)

df.shape

(4618, 4)

In [60]:
# stop words
# https://www.kaggle.com/datasets/rowhitswami/stopwords/
stopwords = set()
with open(stopwords_file, "r") as f:
    stopwords = set(f.read().split("\n"))
    
stopwords

{',',
 '>',
 '?',
 'Just',
 'Yes',
 'a',
 'abaft',
 'abafter',
 'abaftest',
 'about',
 'abouter',
 'aboutest',
 'above',
 'abover',
 'abovest',
 'accordingly',
 'aer',
 'aest',
 'afore',
 'after',
 'afterer',
 'afterest',
 'afterward',
 'afterwards',
 'again',
 'against',
 'aid',
 'ain',
 'albeit',
 'all',
 'aller',
 'allest',
 'alls',
 'allyou',
 'almost',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'amid',
 'amidst',
 'among',
 'amongst',
 'an',
 'and',
 'andor',
 'anear',
 'anent',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anywhere',
 'apart',
 'aparter',
 'apartest',
 'appear',
 'appeared',
 'appearing',
 'appears',
 'appropriate',
 'appropriated',
 'appropriater',
 'appropriates',
 'appropriatest',
 'appropriating',
 'are',
 'ares',
 'around',
 'as',
 'ases',
 'aside',
 'asides',
 'aslant',
 'astraddle',
 'astraddler',
 'astraddlest',
 'astride',
 'astrider',
 'astridest',
 'at',
 'athwart',
 'atop',
 'atween',
 'aught',
 'aught

## Analyze

In [61]:
def is_stopword(word):
    word = word.lower()
    
    if len(word) <= 1:
        return True
    
    # in the list
    if word in stopwords:
        return True
    
    # is Korean
    if ord("가") <= ord(word[0]) <= ord("힣"):
        return True
    
    return False

### Characteristic Keyword
Using TF-IDF

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Among global

In [63]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [64]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [65]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + float(tfidf_series[keyword])

In [66]:
def sort_dictionary(dictionary):
    return [(k, dictionary[k]) for k in sorted(dictionary, key=dictionary.get, reverse=True)]

In [67]:
tfidf_counter_global = sort_dictionary(counter)
tfidf_counter_global

[('love', 81),
 ('let', 71),
 ('heart', 64),
 ('baby', 62),
 ('home', 56),
 ('yes', 51),
 ('feel', 50),
 ('sorry', 47),
 ('say', 44),
 ('think', 43),
 ('eyes', 42),
 ('need', 37),
 ('news', 34),
 ('take', 31),
 ('tell', 30),
 ('honey', 30),
 ('back', 30),
 ('child', 29),
 ('dream', 28),
 ('only', 26),
 ('am', 26),
 ('light', 25),
 ('ah', 24),
 ('world', 24),
 ('feeling', 24),
 ('night', 23),
 ('life', 22),
 ('happy', 22),
 ('share', 21),
 ('sweet', 20),
 ('follow', 20),
 ('show', 20),
 ('mind', 19),
 ('ready', 19),
 ('wait', 19),
 ('easy', 18),
 ('room', 18),
 ('end', 18),
 ('okay', 18),
 ('boy', 17),
 ('forget', 17),
 ('meet', 17),
 ('give', 17),
 ('way', 17),
 ('hear', 17),
 ('new', 17),
 ('ooh', 16),
 ('away', 16),
 ('keep', 16),
 ('true', 16),
 ('words', 16),
 ('smile', 16),
 ('shut', 16),
 ('dance', 15),
 ('care', 15),
 ('down', 15),
 ('tonight', 15),
 ('girl', 15),
 ('mine', 15),
 ('start', 15),
 ('little', 15),
 ('beautiful', 14),
 ('made', 14),
 ('move', 14),
 ('wow', 14),
 ('c

In [68]:
tfidf_summer_global = sort_dictionary(summer)
tfidf_summer_global

[('home', 13.931992319453727),
 ('love', 12.824083976293252),
 ('baby', 9.069787994701606),
 ('heart', 8.167630007034115),
 ('sorry', 6.87206765201956),
 ('let', 6.820567467417053),
 ('ah', 6.3710435563246595),
 ('yes', 6.039825610017484),
 ('sweet', 5.587227836906947),
 ('eyes', 5.571798184274636),
 ('child', 5.425195621935116),
 ('news', 5.1852820453526896),
 ('follow', 4.728578440242536),
 ('한국', 4.710007613924276),
 ('say', 4.670350696880509),
 ('back', 4.565068654238137),
 ('honey', 4.508644267693422),
 ('happy', 4.355925899456217),
 ('shut', 4.147360523112134),
 ('share', 3.828051149884993),
 ('boy', 3.823601342392143),
 ('ooh', 3.812661929570221),
 ('feel', 3.709714642563397),
 ('dream', 3.6839441198522556),
 ('only', 3.5378947428270893),
 ('room', 3.4250275121800344),
 ('think', 3.2927475359754466),
 ('dance', 3.170758416935763),
 ('easy', 3.125976679431446),
 ('give', 3.114537772389982),
 ('need', 3.1020697366848804),
 ('waiting', 2.9401781953729174),
 ('tell', 2.6353964829377

#### Among KPOP

In [69]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [70]:
vectorizer = TfidfVectorizer(stop_words=list(stopwords), max_features=1000, min_df=10)

x = vectorizer.fit_transform(df_looking["lyrics"].fillna("")).toarray()

df_tfidf = pd.DataFrame(x, columns=vectorizer.get_feature_names_out()).T



In [71]:
threshold = 0.001
counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    tfidf_series = df_tfidf[cnt]
    
    keywords = tfidf_series[tfidf_series > threshold].index.tolist()
    for keyword in keywords:
        counter[keyword] = counter.get(keyword, 0) + 1
        summer[keyword] = summer.get(keyword, 0) + float(tfidf_series[keyword])

In [72]:
tfidf_counter_kpop = sort_dictionary(counter)
tfidf_counter_kpop

[('love', 81),
 ('let', 71),
 ('heart', 64),
 ('baby', 62),
 ('home', 56),
 ('yes', 51),
 ('feel', 50),
 ('sorry', 47),
 ('say', 44),
 ('think', 43),
 ('eyes', 42),
 ('need', 37),
 ('news', 34),
 ('take', 31),
 ('tell', 30),
 ('honey', 30),
 ('back', 30),
 ('child', 29),
 ('dream', 28),
 ('only', 26),
 ('am', 26),
 ('light', 25),
 ('contact', 25),
 ('ah', 24),
 ('world', 24),
 ('feeling', 24),
 ('night', 23),
 ('life', 22),
 ('happy', 22),
 ('share', 21),
 ('sweet', 20),
 ('follow', 20),
 ('show', 20),
 ('mind', 19),
 ('ready', 19),
 ('wait', 19),
 ('easy', 18),
 ('room', 18),
 ('end', 18),
 ('okay', 18),
 ('boy', 17),
 ('forget', 17),
 ('meet', 17),
 ('give', 17),
 ('way', 17),
 ('hear', 17),
 ('new', 17),
 ('ooh', 16),
 ('away', 16),
 ('keep', 16),
 ('true', 16),
 ('words', 16),
 ('smile', 16),
 ('shut', 16),
 ('dance', 15),
 ('care', 15),
 ('down', 15),
 ('tonight', 15),
 ('girl', 15),
 ('informazioni', 15),
 ('mine', 15),
 ('start', 15),
 ('한국어', 15),
 ('little', 15),
 ('beautiful'

In [73]:
tfidf_summer_kpop = sort_dictionary(summer)
tfidf_summer_kpop

[('love', 13.065723016326086),
 ('home', 12.046380918451089),
 ('baby', 9.37089980241579),
 ('heart', 7.39317226970636),
 ('let', 7.0242306392414395),
 ('yes', 6.590925241101974),
 ('ah', 5.6111757294054305),
 ('say', 5.35123036052182),
 ('child', 5.136613226283855),
 ('honey', 4.9609555244080585),
 ('eyes', 4.937091926832534),
 ('back', 4.787605550619838),
 ('sweet', 4.783702989497009),
 ('feel', 4.408003272188201),
 ('boy', 3.994288137930682),
 ('sorry', 3.884341745531181),
 ('only', 3.8213452721812167),
 ('happy', 3.7722986475492406),
 ('need', 3.7701262583412736),
 ('ooh', 3.7484341269486787),
 ('think', 3.6914551045215114),
 ('give', 3.6140214682608116),
 ('share', 3.6140009224389864),
 ('follow', 3.4620043174998534),
 ('shut', 3.1821969977209306),
 ('tell', 3.1245862358348635),
 ('한국', 3.0492493517826613),
 ('dance', 2.891446458906176),
 ('night', 2.865801978193554),
 ('dream', 2.83933489657322),
 ('easy', 2.814198716896),
 ('waiting', 2.7886617134179796),
 ('take', 2.72871228385

### Topic analysis

In [74]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [75]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
def save_lda_vis(lda, corpus, dictionary, filename):
    vis = gensimvis.prepare(lda, corpus, dictionary)
    pyLDAvis.save_html(vis, filename)
    
    with open(filename, "r") as f:
        html = f.read()
        
    return html, vis

In [76]:
def pre_process(df_lyrics):
    # tokenize lyrics
    lyrics_processed = df_lyrics.fillna("").str.split()

    for _list in lyrics_processed:
        for cnt in range(len(_list)):
            _list[cnt] = _list[cnt].replace(".", "").replace(",", "").replace("?", "").replace("\\", "").replace("/", "").replace(":", "").lower()
            
            if is_stopword(_list[cnt]):
                _list[cnt] = ""
                
    # remove empty string
    for cnt in range(len(lyrics_processed)):
        while(True):
            try:
                lyrics_processed[cnt].remove("")
            except:
                break

    return lyrics_processed

#### Among global

In [77]:
df_looking = df[df["grouping"].isin(target_and_global)].reset_index(drop=True)

In [78]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

all topics

In [79]:
lda_html_global, vis = save_lda_vis(lda, corpus, dictionary, "lda.html")

vis

PreparedData(topic_coordinates=              x         y  topics  cluster      Freq
topic                                               
3      0.048377 -0.025842       1        1  3.819362
42     0.065935 -0.051384       2        1  3.777051
64     0.056020 -0.032260       3        1  3.048676
69     0.058295 -0.043276       4        1  2.770878
8      0.051763 -0.044942       5        1  2.278845
...         ...       ...     ...      ...       ...
1     -0.171024 -0.063576      96        1  0.358910
83    -0.078172  0.126518      97        1  0.320438
76    -0.029726  0.064549      98        1  0.308343
77    -0.091314  0.123661      99        1  0.298106
39    -0.117220  0.146454     100        1  0.261508

[100 rows x 5 columns], topic_info=        Term         Freq        Total  Category  logprob  loglift
4905  labour  6329.000000  6329.000000   Default  30.0000  30.0000
1       baby  5685.000000  5685.000000   Default  29.0000  29.0000
755     work  2065.000000  2065.000000   De

what topics are in kpop?

In [80]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + float(topic[1])

In [81]:
_lda_counter_global = sort_dictionary(counter)
lda_counter_global = []
for tup in _lda_counter_global:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_counter_global.append(new_tup)
    
lda_counter_global

[[23,
  11,
  [('home', 0.19665466),
   ('house', 0.0336813),
   ('imma', 0.024181006),
   ('love', 0.018329041),
   ('shut', 0.016517397),
   ('loved', 0.01237267),
   ('coming', 0.011009457),
   ('higher', 0.008473884),
   ('faded', 0.008312927),
   ('way', 0.007894956)]],
 [30,
  10,
  [('love', 0.12500276),
   ('need', 0.023329804),
   ('child', 0.015777085),
   ('fight', 0.011550103),
   ('say', 0.010189317),
   ('feel', 0.009061972),
   ('heart', 0.0078016194),
   ('baby', 0.0077479323),
   ('think', 0.007515748),
   ("can't", 0.00684685)]],
 [81,
  7,
  [('love', 0.13406004),
   ('need', 0.027778743),
   ('[bleep', 0.024401916),
   ('girl', 0.019842952),
   ('honey', 0.019399267),
   ('[bleep]', 0.018373094),
   ('yes', 0.017867386),
   ('leave', 0.013008531),
   ('let', 0.0122913765),
   ('think', 0.00987409)]],
 [69,
  7,
  [('down', 0.033434667),
   ('love', 0.025759302),
   ('only', 0.013407598),
   ('say', 0.012310562),
   ('head', 0.011654127),
   ('let', 0.011611412),
   

In [82]:
_lda_summer_global = sort_dictionary(summer)
lda_summer_global = []
for tup in _lda_summer_global:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_summer_global.append(new_tup)
    
lda_summer_global

[[23,
  9.221383964300912,
  [('home', 0.19665466),
   ('house', 0.0336813),
   ('imma', 0.024181006),
   ('love', 0.018329041),
   ('shut', 0.016517397),
   ('loved', 0.01237267),
   ('coming', 0.011009457),
   ('higher', 0.008473884),
   ('faded', 0.008312927),
   ('way', 0.007894956)]],
 [30,
  8.265455559114343,
  [('love', 0.12500276),
   ('need', 0.023329804),
   ('child', 0.015777085),
   ('fight', 0.011550103),
   ('say', 0.010189317),
   ('feel', 0.009061972),
   ('heart', 0.0078016194),
   ('baby', 0.0077479323),
   ('think', 0.007515748),
   ("can't", 0.00684685)]],
 [81,
  6.3251496686461905,
  [('love', 0.13406004),
   ('need', 0.027778743),
   ('[bleep', 0.024401916),
   ('girl', 0.019842952),
   ('honey', 0.019399267),
   ('[bleep]', 0.018373094),
   ('yes', 0.017867386),
   ('leave', 0.013008531),
   ('let', 0.0122913765),
   ('think', 0.00987409)]],
 [42,
  5.761450093003077,
  [('girl', 0.02185057),
   ('baby', 0.020885346),
   ('night', 0.017401144),
   ('tonight', 0

#### Among KPOP

In [83]:
df_looking = df[df["grouping"].isin(target_and_kpop)].reset_index(drop=True)

In [84]:
#train model
lyrics_processed = pre_process(df_looking["lyrics"])
dictionary = Dictionary(lyrics_processed)
corpus = [dictionary.doc2bow(text) for text in lyrics_processed]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

# observe topics
topics = lda.print_topics()

In [85]:
threshold = 0.3

counter = {}
summer = {}

for cnt in range(len(df_looking)):
    if df_looking["grouping"][cnt] != target_group:
        continue
    
    topics = lda.get_document_topics(corpus[cnt], minimum_probability=0)
    
    for topic in topics:
        if topic[1] > threshold:
            counter[topic[0]] = counter.get(topic[0], 0) + 1
            
        summer[topic[0]] = summer.get(topic[0], 0) + float(topic[1])


In [86]:
lda_html_kpop, vis = save_lda_vis(lda, corpus, dictionary, "lda.html")

vis

PreparedData(topic_coordinates=              x         y  topics  cluster      Freq
topic                                               
59    -0.338421 -0.195507       1        1  3.531713
23     0.006786 -0.028722       2        1  3.146811
40     0.041706 -0.058127       3        1  2.390128
68     0.027578 -0.017276       4        1  2.159530
49     0.057639 -0.043839       5        1  1.793497
...         ...       ...     ...      ...       ...
26    -0.081977  0.197445      96        1  0.326382
14     0.016356  0.033697      97        1  0.282757
42    -0.007333  0.053944      98        1  0.281739
66    -0.067294  0.139885      99        1  0.211090
75    -0.035053 -0.000044     100        1  0.149358

[100 rows x 5 columns], topic_info=         Term         Freq        Total  Category  logprob  loglift
280     child  4109.000000  4109.000000   Default  30.0000  30.0000
425     drunk  2059.000000  2059.000000   Default  29.0000  29.0000
494      cake  1563.000000  1563.000000 

In [87]:
_lda_counter_kpop = sort_dictionary(counter)
lda_counter_kpop = []
for tup in _lda_counter_kpop:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_counter_kpop.append(new_tup)
    
lda_counter_kpop

[[40,
  11,
  [('home', 0.23034999),
   ('give', 0.0259348),
   ('love', 0.023560403),
   ('osserva', 0.018022496),
   ('hot', 0.015532005),
   ('shut', 0.0121586155),
   ('sorry', 0.009495863),
   ('heart', 0.008666158),
   ('ooh', 0.0067573045),
   ('chiudi', 0.0062926263)]],
 [5,
  6,
  [('invalid', 0.14889614),
   ('love', 0.062267978),
   ('night', 0.019218067),
   ('light', 0.015710194),
   ('red', 0.015694845),
   ("can't", 0.013075309),
   ('worth', 0.0075852363),
   ('let', 0.0074329115),
   ('next', 0.0063431663),
   ('ac', 0.006116149)]],
 [49,
  6,
  [('love', 0.04756524),
   ('beep', 0.035298686),
   ('heart', 0.013322674),
   ('thinking', 0.012910586),
   ('kim', 0.01253173),
   ("can't", 0.0123212775),
   ('news', 0.011399116),
   ('sorry', 0.011195544),
   ('wild', 0.010277942),
   ('world', 0.009906737)]],
 [86,
  6,
  [('love', 0.04228079),
   ('only', 0.024333082),
   ('sorry', 0.017968915),
   ('baby', 0.016971478),
   ("can't", 0.013805664),
   ('mariachi', 0.01235

In [88]:
_lda_summer_kpop = sort_dictionary(summer)
lda_summer_kpop = []
for tup in _lda_summer_kpop:
    new_tup = [tup[0], tup[1], lda.show_topic(tup[0])]
    lda_summer_kpop.append(new_tup)
    
lda_summer_kpop

[[40,
  9.654893328519393,
  [('home', 0.23034999),
   ('give', 0.0259348),
   ('love', 0.023560403),
   ('osserva', 0.018022496),
   ('hot', 0.015532005),
   ('shut', 0.0121586155),
   ('sorry', 0.009495863),
   ('heart', 0.008666158),
   ('ooh', 0.0067573045),
   ('chiudi', 0.0062926263)]],
 [49,
  5.5506910114127095,
  [('love', 0.04756524),
   ('beep', 0.035298686),
   ('heart', 0.013322674),
   ('thinking', 0.012910586),
   ('kim', 0.01253173),
   ("can't", 0.0123212775),
   ('news', 0.011399116),
   ('sorry', 0.011195544),
   ('wild', 0.010277942),
   ('world', 0.009906737)]],
 [27,
  4.915529709302064,
  [('boy', 0.03221127),
   ('love', 0.027521417),
   ('sorry', 0.016445518),
   ('ay', 0.016354358),
   ('back', 0.012868091),
   ('room', 0.012529746),
   ('need', 0.01147417),
   ('feel', 0.011416033),
   ('let', 0.011337116),
   ('young', 0.00842854)]],
 [5,
  4.833154590007325,
  [('invalid', 0.14889614),
   ('love', 0.062267978),
   ('night', 0.019218067),
   ('light', 0.0157

### Emotion analysis
Emotions:  
fear
anger
anticipation
trust
surprise
positive
negative
sadness
disgust
joy

In [89]:
import numpy as np

In [90]:
labels = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "sadness",
    "surprise",	
    "trust",
    "negative",	
    "positive",
]

emotions = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "sadness",
    "surprise",	
    "trust",
]

sentiments = [
    "negative",	
    "positive",
]

In [91]:
df_nrc = pd.read_csv("NRC.txt", sep="\t", names=["word", "label", "amount"])

words_to_labels = {}

for _, row in df_nrc.iterrows():
    words_to_labels[row["word"]] = words_to_labels.get(row["word"], np.zeros((10, 1)))
    words_to_labels[row["word"]][labels.index(row["label"])] = row["amount"]

In [92]:
df_label = df.copy()
for label in labels:
    df_label[label] = 0

In [93]:
for cnt in range(len(df)):
    lyrics = df["lyrics"][cnt]

    if type(lyrics) != str:
        continue

    label_this = np.zeros((10, 1))

    for word in lyrics.split():
        if word in words_to_labels:
            label_this += words_to_labels[word]
            
    emotion_this = label_this[:8]
    sentiment_this = label_this[8:]
            
    if np.linalg.norm(emotion_this) != 0:
        emotion_this /= np.linalg.norm(emotion_this)
        
    if np.linalg.norm(sentiment_this) != 0:
        sentiment_this /= np.linalg.norm(sentiment_this)
        
    label_this = np.concatenate((emotion_this, sentiment_this), axis=None)
    
    for label in labels:
        df_label[label][cnt] = label_this[labels.index(label)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label[label][cnt] = label_this[labels.index(label)]
  df_label[label][cnt] = label_this[labels.index(label)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label[label][cnt] = label_this[labels.index(label)]
  df_label[label][cnt] = label_this[labels.index(label)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label[label][cnt] = label_this[labels.index(label)]
  df_label[label][cnt] = label_this[labels.index(label)]
A value is trying to be set on a copy of

In [94]:
df_emotion_global = df_label[df_label["grouping"].isin(global_group)]
mean_emotion_global = df_emotion_global[emotions].mean()
normalized_mean_emotion_global = mean_emotion_global / np.linalg.norm(mean_emotion_global)
mean_sentiment_global = df_emotion_global[sentiments].mean()
normalized_mean_sentiment_global = mean_sentiment_global / np.linalg.norm(mean_sentiment_global)

df_emotion_kpop = df_label[df_label["grouping"].isin(kpop_group)]
mean_emotion_kpop = df_emotion_kpop[emotions].mean()
normalized_mean_emotion_kpop = mean_emotion_kpop / np.linalg.norm(mean_emotion_kpop)
mean_sentiment_kpop = df_emotion_kpop[sentiments].mean()
normalized_mean_sentiment_kpop = mean_sentiment_kpop / np.linalg.norm(mean_sentiment_kpop)

df_emotion_target = df_label[df_label["grouping"] == target_group]
mean_emotion_target = df_emotion_target[emotions].mean()
normalized_mean_emotion_target = mean_emotion_target / np.linalg.norm(mean_emotion_target)
mean_sentiment_target = df_emotion_target[sentiments].mean()
normalized_mean_sentiment_target = mean_sentiment_target / np.linalg.norm(mean_sentiment_target)

In [95]:
normalized_mean_emotion_target

anger           0.190875
anticipation    0.477627
disgust         0.099984
fear            0.194998
joy             0.678331
sadness         0.214830
surprise        0.231878
trust           0.356883
dtype: float64

In [96]:
normalized_mean_sentiment_target

negative    0.422718
positive    0.906261
dtype: float64

against global

In [97]:
emotion_percentage_global = (normalized_mean_emotion_target - normalized_mean_emotion_global) / normalized_mean_emotion_global * 100

emotion_percentage_global

anger          -36.582887
anticipation    15.939841
disgust        -48.556924
fear           -41.186043
joy             37.121813
sadness        -41.526304
surprise        -1.997513
trust           -9.733517
dtype: float64

In [98]:
sentiment_percentage_global = (normalized_mean_sentiment_target - normalized_mean_sentiment_global) / normalized_mean_sentiment_global * 100

sentiment_percentage_global

negative   -35.545037
positive    20.049735
dtype: float64

against KPOP

In [99]:
emotion_percentage_kpop = (normalized_mean_emotion_target - normalized_mean_emotion_kpop) / normalized_mean_emotion_kpop * 100

emotion_percentage_kpop

anger          -14.046391
anticipation     4.770193
disgust        -28.358849
fear           -34.032709
joy             17.895850
sadness        -32.040929
surprise        -1.793500
trust           -7.631122
dtype: float64

In [100]:
sentiment_percentage_kpop = (normalized_mean_sentiment_target - normalized_mean_sentiment_kpop) / normalized_mean_sentiment_kpop * 100

sentiment_percentage_kpop

negative   -17.096598
positive     5.350016
dtype: float64

## Save Data

In [101]:
# to float
for _sum in lda_summer_global:
    _sum[2] = [(tup[0], float(tup[1])) for tup in _sum[2]]
    
for _sum in lda_summer_kpop:
    _sum[2] = [(tup[0], float(tup[1])) for tup in _sum[2]]
    
for count in lda_counter_global:
    count[2] = [(tup[0], float(tup[1])) for tup in count[2]]

for count in lda_counter_kpop:
    count[2] = [(tup[0], float(tup[1])) for tup in count[2]]

In [102]:
json_data = {
    "tfidf_counter_global": tfidf_counter_global,
    "tfidf_summer_global": tfidf_summer_global,
    "tfidf_counter_kpop": tfidf_counter_kpop,
    "tfidf_summer_kpop": tfidf_summer_kpop,
    "lda_counter_global": lda_counter_global,
    "lda_summer_global": lda_summer_global,
    "lda_counter_kpop": lda_counter_kpop,
    "lda_summer_kpop": lda_summer_kpop,
    "lda_html_global": lda_html_global,
    "lda_html_kpop": lda_html_kpop,
    "normalized_mean_emotion_target": normalized_mean_emotion_target.astype(float).tolist(),
    "normalized_mean_sentiment_target": normalized_mean_sentiment_target.astype(float).tolist(),
    "emotion_percentage_global": emotion_percentage_global.astype(float).tolist(),
    "sentiment_percentage_global": sentiment_percentage_global.astype(float).tolist(),
    "emotion_percentage_kpop": emotion_percentage_kpop.astype(float).tolist(),
    "sentiment_percentage_kpop": sentiment_percentage_kpop.astype(float).tolist(),
}

In [103]:
import json
with open("data_"+ target_group +".json", "w") as f:
    json.dump(json_data, f)