In [1]:
# Import libraries
import re
import os
import csv
import pandas as pd
import numpy as np
from collections import defaultdict
from functools import partial

# Set Pandas to display up to 500 rows of dataframes
pd.set_option('display.max_rows',500)

# nltk
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ju907\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ju907\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# For implementation of parallelization

import multiprocessing
from multiprocessing import Pool
from multiprocessing import Process, Queue, current_process
import parmap
import tqdm
from multiprocessing import Manager
from joblib import Parallel, delayed
import time
import s_a_util
# CPU count reported by multiprocessing; used further down both as the
# worker-pool size and as the number of chunks the song list is split into.
num_cores = multiprocessing.cpu_count()
print(num_cores)


4


In [3]:
def preprocessing(text):
    
    """
    Turn a raw lyric string into a list of normalized word tokens.
    
    Steps, in order: split the text into runs of word characters, lowercase
    every token, drop NLTK's English stopwords, then lemmatize each token
    twice (first with the lemmatizer's default part of speech, then as a
    verb, 'v').
    
    Args: 
        text (str): raw lyric of a song
    Returns:
        tokens (list): processed words of the lyric, in their original order
            (repeated words are kept).
    
    """
   
    # tokenize into words; raw string so that "\w" reaches the regex engine
    # untouched instead of being treated as an invalid string escape
    tokenizer = RegexpTokenizer(r"[\w]+")
    tokens = tokenizer.tokenize(text)
    
    # lower capitalization
    tokens = [word.lower() for word in tokens]
    
    # remove stopwords; a set makes each membership test constant-time
    # instead of scanning the whole stopword list for every token
    stop = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop]
    
    # lemmatization: default pass first, then the verb pass on its result
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(lmtzr.lemmatize(word), 'v') for word in tokens]
          
    return tokens

In [4]:
def lyric_dic(t_list):
    
    """
    Count how often each word occurs in a word list.
    
    Args:
        t_list(list) : list of words.
    Returns:
        dic(dic) : maps every distinct word to its number of occurrences;
            keys appear in order of first occurrence.
    
    """
    counts = {}
    for word in t_list:
        # a word not seen yet starts from 0
        counts[word] = counts.get(word, 0) + 1

    return counts

In [5]:
def making_song_list(data):
    
    """
    Build one record per song from the song dataframe.
    
    Each row's 'lyric' is run through preprocessing() and lyric_dic(), and a
    progress line is printed for every song.
    
    Args: 
        data(pandas.core.frame.DataFrame): dataframe with (at least) the
            columns 'title', 'artist', 'lyric' and 'duration'.
    Returns:
        Song_list(list): list of dicts with the keys 'title', 'artist',
            'lyric' (word -> count dictionary), 'duration' and 'sentiment'
            (every emotion score initialised to 0, 'dominant_emo' to None).
    """
    emotion_names = ('Positive', 'Negative', 'Anger', 'Anticipation', 'Disgust',
                     'Fear', 'Joy', 'Sadness', 'Surprise', 'Trust', 'Love')

    Song_list = []
    for _, row in data.iterrows():
        title = row['title']
        artist = row['artist']
        print(title, '-', artist, 'preprocessing...')

        word_counts = lyric_dic(preprocessing(row['lyric']))

        # every song gets its own fresh score table
        sentiment = dict.fromkeys(emotion_names, 0)
        sentiment['dominant_emo'] = None

        record = {
            'title': title,
            'artist': artist,
            'lyric': word_counts,
            'duration': row['duration'],
            'sentiment': sentiment,
        }
        Song_list.append(record)

    return Song_list

In [6]:
def making_emotion_words_dic(emotion_words):
    
    """
    Index the emotion lexicon by word so a word can be looked up directly.
    
    Args :
        emotion_words(pandas.core.frame.DataFrame) : dataframe of NRC-Emotion-lexicon;
            the first column is used as the key, all remaining columns as the value.
        
    Returns :
        emotion_words_dic(dic) : maps each first-column value to the rest of its
            row as a pandas Series (indexed by the remaining column names).
            If the same key occurs in several rows, the last row wins.
    """
    emotion_words_dic = {}
    
    for _, row in emotion_words.iterrows():
        # .iloc states the positional access explicitly; plain row[0] on a
        # label-indexed Series relies on a deprecated positional fallback
        emotion_words_dic[row.iloc[0]] = row.iloc[1:]
            
    return emotion_words_dic

In [7]:
# import collected data
data = pd.read_excel("Song/2019_data.xlsx")

# import NRC-Emotion-Lexicon
emotion_words = pd.read_csv("Song/NRC-Emotion-Lexicon.csv")

# make emotions_dictionary 
emo_d = making_emotion_words_dic(emotion_words)


# drop the rows in which at least one element is NaN
data = data.dropna(axis=0)


# making song_list which contains all information of every song
start = time.time()
song_list = making_song_list(data)
print("Making list time :", time.time() - start)


# do emotion analysis using multiprocessing

start = time.time()
# NOTE: the statements after this guarded block read `result`, so they only
# work when the guard is true.
if __name__ ==  '__main__': 
    # manager-backed queue that is handed to every worker call
    result=Manager().Queue()

    # split the songs into num_cores chunks and turn every chunk back into a
    # plain list, so the workers receive lists rather than numpy object arrays
    splited_song_list = [chunk.tolist() for chunk in np.array_split(song_list, num_cores)]

    # the context manager releases the worker processes even if a task raises;
    # starmap blocks until every chunk has been processed
    with multiprocessing.Pool(processes=num_cores) as pool:
        pool.starmap(s_a_util.start_analysis_v2,[(chunk,emo_d,result) for chunk in splited_song_list])
    
print("analysis time(multiprocessing)for",len(song_list),"songs : ", time.time() - start)  


# match multiprocessing result to song. So, update song_list's sentiment part

# 'STOP' is appended as a sentinel so the drain loop below knows when the
# queue has been emptied.
result.put('STOP')
result_dic={}
# NOTE(review): each queue item is used as (key, sentiment) and the key is
# looked up by song title below - presumably the worker puts the title first;
# songs sharing a title would overwrite each other. Verify against s_a_util.
for item in iter(result.get, 'STOP'):
    result_dic[item[0]]=item[1]

for song in song_list:
    song['sentiment']=result_dic[song['title']]


# Store result of analysis to result_2019.csv

emotion_columns = ['Positive','Negative','Anger','Anticipation','Disgust','Fear','Joy','Sadness','Surprise','Trust','Love']

# `with` guarantees the file is closed even if writing a row fails
with open('result_2019.csv','w',encoding='utf-8',newline='') as f:
    wr = csv.writer(f)
    wr.writerow(['title','artist','year','duration','dominant'] + emotion_columns)
    for song in song_list:
        sentiment = song['sentiment']
        wr.writerow([song['title'],song['artist'],'2019',song['duration'],sentiment['dominant_emo']]
                    + [sentiment[name] for name in emotion_columns])


    


Sunflower (Spider-Man: Into The Spider-Verse) - Post Malone & Swae Lee preprocessing...
Without Me - Halsey preprocessing...
Bad Guy - Billie Eilish preprocessing...
Wow. - Post Malone preprocessing...
Happier - Marshmello & Bastille preprocessing...
7 Rings - Ariana Grande preprocessing...
Talk - Khalid preprocessing...
Sicko Mode - Travis Scott preprocessing...
Sucker - Jonas Brothers preprocessing...
High Hopes - Panic! At The Disco preprocessing...
Thank U, Next - Ariana Grande preprocessing...
Truth Hurts - Lizzo preprocessing...
Dancing With A Stranger - Sam Smith & Normani preprocessing...
Senorita - Shawn Mendes & Camila Cabello preprocessing...
I Don't Care - Ed Sheeran & Justin Bieber preprocessing...
Going Bad - Meek Mill Featuring Drake preprocessing...
Shallow - Lady Gaga & Bradley Cooper preprocessing...
Better - Khalid preprocessing...
No Guidance - Chris Brown Featuring Drake preprocessing...
Girls Like You - Maroon 5 Featuring Cardi B preprocessing...
Sweet But Psycho 