In [1]:
# Import libraries
import re
import pandas as pd
import numpy as np
from collections import defaultdict

# Set Pandas to disply all rows of dataframes
pd.set_option('display.max_rows',500)

# nltk
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ju907\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ju907\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# For implementation of parallelization

import multiprocessing
from joblib import Parallel, delayed
import time

num_cores = multiprocessing.cpu_count()
print(num_cores)

4


In [3]:
def preprocessing(text):
    
    """
    Make string(lyric) to list(words of lyric) after some processes.
    
    Args: 
        text (str): raw lyric of a song
    Returns:
        tokens (list): words list that make up the lyric.
    
    """
   
    # tokenize into words
    tokenizer = RegexpTokenizer("[\w]+")
    tokens = tokenizer.tokenize(text)
    
    # lower capitalization
    tokens = [word.lower() for word in tokens]
    
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    
    # lemmatization
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    tokens = [lmtzr.lemmatize(word, 'v') for word in tokens]
  
          
    return tokens

In [4]:
def lyric_dic(t_list):
    
    """
    Make word-list to word-dictionary.
    
    Args:
        t_list(list) : list of words.
    Returns:
        dic(dic) : dictionary of words
    
    """
    dic = {}
    for i in t_list:
        if i in dic:
            dic[i]+=1
        else:
            dic[i]=1
            
    return dic

In [10]:
def making_song_dic(data):
   
    
    """
    Make dictionary of songs from song data.
    
    Args: 
        data(pandas.core.frame.DataFrame): dataframe which contains informataions of songs
    Returns:
        Song_dictionary(dic):
    """
    
    
    Song_dictionary={}
    for i,song in data.iterrows():
        key = song['title']
        print(key,"-",song['artist'], " preprocessing...")
        Song_dictionary[key] = {'title':key,
                                'artist': song['artist'],
                                'lyric': lyric_dic(preprocessing(song['lyric'])) ,
                                'sentiment': {'Positive':0,'Negative':0,'Anger':0,'Anticipation':0,'Disgust':0,'Fear':0,'Joy':0,'Sadness':0,'Surprise':0,'Trust':0},
                                 'dominant_emo': None
                               }
        
       
        '''
        Song_dictionary[key] = {'artist': song['artist'],
                                'lyric': lyric_dic(preprocessing(song['lyric'])) ,
                                'sentiment': {'Positive':0,'Negative':0,'Anger':0,'Anticipation':0,'Disgust':0,'Fear':0,'Joy':0,'Sadness':0,'Surprise':0,'Trust':0},
                                'years': song['years'],
                                'duration': song['duration'],
                                'dominant_emo': None}
        '''
        
  
    return Song_dictionary 

In [11]:
def sentiment_analysis(song,emotion_words):

   
   """
   Sentiment(emotion) analysis for a song.
   
   Args: song(dictionary) : dicionary for all informations of a song
         emotion_words(pandas.core.frame.DataFrame): dataframe of NRC-Emotion-lexicon 
         
   
   
   """
   print(song['title'], 'is being analyzed ...') 
   for key,val in song['lyric'].items() :
        for i, row in emotion_words.iterrows():
            if key == row['Words']:
                for j in ['Positive','Negative','Anger','Anticipation','Disgust','Fear','Joy','Sadness','Surprise','Trust']:
                    if row[j] == 1:
                         song['sentiment'][j]+=val  
                            
                            
   # set dominant sentiment

   max = 0
   for i in ['Anger','Anticipation','Disgust','Fear','Joy','Sadness','Surprise','Trust']:
        if song['sentiment'][i]>max:
            max = song['sentiment'][i]
            dominant_emotion = i
   song['dominant_emo'] = dominant_emotion

In [12]:
# Import selected data
data = pd.read_excel("data/2019_data.xlsx")

# Import NRC-EmotiON_Lexicon
emotion_words = pd.read_csv("data/NRC-Emotion-Lexicon.csv")

# Drop the rows that at least one element is NAN
data = data.dropna(axis=0)

# It takes too much time to analysis I split data --> I have to find a solution for this
data = data.loc[40:41]

# test_lyric = test_lyric.dropna(how='all')
# test_lyric = test_lyric['lyric'].dropna(inplace=True)

# Making song_dic which contains all informations of every song
song_dic = making_song_dic(data)
#print(song_dic)


# Doing sentiment(emotion) analysis
start = time.time()
for key,val in song_dic.items():
    sentiment_analysis(val,emotion_words)
    print(key,"-",val['artist'],": ", val['dominant_emo'])
print("time :", time.time() - start)



A Lot - 21 Savage  preprocessing...
ME! - Taylor Swift Featuring Brendon Urie  preprocessing...
A Lot is being analyzed ...
A Lot - 21 Savage :  Surprise
ME! is being analyzed ...
ME! - Taylor Swift Featuring Brendon Urie :  Joy
time : 205.11143684387207
