In [13]:
import nltk
# Fetch the NLTK resources this notebook relies on for tokenizing and
# part-of-speech tagging; NLTK reports "already up-to-date" when a package
# is present locally.
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)

from collections import Counter

[nltk_data] Downloading package punkt to /home/keyur/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/keyur/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [14]:
def pos_count(text):
    """Count part-of-speech categories in ``text``.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; each tag is then assigned to at most one category.

    Returns a list of nine integers, in this order:
    coordinating conjunctions (``CC``), prepositions (``IN``),
    adjectives (``JJ*``), nouns (``NN*``), pronouns (``PRP``/``PRP$``),
    adverbs (``RB*``), ``TO``, interjections (``UH``), verbs (``VB*``).
    Tags outside these categories are ignored.
    """
    # Tags that must match exactly, mapped to their slot in the result list.
    exact_slots = {'CC': 0, 'IN': 1, 'PRP': 4, 'PRP$': 4, 'TO': 6, 'UH': 7}
    # Tag families matched by prefix, mapped to their slot in the result list.
    prefix_slots = (('JJ', 2), ('NN', 3), ('RB', 5), ('VB', 8))

    counts = [0] * 9
    for _token, tag in nltk.pos_tag(nltk.word_tokenize(text)):
        slot = exact_slots.get(tag)
        if slot is None:
            slot = next(
                (index for prefix, index in prefix_slots if tag.startswith(prefix)),
                None,
            )
        if slot is not None:
            counts[slot] += 1
    return counts



In [15]:
import os

# Build the lookup  word -> {emotion: score}  from the per-emotion files.
# Each file name starts with the emotion label followed by '-', and each
# non-blank line holds a word followed by an integer score.
scores = {}
for filename in os.listdir('OneFilePerEmotion'):
    # The emotion label is the part of the file name before the first '-'.
    emotion = filename.split('-')[0]
    with open(os.path.join('OneFilePerEmotion', filename)) as f:
        # Stream the file line by line instead of loading it whole.
        for line in f:
            words = line.split()
            if not words:
                # Blank line: nothing to record (words[0] would raise IndexError).
                continue
            # Create the per-word dict on first sight, then store this emotion's score.
            scores.setdefault(words[0], {})[emotion] = int(words[1])

In [16]:

def get_emotion_scores(text, scores=scores):
    """Sum the lexicon scores of every word in ``text``, per emotion.

    ``text`` is split on whitespace and each piece is looked up in
    ``scores`` exactly as written (no case or punctuation normalization).
    Words missing from ``scores`` contribute nothing.

    Returns a list of ten totals in this fixed order: anger, anticipation,
    disgust, fear, joy, negative, positive, sadness, surprise, trust.
    Raises ``KeyError`` if ``scores`` holds an emotion outside that list.
    """
    emotion_order = ('anger', 'anticipation', 'disgust', 'fear', 'joy',
                     'negative', 'positive', 'sadness', 'surprise', 'trust')
    totals = dict.fromkeys(emotion_order, 0)

    for word in text.split():
        if word not in scores:
            continue
        for emotion, value in scores[word].items():
            totals[emotion] += value

    return [totals[emotion] for emotion in emotion_order]

In [18]:
# check all songs in uniqueSongs.csv
# add the emotion score and pos count to the DataFrame
import pandas as pd
import langdetect


# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('uniqueSongs.csv')
# remove duplicate ids
df = df.drop_duplicates(subset='id', keep='first')

# Output column names. Their order must match the lists returned by
# get_emotion_scores() and pos_count() respectively.
emotions = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative', 'positive', 'sadness', 'surprise', 'trust']
pos_tags = ['Coordinate conjunction', 'Preposition or subordinating conjunction', 'Adjective', 'Noun', 'Pronoun', 'Adverb', 'TO', 'Interjection', 'Verb']

# langdetect is non-deterministic by default; fixing the seed makes the
# English / non-English decision reproducible across runs.
langdetect.DetectorFactory.seed = 0

# iterate over the rows of the DataFrame
for index, row in df.iterrows():
    # read the lyrics file; close it before the slow NLP work starts
    with open('lyrics/' + row['song_path']) as f:
        lyrics = f.read()
    if len(lyrics) == 0:
        continue
    # keep only lyrics detected as English
    try:
        if langdetect.detect(lyrics) != 'en':
            continue
    except Exception:
        # Detection failed: report the file and skip it. Catching Exception
        # (not a bare except) lets Ctrl-C still interrupt the loop.
        print(row['song_path'])
        continue
    # get the emotion scores
    emotion_scores = get_emotion_scores(lyrics)
    # get the pos counts
    pos_counts = pos_count(lyrics)
    # add the emotion scores to the DataFrame
    for column, value in zip(emotions, emotion_scores):
        df.at[index, column] = value
    # add the pos counts to the DataFrame
    for column, value in zip(pos_tags, pos_counts):
        df.at[index, column] = value

# Save the DataFrame to a new CSV file
df.to_csv('uniqueSongsWithEmotionScores.csv', index=False)