In [1]:
# system import
import sys
import os
import csv
import config

# data processing
import pandas as pd
import numpy as np
#https://spacy.io/
import spacy
from spacy.tokens import Doc, Token
#https://github.com/nickdavidhaynes/spacy-cld
from spacy_cld import LanguageDetector
#https://textblob.readthedocs.io/en/dev/
from textblob import TextBlob
#https://github.com/cjhutto/vaderSentiment#installation
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#https://github.com/fnielsen/afinn
from afinn import Afinn

# project locations, read from the local config module
PROJECT_PATH = config.data_directory
AFINN_PATH = config.afinn_path

# scorer object from the afinn package
af = Afinn()

# spacy pipeline: load en_core_web_sm, then append the spacy_cld language detector
nlp = spacy.load("en_core_web_sm")
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)

In [7]:
# file operations
def read_file(file):
    """Return the full contents of a UTF-8 text file.

    Args:
        file: path of the file to read (formatted to a string before opening).

    Returns:
        str: everything in the file.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the content is not valid UTF-8.
    """
    # the context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee
    with open('{}'.format(file), "r", encoding='utf-8') as text_file:
        return text_file.read()

# save dataframe to csv file
def dataframe_to_csv(pDf, name = 'engSongs'):
    """Write `pDf` to '<PROJECT_PATH>/<name>.csv'.

    The file is comma-separated, UTF-8 encoded and written without the
    DataFrame index.

    Args:
        pDf: the DataFrame to save.
        name: file name without the '.csv' extension.
    """
    target = '{}/{}.csv'.format(PROJECT_PATH, name)
    pDf.to_csv(target, index=False, encoding='utf-8', sep=',')
    
# load the songs table from <PROJECT_PATH>/songs.csv and preview the first rows
songs_csv_path = '{}/{}.csv'.format(PROJECT_PATH, 'songs')
df = pd.read_csv(songs_csv_path)
df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artist,artist_id,album,album_id,album_release_date,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genres
0,57bgtoPSgt236HzfBOd8kj,Thunderstruck,80,292880,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,...,4,-5.175,1,0.0364,0.000147,0.0114,0.217,0.257,133.519,"['album rock', 'australian rock', 'hard rock',..."
1,5u5F7qLDvZjBSktaDp4HxB,Moneytalks,62,225946,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,...,7,-3.677,1,0.0314,0.000841,0.000351,0.122,0.409,121.896,"['album rock', 'australian rock', 'hard rock',..."
2,5pKJtX4wBeby9qIfFhyOJj,Don't Go Breaking My Heart,69,271133,False,Elton John,3PhoLpVuITZKcymswpck5b,To Be Continued...,7iKDBfOFUtg8E8RbuDhiux,1990-11-08,...,5,-7.79,1,0.0416,0.196,1.1e-05,0.0685,0.779,131.312,"['glam rock', 'mellow gold', 'piano rock', 'ro..."
3,0EANX0OVKSCcmarY50Xa4p,Can You Feel the Love Tonight - End Title/ Fro...,57,241013,False,Elton John,3PhoLpVuITZKcymswpck5b,The Lion King,3YA5DdB3wSz4pdfEXoMyRd,1994-01-01,...,10,-17.077,1,0.0337,0.88,0.000346,0.117,0.167,121.498,"['glam rock', 'mellow gold', 'piano rock', 'ro..."
4,7em38Hh2sKwnZ1HDgxvg82,Circle Of Life,47,290093,False,Elton John,3PhoLpVuITZKcymswpck5b,Love Songs,1bnoqPJyaxMCtYuSprcBTD,1995-11-06,...,10,-8.089,1,0.0369,0.283,0.44,0.107,0.157,160.45,"['glam rock', 'mellow gold', 'piano rock', 'ro..."


In [3]:
def isTextInEnglish(text, threshold=0.75):
    """Tell whether the language detector scores `text` as English.

    The text is run through the module-level `nlp` pipeline and the 'en'
    entry of `doc._.language_scores` is compared with `threshold`.

    Args:
        text: the text to classify.
        threshold: value the 'en' score must exceed. Defaults to 0.75, the
            cut-off this notebook has always used.

    Returns:
        True when an 'en' score exists and is greater than `threshold`;
        False when there is no 'en' score or the score lookup fails.
    """
    doc = nlp(text)
    try:
        return doc._.language_scores['en'] > threshold
    except Exception:
        # Best effort: a missing 'en' entry (or a failing score lookup) is
        # treated as "not English". Catching Exception instead of using a
        # bare except lets KeyboardInterrupt still stop a long run.
        return False

In [4]:
# text cleansing 
# One-pass character mapping used by clean_text.
_QUOTE_TRANSLATION = str.maketrans({
    '"': "'",   # straight double quote -> apostrophe
    '“': '"',   # curly double quotes -> straight double quote
    '”': '"',
    '’': "'",   # curly single quotes / apostrophes -> straight apostrophe
    '‘': "'",
})


def clean_text(text):
    """Normalise quote characters in `text`.

    Straight double quotes become apostrophes, curly double quotes become
    straight double quotes, and curly single quotes become apostrophes.
    Curly single quotes used to be turned into double quotes, which
    changed contractions such as "don’t" into 'don"t'.

    Args:
        text: the string to clean.

    Returns:
        str: `text` with the quote characters replaced.
    """
    return text.translate(_QUOTE_TRANSLATION)

def add_lyrics(row):
    """Return the cleaned lyrics for one song row.

    Args:
        row: mapping-like record with 'has_lyrics' and 'file_name' entries.

    Returns:
        str: the content of row['file_name'] passed through clean_text when
        row['has_lyrics'] equals True, otherwise an empty string.
    """
    # guard clause: nothing to read unless the row is flagged as having lyrics
    if row['has_lyrics'] != True:
        return ''
    raw_lyrics = read_file(row['file_name'])
    return clean_text(raw_lyrics)
    
def get_lyric_name_path(a, t):
    """Build the lyric title and file path for an artist/track pair.

    Spaces are removed from both names and they are lower-cased; the title
    is '<artist>/<track>' and the file is '<PROJECT_PATH>/<title>.txt' with
    every '/' in the title replaced by '-'.

    Args:
        a: artist name.
        t: track name.

    Returns:
        list: [title, file_path, flag], where flag is True when no file
        exists at file_path and False when one does.
        NOTE(review): the flag is True for a *missing* file; confirm that
        this is the polarity callers expect.
    """
    artist_key = a.replace(' ', '').lower()
    track_key = t.replace(' ', '').lower()
    title = '{}/{}'.format(artist_key, track_key)
    file_path = '{}/{}.txt'.format(PROJECT_PATH, title.replace('/','-'))
    is_missing = not os.path.exists(file_path)
    return [title, file_path, is_missing]

In [8]:
# attach the lyrics text to every song row (add_lyrics returns '' when has_lyrics is not True)
df['lyrics'] = df.apply(add_lyrics, axis=1)

In [9]:
# flag the songs whose lyrics are detected as English; only the lyrics column
# is needed, so apply on that Series instead of building a row object per song
df['is_english'] = df['lyrics'].apply(isTextInEnglish)

In [10]:
# independent copy holding only the rows flagged as English
df_eng = df.loc[df['is_english'] == True].copy()

In [11]:
# spacy entity recognition
def entityRecognition(text):
    """Collect the named entities spaCy finds in `text`.

    Args:
        text: the text to analyse with the module-level `nlp` pipeline.

    Returns:
        dict: maps (ent.label, ent.text) to ent.label_ for every entity in
        doc.ents; identical (label, text) pairs collapse into one entry.
    """
    return {(ent.label, ent.text): ent.label_ for ent in nlp(text).ents}

In [12]:
# named entities per song; only the lyrics column is needed, so apply on that
# Series instead of building a row object per song
df_eng['t_entities'] = df_eng['lyrics'].apply(entityRecognition)

In [13]:
# Doc extensions that vader_pipe fills in; force=True lets this cell be re-run
for _vader_attr in ('vader_neutral', 'vader_positive', 'vader_negative'):
    Doc.set_extension(_vader_attr, default=None, force=True)

In [14]:
sid_obj = SentimentIntensityAnalyzer()

def vader_pipe(doc):
    """Pipeline component storing the VADER polarity scores on the Doc.

    Reads the 'neg', 'neu' and 'pos' entries of
    sid_obj.polarity_scores(doc.text) and writes them, as floats, to
    doc._.vader_negative, doc._.vader_neutral and doc._.vader_positive.

    Args:
        doc: the spaCy Doc being processed.

    Returns:
        The same Doc, so it can continue through the pipeline.
    """
    polarity = sid_obj.polarity_scores(doc.text)
    extension_for_key = (
        ('vader_negative', 'neg'),
        ('vader_neutral', 'neu'),
        ('vader_positive', 'pos'),
    )
    for extension, key in extension_for_key:
        setattr(doc._, extension, float(polarity[key]))
    return doc

In [15]:
# drop a previously registered copy first so re-running this cell does not fail
if 'vader_pipe' in nlp.pipe_names:
    nlp.remove_pipe('vader_pipe')
nlp.add_pipe(vader_pipe)

In [16]:
# Doc extensions that textblob_pipe fills in; force=True lets this cell be re-run
for _tb_attr in ('tb_subjectivity', 'tb_polarity'):
    Doc.set_extension(_tb_attr, default=None, force=True)

In [17]:
def textblob_pipe(doc):
    """Pipeline component storing the TextBlob sentiment on the Doc.

    The first element of TextBlob(doc.text).sentiment goes to
    doc._.tb_polarity and the second to doc._.tb_subjectivity, both as
    floats.

    Args:
        doc: the spaCy Doc being processed.

    Returns:
        The same Doc, so it can continue through the pipeline.
    """
    polarity, subjectivity = TextBlob(doc.text).sentiment
    doc._.tb_polarity = float(polarity)
    doc._.tb_subjectivity = float(subjectivity)
    return doc

In [18]:
# drop a previously registered copy first so re-running this cell does not fail
if 'textblob_pipe' in nlp.pipe_names:
    nlp.remove_pipe('textblob_pipe')
nlp.add_pipe(textblob_pipe)

In [19]:
# Doc extensions that afinn_pipe fills in; force=True lets this cell be re-run
for _afinn_attr in ('afinn_score', 'custom_afinn_score'):
    Doc.set_extension(_afinn_attr, default=None, force=True)

In [20]:
def scoresFromFile(termScoreFile):
    """Load a tab-separated term/score lexicon into a dict.

    Every non-blank line must hold a term and a numeric score separated by
    a single tab character. Blank lines (for example a trailing empty line)
    are skipped instead of breaking the unpacking.

    Args:
        termScoreFile: path of the lexicon file, read as UTF-8 like the
            lyric files in this notebook.

    Returns:
        dict: term -> score as float. A term listed twice keeps its last
        score.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a non-blank line does not split into exactly two
            tab-separated fields, or the score is not a number.
    """
    scores = {}
    # explicit encoding: do not depend on the platform default for the lexicon
    with open(termScoreFile, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            term, score = line.split("\t")
            scores[term] = float(score)
    return scores

# load both AFINN lexicon files found under AFINN_PATH
scores165, scores111 = (
    scoresFromFile('{}/{}.txt'.format(AFINN_PATH, lexicon_name))
    for lexicon_name in ("AFINN-en-165", "AFINN-111")
)

# lexicon-based score computed in this notebook (see afScores for the afinn library version)
def getSentiments(doc, setScores = scores111):
    """Sum the lexicon scores of the distinct lemmas in `doc`.

    Only tokens whose text is alphabetic and that are not stop words are
    considered. Their lemmas are lower-cased and de-duplicated, so a word
    counts once no matter how often it occurs.

    Args:
        doc: iterable of spaCy tokens (a Doc).
        setScores: mapping term -> score; defaults to scores111.

    Returns:
        The sum of setScores[lemma] over the distinct lemmas present in
        setScores; 0 when none match.
    """
    distinct_lemmas = set(
        token.lemma_.lower()
        for token in doc
        if token.text.isalpha() and not token.is_stop
    )
    return sum(setScores[lemma] for lemma in distinct_lemmas if lemma in setScores)

# afinn library implementation
def afScores(text):
    """Return the score the module-level Afinn instance `af` gives `text`."""
    library_score = af.score(text)
    return library_score

def classifyScoredAfinn(scores = []):
    """Pair every score with a sentiment label.

    Args:
        scores: iterable of numeric scores (not modified).

    Returns:
        list: one [score, label] list per input score, where label is
        'positive' for scores above 0, 'negative' for scores below 0 and
        'neutral' otherwise.
    """
    def _label(value):
        if value > 0:
            return 'positive'
        if value < 0:
            return 'negative'
        return 'neutral'

    return [[value, _label(value)] for value in scores]

def afinn_pipe(doc):
    """Pipeline component storing both AFINN scores on the Doc.

    doc._.afinn_score receives afScores(doc.text) and
    doc._.custom_afinn_score receives getSentiments(doc).

    Args:
        doc: the spaCy Doc being processed.

    Returns:
        The same Doc, so it can continue through the pipeline.
    """
    library_score = afScores(doc.text)
    doc._.afinn_score = library_score
    lexicon_score = getSentiments(doc)
    doc._.custom_afinn_score = lexicon_score
    return doc

In [21]:
# drop a previously registered copy first so re-running this cell does not fail
if 'afinn_pipe' in nlp.pipe_names:
    nlp.remove_pipe('afinn_pipe')
nlp.add_pipe(afinn_pipe)

In [22]:
# Pre-create the sentiment columns as floats. The scores written into them
# later are floats, and filling an integer column with floats relies on an
# implicit dtype upcast that newer pandas versions warn about or reject.
for _score_column in (
    'subjectivity',
    'polarity',
    'positivity',
    'neutrality',
    'negativity',
    'afinn_score',
    'custom_afinn_score',
):
    df_eng[_score_column] = 0.0

In [23]:
# Run every lyric through the spaCy pipeline once and copy the document-level
# sentiment extensions into the matching df_eng columns.
for index, row in df_eng.iterrows():
    nlpElem = nlp(row.lyrics)
    # fix: the TextBlob subjectivity used to be stored in a stray variable
    # (df_num_date), so the 'subjectivity' column was never filled
    df_eng.loc[index, 'subjectivity'] = nlpElem._.tb_subjectivity
    df_eng.loc[index, 'polarity'] = nlpElem._.tb_polarity
    df_eng.loc[index, 'positivity'] = nlpElem._.vader_positive
    df_eng.loc[index, 'neutrality'] = nlpElem._.vader_neutral
    df_eng.loc[index, 'negativity'] = nlpElem._.vader_negative
    df_eng.loc[index, 'afinn_score'] = nlpElem._.afinn_score
    df_eng.loc[index, 'custom_afinn_score'] = nlpElem._.custom_afinn_score

In [24]:
# validate column types: show the columns that are neither float64 nor int64
df_eng.select_dtypes(exclude=[np.float64, np.int64])

Unnamed: 0,id,name,explicit,artist,artist_id,album,album_id,album_release_date,p_name,p_artist,file_name,has_lyrics,genres,lyrics,is_english,t_entities
0,57bgtoPSgt236HzfBOd8kj,Thunderstruck,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,thunderstruck,ac dc,./data/acdc-thunderstruck.txt,True,"['album rock', 'australian rock', 'hard rock',...",Thunder [x10] I was caught In the middle of ...,True,"{(384, 'Beating'): 'GPE', (380, 'Thunderstruck..."
1,5u5F7qLDvZjBSktaDp4HxB,Moneytalks,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,moneytalks,ac dc,./data/acdc-moneytalks.txt,True,"['album rock', 'australian rock', 'hard rock',...","Tailored suits, chauffeured cars Fine hotels...",True,"{(390, 'the Moneytalk Come'): 'LAW', (381, 'Fr..."
2,5pKJtX4wBeby9qIfFhyOJj,Don't Go Breaking My Heart,False,Elton John,3PhoLpVuITZKcymswpck5b,To Be Continued...,7iKDBfOFUtg8E8RbuDhiux,1990-11-08,don t go breaking my heart,elton john,./data/eltonjohn-dontgobreakingmyheart.txt,True,"['glam rock', 'mellow gold', 'piano rock', 'ro...",Don't go breaking my heart I couldn't if I t...,True,"{(383, 'Baby'): 'ORG'}"
4,7em38Hh2sKwnZ1HDgxvg82,Circle Of Life,False,Elton John,3PhoLpVuITZKcymswpck5b,Love Songs,1bnoqPJyaxMCtYuSprcBTD,1995-11-06,circle of life,elton john,./data/eltonjohn-circleoflife.txt,True,"['glam rock', 'mellow gold', 'piano rock', 'ro...","From the day we arrive on the planet And, bl...",True,"{(391, 'the day'): 'DATE', (384, 'Keeps'): 'GPE'}"
5,6gZVQvQZOFpzIy3HblJ20F,Man in the Box,True,Alice In Chains,64tNsm6TnZe2zpcMVMOoHL,Facelift,5LbHbwejgZXRZAgzVAjkhj,1990,man in the box,alice in chains,./data/aliceinchains-maninthebox.txt,True,"['alternative metal', 'alternative rock', 'gru...",I'm the man in the box [Album version:] Buri...,True,"{(380, 'Jesus Christ'): 'PERSON', (383, 'Music..."
6,1gh6flppAA19XDw30g5LEN,Grind,False,Alice In Chains,64tNsm6TnZe2zpcMVMOoHL,Alice In Chains,49R4Qye4UUwzjPPQhtCkRe,1995-10-30,grind,alice in chains,./data/aliceinchains-grind.txt,True,"['alternative metal', 'alternative rock', 'gru...","In the darkest hole, you'd be well advised N...",True,"{(392, 'morning'): 'TIME', (380, 'Hear'): 'PER..."
9,6m59VvDUi0UQsB2eZ9wVbH,Poison,False,Bell Biv DeVoe,2zFZiWQJFFshzojycnXoTL,Poison,6H5mxGUWguDjtQ4Uzd8veD,1990-01-01,poison,bell biv devoe,./data/bellbivdevoe-poison.txt,True,"['boy band', 'funk', 'hip hop', 'hip pop', 'ne...","[Michael Bivins:] Yeah, Spiderman and Freeze...",True,"{(380, 'Michael Bivins'): 'PERSON', (380, 'Spi..."
10,3Ti0GdlrotgwsAVBBugv0I,Can I Kick It?,False,A Tribe Called Quest,09hVIj6vWgoCDtT03h8ZCa,People's Instinctive Travels and the Paths of ...,3kV0i1qqudjf0PGawJ4jck,1990-04-17,can i kick it,a tribe called quest,./data/atribecalledquest-canikickit.txt,True,"['alternative hip hop', 'conscious hip hop', '...","[Q-Tip] Can I kick it? (Yes, you can!) [7X]...",True,"{(384, 'Quest'): 'GPE', (380, 'Gettin'): 'PERS..."
11,0301nLjG0ti26rx5ZmfqtP,Scenario - LP Mix,False,A Tribe Called Quest,09hVIj6vWgoCDtT03h8ZCa,The Low End Theory,1p12OAWwudgMqfMzjMvl2a,1991-09-24,scenario lp mix,a tribe called quest,./data/atribecalledquest-scenariolpmix.txt,True,"['alternative hip hop', 'conscious hip hop', '...",[Busta Rhymes:] Here in 1992 we present The ...,True,"{(380, 'Busta Rhymes'): 'PERSON', (391, '1992'..."
12,4HfxDJ0uLHTLe0fZrx0MbQ,Check the Rhime,False,A Tribe Called Quest,09hVIj6vWgoCDtT03h8ZCa,The Low End Theory,1p12OAWwudgMqfMzjMvl2a,1991-09-24,check the rhime,a tribe called quest,./data/atribecalledquest-checktherhime.txt,True,"['alternative hip hop', 'conscious hip hop', '...",[Q:] Check the rhyme y'all. [Q:] Back in th...,True,"{(391, 'the days'): 'DATE', (384, 'Linden'): '..."


In [25]:
# show the float64 / int64 columns, including the new sentiment scores
df_eng.select_dtypes(include=[np.float64, np.int64])

Unnamed: 0,popularity,duration_ms,album_total_tracks,danceability,energy,key,loudness,mode,speechiness,acousticness,...,liveness,valence,tempo,subjectivity,polarity,positivity,neutrality,negativity,afinn_score,custom_afinn_score
0,80,292880,12,0.501,0.889,4,-5.175,1,0.0364,0.000147,...,0.2170,0.257,133.519,0.399074,0.275926,0.232,0.664,0.104,22.0,10.0
1,62,225946,12,0.650,0.903,7,-3.677,1,0.0314,0.000841,...,0.1220,0.409,121.896,0.330324,0.042593,0.025,0.961,0.014,5.0,6.0
2,69,271133,67,0.743,0.858,5,-7.790,1,0.0416,0.196000,...,0.0685,0.779,131.312,0.563757,0.154497,0.009,0.971,0.019,16.0,-1.0
4,47,290093,15,0.276,0.607,10,-8.089,1,0.0369,0.283000,...,0.1070,0.157,160.450,0.538571,0.221006,0.125,0.846,0.029,14.0,4.0
5,70,284426,12,0.346,0.765,8,-8.558,1,0.0413,0.002290,...,0.2130,0.644,106.463,0.566667,0.058333,0.079,0.768,0.153,-3.0,-4.0
6,48,284640,12,0.393,0.882,11,-5.884,1,0.0367,0.000146,...,0.5070,0.364,88.417,0.560185,0.272963,0.200,0.723,0.077,9.0,0.0
9,66,261853,10,0.829,0.573,6,-10.316,0,0.2240,0.002160,...,0.6120,0.803,111.848,0.613473,0.241411,0.097,0.833,0.070,6.0,2.0
10,69,251573,17,0.848,0.666,0,-6.547,1,0.2740,0.173000,...,0.1290,0.744,96.662,0.393786,0.194229,0.168,0.802,0.030,39.0,23.0
11,60,250133,14,0.795,0.600,2,-14.004,1,0.3410,0.112000,...,0.1110,0.633,101.780,0.555173,0.080836,0.114,0.794,0.093,-31.0,-13.0
12,58,216693,14,0.881,0.361,11,-14.728,0,0.2370,0.050200,...,0.2460,0.799,96.430,0.742915,0.306203,0.098,0.846,0.056,14.0,5.0


In [26]:
# store the explicit flag as an integer instead of a boolean
df_eng = df_eng.astype({"explicit": int})

In [27]:
# preview only: the re-indexed frame is displayed, df_eng itself keeps 'id' as a column
df_eng.set_index(keys='id')

Unnamed: 0_level_0,name,popularity,duration_ms,explicit,artist,artist_id,album,album_id,album_release_date,album_total_tracks,...,lyrics,is_english,t_entities,subjectivity,polarity,positivity,neutrality,negativity,afinn_score,custom_afinn_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
57bgtoPSgt236HzfBOd8kj,Thunderstruck,80,292880,0,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,12,...,Thunder [x10] I was caught In the middle of ...,True,"{(384, 'Beating'): 'GPE', (380, 'Thunderstruck...",0.399074,0.275926,0.232,0.664,0.104,22.0,10.0
5u5F7qLDvZjBSktaDp4HxB,Moneytalks,62,225946,0,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,12,...,"Tailored suits, chauffeured cars Fine hotels...",True,"{(390, 'the Moneytalk Come'): 'LAW', (381, 'Fr...",0.330324,0.042593,0.025,0.961,0.014,5.0,6.0
5pKJtX4wBeby9qIfFhyOJj,Don't Go Breaking My Heart,69,271133,0,Elton John,3PhoLpVuITZKcymswpck5b,To Be Continued...,7iKDBfOFUtg8E8RbuDhiux,1990-11-08,67,...,Don't go breaking my heart I couldn't if I t...,True,"{(383, 'Baby'): 'ORG'}",0.563757,0.154497,0.009,0.971,0.019,16.0,-1.0
7em38Hh2sKwnZ1HDgxvg82,Circle Of Life,47,290093,0,Elton John,3PhoLpVuITZKcymswpck5b,Love Songs,1bnoqPJyaxMCtYuSprcBTD,1995-11-06,15,...,"From the day we arrive on the planet And, bl...",True,"{(391, 'the day'): 'DATE', (384, 'Keeps'): 'GPE'}",0.538571,0.221006,0.125,0.846,0.029,14.0,4.0
6gZVQvQZOFpzIy3HblJ20F,Man in the Box,70,284426,1,Alice In Chains,64tNsm6TnZe2zpcMVMOoHL,Facelift,5LbHbwejgZXRZAgzVAjkhj,1990,12,...,I'm the man in the box [Album version:] Buri...,True,"{(380, 'Jesus Christ'): 'PERSON', (383, 'Music...",0.566667,0.058333,0.079,0.768,0.153,-3.0,-4.0
1gh6flppAA19XDw30g5LEN,Grind,48,284640,0,Alice In Chains,64tNsm6TnZe2zpcMVMOoHL,Alice In Chains,49R4Qye4UUwzjPPQhtCkRe,1995-10-30,12,...,"In the darkest hole, you'd be well advised N...",True,"{(392, 'morning'): 'TIME', (380, 'Hear'): 'PER...",0.560185,0.272963,0.200,0.723,0.077,9.0,0.0
6m59VvDUi0UQsB2eZ9wVbH,Poison,66,261853,0,Bell Biv DeVoe,2zFZiWQJFFshzojycnXoTL,Poison,6H5mxGUWguDjtQ4Uzd8veD,1990-01-01,10,...,"[Michael Bivins:] Yeah, Spiderman and Freeze...",True,"{(380, 'Michael Bivins'): 'PERSON', (380, 'Spi...",0.613473,0.241411,0.097,0.833,0.070,6.0,2.0
3Ti0GdlrotgwsAVBBugv0I,Can I Kick It?,69,251573,0,A Tribe Called Quest,09hVIj6vWgoCDtT03h8ZCa,People's Instinctive Travels and the Paths of ...,3kV0i1qqudjf0PGawJ4jck,1990-04-17,17,...,"[Q-Tip] Can I kick it? (Yes, you can!) [7X]...",True,"{(384, 'Quest'): 'GPE', (380, 'Gettin'): 'PERS...",0.393786,0.194229,0.168,0.802,0.030,39.0,23.0
0301nLjG0ti26rx5ZmfqtP,Scenario - LP Mix,60,250133,0,A Tribe Called Quest,09hVIj6vWgoCDtT03h8ZCa,The Low End Theory,1p12OAWwudgMqfMzjMvl2a,1991-09-24,14,...,[Busta Rhymes:] Here in 1992 we present The ...,True,"{(380, 'Busta Rhymes'): 'PERSON', (391, '1992'...",0.555173,0.080836,0.114,0.794,0.093,-31.0,-13.0
4HfxDJ0uLHTLe0fZrx0MbQ,Check the Rhime,58,216693,0,A Tribe Called Quest,09hVIj6vWgoCDtT03h8ZCa,The Low End Theory,1p12OAWwudgMqfMzjMvl2a,1991-09-24,14,...,[Q:] Check the rhyme y'all. [Q:] Back in th...,True,"{(391, 'the days'): 'DATE', (384, 'Linden'): '...",0.742915,0.306203,0.098,0.846,0.056,14.0,5.0


In [28]:
# write the English songs with their scores to <PROJECT_PATH>/engSongs.csv
dataframe_to_csv(df_eng, name='engSongs')