In [1]:
# system import
import sys
import os
import csv
import config

# data processing
import pandas as pd
import numpy as np
#https://spacy.io/
import spacy
from spacy.tokens import Doc, Token
#https://github.com/nickdavidhaynes/spacy-cld
from spacy_cld import LanguageDetector
#https://textblob.readthedocs.io/en/dev/
from textblob import TextBlob
#https://github.com/cjhutto/vaderSentiment#installation
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#https://github.com/fnielsen/afinn
from afinn import Afinn

# set env
af = Afinn()
PROJECT_PATH = config.data_directory
AFINN_PATH = config.afinn_path

# spacy set language
nlp = spacy.load('en')
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)

In [2]:
# file operations
def read_file(file):
    text_file = open('{}'.format(file), "r")
    text = text_file.read()
    text_file.close()
    return text

# load dataFrame
df = pd.read_csv('{}/{}.csv'.format(PROJECT_PATH, 'songs'))
df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artist,artist_id,album,album_id,album_release_date,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genres
0,57bgtoPSgt236HzfBOd8kj,Thunderstruck,82,292880,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,...,4,-5.175,1,0.0364,0.000147,0.0114,0.217,0.257,133.519,"['album rock', 'australian rock', 'hard rock',..."
1,5u5F7qLDvZjBSktaDp4HxB,Moneytalks,64,225946,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,...,7,-3.677,1,0.0314,0.000841,0.000351,0.122,0.409,121.896,"['album rock', 'australian rock', 'hard rock',..."
2,7A1odihHBrI8n9k0Fefh2j,Are You Ready,65,250333,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,...,7,-4.816,1,0.0324,0.0165,4e-06,0.13,0.414,108.533,"['album rock', 'australian rock', 'hard rock',..."
3,0cLvKgKkqlaJ9UajbitH4l,Fire Your Guns,60,173746,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,...,9,-3.284,1,0.0508,0.00135,7e-06,0.0941,0.515,98.879,"['album rock', 'australian rock', 'hard rock',..."
4,4ObCns6nM6tShx5a5tHiGC,The Razors Edge,50,262533,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,...,4,-3.522,1,0.0532,0.00108,0.196,0.168,0.306,101.058,"['album rock', 'australian rock', 'hard rock',..."


In [3]:
def isTextInEnglish(text):
    doc = nlp(text)
    languages = doc._.languages
    try:
        score = doc._.language_scores['en']
        return score > 0.75
    except:
        return False

In [4]:
# text cleansing 
def clean_text(text):
    return text.replace('"',"'").replace('“','"').replace('”','"').replace('’','"').replace('‘','"')

def add_lyrics(row):
    if row['has_lyrics'] == True:
        return clean_text(read_file(row['file_name']))
    else:
        return ''
    
def get_lyric_name_path(a, t):
    aaz = a.replace(' ', '').lower()
    taz = t.replace(' ', '').lower()
    title = '{}/{}'.format(aaz, taz)
    print(title)
    file_path = '{}/{}.txt'.format(PROJECT_PATH, title.replace('/','-'))
    path = [title, file_path]

    if not os.path.exists(file_path):
        return path + [True]
    else:
        return path + [False]

In [5]:
df['lyrics'] = df.apply(lambda x: add_lyrics(x), axis=1)

In [6]:
df['is_english'] = df.apply(lambda x: isTextInEnglish(x['lyrics']), axis=1)

In [7]:
df_eng = df[df['is_english'] == True].copy()

In [8]:
# spacy entity recognition
def entityRecognition(text):
    ret = {}
    doc = nlp(text)
    for ent in doc.ents:
        ret[(ent.label, ent.text)] = ent.label_
    return ret

In [9]:
df_eng['t_entities'] = df_eng.apply(lambda x: entityRecognition(x['lyrics']), axis=1)

In [10]:
Doc.set_extension('vader_neutral', default=None, force=True)
Doc.set_extension('vader_positive', default=None, force=True)
Doc.set_extension('vader_negative', default=None, force=True)
#Token.set_extension('vader_neutral', default=None, force=True)
#Token.set_extension('vader_positive', default=None, force=True)

In [11]:
sid_obj = SentimentIntensityAnalyzer()

def vader_pipe(doc):
    sentiment_dict = sid_obj.polarity_scores(doc.text)
    
    doc._.vader_negative = sentiment_dict['neg']
    doc._.vader_neutral = sentiment_dict['neu']
    doc._.vader_positive = sentiment_dict['pos']
    
    #for sentence in doc.sents:
    #    for token in sentence:
    #        sentiment_dict_token = sid_obj.polarity_scores(token.text)
    #        token._.vader_negative = sentiment_dict_token['neg']
    #        token._.vader_neutral = sentiment_dict_token['neu']
    #        token._.vader_positive = sentiment_dict_token['pos']

    return doc

In [12]:
if nlp.has_pipe('vader_pipe'):
    nlp.remove_pipe('vader_pipe')
nlp.add_pipe(vader_pipe)

In [13]:
Doc.set_extension('tb_subjectivity', default=None, force=True)
Doc.set_extension('tb_polarity', default=None, force=True)

In [14]:
def textblob_pipe(doc):
    textBlobText = TextBlob(doc.text)
    
    doc._.tb_polarity = textBlobText.sentiment[0]
    doc._.tb_subjectivity = textBlobText.sentiment[1]

    return doc

In [15]:
if nlp.has_pipe('textblob_pipe'):
    nlp.remove_pipe('textblob_pipe')
nlp.add_pipe(textblob_pipe)

In [16]:
Doc.set_extension('sAfinn_score', default=None, force=True)
Doc.set_extension('afinn_score', default=None, force=True)

In [21]:
def scoresFromFile(termScoreFile):
    scores = {}
    with open(termScoreFile, 'r') as f:
        for line in f:
            term, score = line.split("\t")
            scores[term] = float(score)
    return scores

scores165 = scoresFromFile('{}/{}.txt'.format(AFINN_PATH, "AFINN-en-165"))
scores111 = scoresFromFile('{}/{}.txt'.format(AFINN_PATH, "AFINN-111"))

# get tokens from text, clean stopwords
def getTokens(text):
    doc = text.split()
    words_set = set([token.lower() for token in doc])
    stopwords_set = set(spacy.lang.en.stop_words.STOP_WORDS)
    contentWords = [w for w in words_set if w.lower() not in stopwords_set]
    
    return words_set

# afinn implemented analysis
def getSentiments(text, setScores = scores165):
    tokens = getTokens(text)
    score = 0
    for eachWord in tokens:
        if eachWord in setScores:
            score += setScores[eachWord]
    return score

# affin library implementarion
def afScores(text):
    return af.score(text)

def classifyScoredAfinn(scores = []):
    return [[score, 'positive'] if score > 0 
            else [score, 'negative'] if score < 0 
            else [score, 'neutral'] 
            for score in scores]

def afinn_pipe(doc):
    doc._.sAfinn_score = afScores(doc.text)
    doc._.afinn_score = getSentiments(doc.text)

    return doc

In [22]:
if nlp.has_pipe('afinn_pipe'):
    nlp.remove_pipe('afinn_pipe')
nlp.add_pipe(afinn_pipe)

In [19]:
df_eng['subjectivity'] = ''
df_eng['polarity'] = ''
df_eng['positivity'] = ''
df_eng['neutrality'] = ''
df_eng['negativity'] = ''
df_eng['safinn_score'] = ''
df_eng['afinn_score'] = ''

In [23]:
for index, row in df_eng.iterrows():
    nlpElem = nlp(row.lyrics)
    df_eng.loc[index, 'subjectivity'] = nlpElem._.tb_subjectivity
    df_eng.loc[index, 'polarity'] = nlpElem._.tb_polarity
    df_eng.loc[index, 'positivity'] = nlpElem._.vader_positive
    df_eng.loc[index, 'neutrality'] = nlpElem._.vader_neutral
    df_eng.loc[index, 'negativity'] = nlpElem._.vader_negative
    df_eng.loc[index, 'safinn_score'] = nlpElem._.sAfinn_score
    df_eng.loc[index, 'afinn_score'] = nlpElem._.afinn_score

In [24]:
df_eng.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artist,artist_id,album,album_id,album_release_date,...,lyrics,is_english,t_entities,subjectivity,polarity,positivity,neutrality,negativity,safinn_score,afinn_score
0,57bgtoPSgt236HzfBOd8kj,Thunderstruck,82,292880,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,...,Thunder [x10] I was caught In the middle of ...,True,"{(384, 'Beating'): 'GPE', (380, 'Thunderstruck...",0.399074,0.275926,0.232,0.664,0.104,22,10
1,5u5F7qLDvZjBSktaDp4HxB,Moneytalks,64,225946,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,...,"Tailored suits, chauffeured cars Fine hotels...",True,"{(390, 'the Moneytalk Come'): 'LAW', (381, 'Fr...",0.330324,0.0425926,0.025,0.961,0.014,5,4
2,7A1odihHBrI8n9k0Fefh2j,Are You Ready,65,250333,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,...,Sweet apple pie Standing in the street Hands...,True,{},0.55898,0.402925,0.349,0.643,0.009,56,2
3,0cLvKgKkqlaJ9UajbitH4l,Fire Your Guns,60,173746,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,...,Wild beast I'll make you mine Taste your kis...,True,"{(380, 'Sweet'): 'PERSON'}",0.332843,-0.000490196,0.122,0.549,0.329,-40,3
4,4ObCns6nM6tShx5a5tHiGC,The Razors Edge,50,262533,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,...,There's fightin' on the left And marching on...,True,"{(380, 'Razor'): 'PERSON'}",0.437188,-0.069161,0.035,0.803,0.162,-16,-11
