In [1]:
# system import
import sys
import os
import csv
import config

# data processing
import pandas as pd
import numpy
import spacy
from spacy_cld import LanguageDetector
from afinn import Afinn

# set env
af = Afinn()
PROJECT_PATH = config.data_directory
AFINN_PATH = config.afinn_path

# spacy set language
nlp = spacy.load('en')
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)

In [2]:
# load dataFrame
df = pd.read_csv('{}/{}.csv'.format(PROJECT_PATH, 'songs'))
df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artist,artist_id,album,album_id,album_release_date,...,liveness,valence,tempo,p_name,p_artist,file_name,has_lyrics,lyrics,is_english,t_entities
0,57bgtoPSgt236HzfBOd8kj,Thunderstruck,82,292880,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,...,0.217,0.257,133.519,thunderstruck,ac dc,./data/acdc-thunderstruck.txt,True,Thunder [x10] I was caught In the middle of ...,True,"{(384, 'Beating'): 'GPE', (380, 'Thunderstruck..."
1,5pKJtX4wBeby9qIfFhyOJj,Don't Go Breaking My Heart,71,271133,False,Elton John,3PhoLpVuITZKcymswpck5b,To Be Continued...,7iKDBfOFUtg8E8RbuDhiux,1990-11-08,...,0.0685,0.779,131.312,don t go breaking my heart,elton john,./data/eltonjohn-dontgobreakingmyheart.txt,True,Don't go breaking my heart I couldn't if I t...,True,"{(383, 'Baby'): 'ORG'}"
2,6gZVQvQZOFpzIy3HblJ20F,Man in the Box,72,284426,True,Alice In Chains,64tNsm6TnZe2zpcMVMOoHL,Facelift,5LbHbwejgZXRZAgzVAjkhj,1990,...,0.213,0.644,106.463,man in the box,alice in chains,./data/aliceinchains-maninthebox.txt,True,I'm the man in the box [Album version:] Buri...,True,"{(380, 'Jesus Christ'): 'PERSON', (383, 'Music..."
3,6m59VvDUi0UQsB2eZ9wVbH,Poison,68,261853,False,Bell Biv DeVoe,2zFZiWQJFFshzojycnXoTL,Poison,6H5mxGUWguDjtQ4Uzd8veD,1990-01-01,...,0.612,0.803,111.848,poison,bell biv devoe,./data/bellbivdevoe-poison.txt,True,"[Michael Bivins:] Yeah, Spiderman and Freeze...",True,"{(380, 'Michael Bivins'): 'PERSON', (380, 'Spi..."
4,63vL5oxWrlvaJ0ayNaQnbX,Istanbul,73,153813,False,They Might Be Giants,6zB02lwP6L6ZH32nggQiJT,Flood,7FwAtuhhWivxvK4aPgyyUD,1990-01-02,...,0.136,0.892,114.144,istanbul,they might be giants,./data/theymightbegiants-istanbul.txt,True,Istanbul was Constantinople Now it's Istanbu...,True,"{(384, 'Istanbul'): 'GPE', (383, 'Constantinop..."


In [3]:
def isTextInEnglish(text):
    doc = nlp(text)
    languages = doc._.languages
    try:
        score = doc._.language_scores['en']
        return score > 0.75
    except:
        return False

In [4]:
# text cleansing 
def clean_text(text):
    return text.replace('"',"'").replace('“','"').replace('”','"').replace('’','"').replace('‘','"')

def add_lyrics(row):
    if row['has_lyrics']:
        return clean_text(read_file(row['file_name']))
    else:
        return ''
    
def get_lyric_name_path(a, t):
    aaz = a.replace(' ', '').lower()
    taz = t.replace(' ', '').lower()
    title = '{}/{}'.format(aaz, taz)
    print(title)
    file_path = '{}/{}.txt'.format(PROJECT_PATH, title.replace('/','-'))
    path = [title, file_path]

    if not os.path.exists(file_path):
        return path + [True]
    else:
        return path + [False]

In [5]:
# az data
pattern = r'[^a-z0-9A-z]'
df['p_name'] = df['name'].str.replace(pattern, ' ').str.lower()
df['p_artist'] = df['artist'].str.replace(pattern, ' ').str.lower()
df['file_name'] = df.apply(lambda x: get_lyric_name_path(x['p_artist'], x['p_name'])[1], axis=1)
df['has_lyrics'] = 0

acdc/thunderstruck
eltonjohn/dontgobreakingmyheart
aliceinchains/maninthebox
bellbivdevoe/poison
theymightbegiants/istanbul
atribecalledquest/canikickit
extreme/morethanwords
theromantics/whatilikeaboutyou
pantera/cowboysfromhell
mchammer/ucanttouchthis
theblackcrowes/hardtohandle
warrant/cherrypie
digitalunderground/thehumptydance
wilsonphillips/holdon
joediffie/johndeeregreen
thelas/thereshegoes
scorpions/windofchange
depechemode/enjoythesilencesinglemix
beegees/stayinalivefromsaturdaynightfeversoundtrack
joediffie/pickupman
ramjam/blackbetty
therighteousbrothers/unchainedmelody
sinadoconnor/nothingcompares2u
templeofthedog/hungerstrike
keithwhitley/dontcloseyoureyes
juangabriel/abrzamemuyfuerte
pantera/cemeterygates
man/rayandoelsol
joediffie/propmeupbesidethejukeboxifidie
llcoolj/mamasaidknockyouout
megadeth/holywarsthepunishmentdueremastered2004
joediffie/pickupman
tonytoniton/feelsgood
vanillaice/iceicebaby
poison/unskinnybop
heart/alliwannadoismakelovetoyou
envogue/holdon
hifive

janesaddiction/stop
warrant/uncletomscabin
eltonjohn/sadsongssaysomuch
james/sitdown
enigma/meaculpa
megadeth/fivemagicsremastered2004
bennygoodman/stompinatthesavoy
firehouse/allshewrotelive
thebangles/hazyshadeofwinter
losprisioneros/trenalsur
markchesnutt/brotherjukebox
charlestrenet/lamer
marciagriffiths/electricboogie
madonna/crazyforyoueditversion
bettemidler/fromadistance
harryconnickjr/weareinlove
motherlovebone/crownofthorns
davidbowie/heroessingleversion1990remasteredversion
iggypop/candy
pantera/thesleep
dannyelfman/icedance
warrant/isawred
wilsonphillips/youreinlove
thejudds/lovecanbuildabridge
extreme/getthefunkout
tka/louderthanlove
icecube/amerikkkasmostwanted
digitalunderground/freaksoftheindustry
theblackcrowes/seeingthings
extreme/holehearted
sergeiprokofiev/lieutenantkijop60iiromance
fleetwoodmac/loveisdangerous
tesla/signsliveatthetrocadero1990
danzig/devilsplaything
slayer/skeletonsofsociety
gruponiche/buscapordentro
danieljohnston/somethingslastalongtime
petshopbo

In [6]:
df['lyrics'] = df.apply(lambda x: add_lyrics(x), axis=1)
df['is_english'] = df.apply(lambda x: isTextInEnglish(x['lyrics']), axis=1)

In [7]:
# spacy entity recognition
def entityRecognition(text):
    ret = {}
    doc = nlp(text)
    for ent in doc.ents:
        ret[(ent.label, ent.text)] = ent.label_
    return ret

In [8]:
df['t_entities'] = df.apply(lambda x: entityRecognition(x['lyrics']), axis=1)

In [9]:
df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artist,artist_id,album,album_id,album_release_date,...,liveness,valence,tempo,p_name,p_artist,file_name,has_lyrics,lyrics,is_english,t_entities
0,57bgtoPSgt236HzfBOd8kj,Thunderstruck,82,292880,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,...,0.217,0.257,133.519,thunderstruck,ac dc,./data/acdc-thunderstruck.txt,0,,False,{}
1,5pKJtX4wBeby9qIfFhyOJj,Don't Go Breaking My Heart,71,271133,False,Elton John,3PhoLpVuITZKcymswpck5b,To Be Continued...,7iKDBfOFUtg8E8RbuDhiux,1990-11-08,...,0.0685,0.779,131.312,don t go breaking my heart,elton john,./data/eltonjohn-dontgobreakingmyheart.txt,0,,False,{}
2,6gZVQvQZOFpzIy3HblJ20F,Man in the Box,72,284426,True,Alice In Chains,64tNsm6TnZe2zpcMVMOoHL,Facelift,5LbHbwejgZXRZAgzVAjkhj,1990,...,0.213,0.644,106.463,man in the box,alice in chains,./data/aliceinchains-maninthebox.txt,0,,False,{}
3,6m59VvDUi0UQsB2eZ9wVbH,Poison,68,261853,False,Bell Biv DeVoe,2zFZiWQJFFshzojycnXoTL,Poison,6H5mxGUWguDjtQ4Uzd8veD,1990-01-01,...,0.612,0.803,111.848,poison,bell biv devoe,./data/bellbivdevoe-poison.txt,0,,False,{}
4,63vL5oxWrlvaJ0ayNaQnbX,Istanbul,73,153813,False,They Might Be Giants,6zB02lwP6L6ZH32nggQiJT,Flood,7FwAtuhhWivxvK4aPgyyUD,1990-01-02,...,0.136,0.892,114.144,istanbul,they might be giants,./data/theymightbegiants-istanbul.txt,0,,False,{}


In [11]:
def scoresFromFile(termScoreFile):
    scores = {}
    with open(termScoreFile, 'r') as f:
        for line in f:
            term, score = line.split("\t")
            scores[term] = float(score)
    return scores

scores165 = scoresFromFile('{}/{}.txt'.format(AFINN_PATH, "AFINN-en-165"))
scores111 = scoresFromFile('{}/{}.txt'.format(AFINN_PATH, "AFINN-111"))

# get tokens from text, clean stopwords
def getTokens(text):
    doc = nlp(text)
    words_set = set([token.text.lower() for token in doc])
    stopwords_set = set(spacy.lang.en.stop_words.STOP_WORDS)
    contentWords = [w for w in words_set if w.lower() not in stopwords_set]
    
    return words_set

# afinn implemented analysis
def getSentiments(text, setScores = scores165):
    tokens = getTokens(text)
    score = 0
    for eachWord in tokens:
        if eachWord in setScores:
            score += setScores[eachWord]
    return score

# affin library implementarion
def afScores(text):
    return af.score(text)

def classifyScoredAfinn(scores = []):
    return [[score, 'positive'] if score > 0 
            else [score, 'negative'] if score < 0 
            else [score, 'neutral'] 
            for score in scores]