In [1]:
# system import
import sys
import os
import csv
import config
import random
import time

# data processing
import pandas as pd
import numpy
import spacy
from spacy_cld import LanguageDetector

# data load
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from bs4 import BeautifulSoup
import requests

In [2]:
# set env
# Base directory used for the songs CSV and the cached lyric text files.
PROJECT_PATH = config.data_directory

# Spotify client authenticated with the id/secret pair held in the config module.
client_credentials_manager = SpotifyClientCredentials(
    client_id=config.client_id,
    client_secret=config.client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# try to not get banned
# Pool of browser User-Agent strings; getFromUrl picks one at random per request.
# Every entry must end with a comma: adjacent string literals are silently
# concatenated by Python, which would merge two agents into one invalid header.
USER_AGENTS = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36 OPR/58.0.3135.107',
]

def getFromUrl(url, user_agent=True, timeout=30):
    """Fetch `url` with a GET request after a random politeness delay.

    Parameters
    ----------
    url : str
        Address to request.
    user_agent : bool, default True
        When true, send a User-Agent header picked at random from USER_AGENTS;
        otherwise send the library's default headers.
    timeout : float or tuple, default 30
        Seconds to wait for the server before `requests` raises a timeout
        error. Without a timeout a stalled connection blocks forever.

    Returns
    -------
    requests.Response
    """
    # Sleep 1-10 seconds between requests to keep the request rate low.
    time.sleep(random.randint(1,10))
    headers = {'User-Agent': random.choice(USER_AGENTS)} if user_agent else None
    return requests.get(url, headers=headers, timeout=timeout)

def getLyric(title):
    """Request the AZLyrics page for `title` and return the response.

    `title` is inserted verbatim into the lyrics URL, so it must already be in
    the `artist/song` form that the URL template expects.
    """
    return getFromUrl("https://www.azlyrics.com/lyrics/{}.html".format(title))

def seachLyric(title, artist):
    """Search AZLyrics for `title` and fetch the first hit credited to `artist`.

    Parameters
    ----------
    title : str
        Song title; spaces become '+' and the text is lowercased for the query.
    artist : str
        Artist name that must match the text of a <b> tag in the result list.

    Returns
    -------
    requests.Response or None
        Response for the matched result link, or None when no result matches
        or the matched entry has no usable link.
    """
    url = "https://search.azlyrics.com/search.php?q={}".format(title.replace(' ', '+').lower())
    c = getFromUrl(url)
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed (and to avoid bs4's parser warning).
    soup = BeautifulSoup(c.content, 'html.parser')
    match = soup.find('b', string=artist)
    if match is None:
        return None
    # The song link is the <a> that precedes the artist's <b> tag.
    anchor = match.find_previous_sibling('a')
    if anchor is None or not anchor.get('href'):
        return None
    return getFromUrl(anchor['href'])

In [3]:
# file operations
def read_file(file):
    """Return the full text content of `file`.

    The file is decoded as UTF-8, matching the encoding used when the lyric
    files are written, and is closed even if reading fails.

    Parameters
    ----------
    file : str or path-like
        Path of the text file to read.

    Returns
    -------
    str
    """
    with open(str(file), "r", encoding='utf-8') as text_file:
        return text_file.read()

def get_lyric_name_path(a, t):
    """Derive the lyric key and local file path for an artist/title pair.

    Both inputs have their spaces removed and are lowercased, then joined as
    `artist/title`. The key is printed as a side effect.

    Returns
    -------
    list
        `[key, file_path, missing]` where `file_path` is
        `PROJECT_PATH/<artist>-<title>.txt` and `missing` is True when that
        file does not exist yet.
    """
    artist_key, title_key = (part.replace(' ', '').lower() for part in (a, t))
    title = '/'.join((artist_key, title_key))
    print(title)
    file_path = '{}/{}.txt'.format(PROJECT_PATH, title.replace('/','-'))
    return [title, file_path, not os.path.exists(file_path)]

def fetch_lylic_to_file(a, t, aa = None, tt = None):
    """Download the lyrics of one song into its local text file.

    Parameters
    ----------
    a, t : str
        Normalised artist and title used to build the lyric URL and file name.
    aa : str, optional
        Original artist name, used to match results of the fallback search.
    tt : str, optional
        Original title; accepted for call compatibility, currently unused.

    Returns
    -------
    int
        1 when the lyric file already exists or was written, 0 when no lyrics
        could be retrieved.
    """
    title, file_path, is_missing = get_lyric_name_path(a, t)

    # get_lyric_name_path flags the file as missing with True; only then is a
    # download needed. (The check used to be inverted: it re-downloaded
    # existing files and reported missing ones as already present.)
    if not is_missing:
        print('File Already Exists')
        return 1

    result = getLyric(title)

    # Direct URL failed: fall back to the site search.
    if result.status_code != requests.codes.ok:
        result = seachLyric(t, aa)

    if result is None:
        return 0

    # Explicit parser keeps behaviour independent of optional parser packages.
    soup = BeautifulSoup(result.content, 'html.parser')

    # The lyrics live in the first <div> that follows the "ringtone" block.
    ringtone = soup.find("div", class_="ringtone")
    if ringtone is None:
        return 0
    html_lyrics = ringtone.find_next_sibling('div')
    if html_lyrics is None:
        return 0

    # Flatten to a single line and swap apostrophes for double quotes.
    cleanText = html_lyrics.get_text().replace('\r','').replace('\n',' ').replace("'",'"')

    with open(file_path, "w", encoding='utf-8') as fo:
        fo.write(cleanText)

    return 1

In [4]:
# spotify interaction
def fetch_songs(offset, count, name = 'songs', limit = 50, query = 'year:1990'):
    """Search Spotify for tracks and append them to `PROJECT_PATH/<name>.csv`.

    Parameters
    ----------
    offset : int
        Search offset of the first page to request.
    count : int
        Number of pages to request.
    name : str, default 'songs'
        CSV file name without extension.
    limit : int, default 50
        Number of tracks requested per page.
    query : str, default 'year:1990'
        Spotify search query.

    Notes
    -----
    The header row is written only when the file does not exist yet. All pages
    are fetched before the file is opened, so a failing API call cannot leave
    behind an empty, header-less file that later runs would append to.
    """
    file_path = '{}/{}.csv'.format(PROJECT_PATH, name)
    header = ['id', 'name', 'popularity', 'duration_ms', 'explicit', 'artist', 'artist_id', 'album', 'album_id', 'album_release_date', 'album_release_date_precision', 'album_total_tracks']

    rows = []
    # The end of the range is relative to `offset`, so `count` pages are
    # requested regardless of where the search starts.
    for page_offset in range(offset, offset + count * limit, limit):
        track_results = sp.search(q=query, type='track', limit=limit, offset=page_offset)
        for t in track_results['tracks']['items']:
            artist = t['artists'][0]  # only the first listed artist is kept
            album = t['album']
            rows.append([t['id'], t['name'], t['popularity'], t['duration_ms'], t['explicit'], artist['name'], artist['id'], album['name'], album['id'], album['release_date'], album['release_date_precision'], album['total_tracks']])

    is_new_file = not os.path.exists(file_path)
    with open(file_path, 'w' if is_new_file else 'a', newline = '', encoding='utf-8') as csvFile:
        writer = csv.writer(csvFile)
        if is_new_file:
            writer.writerow(header)
        writer.writerows(rows)

In [None]:
# Request 10 search pages (default 50 tracks each) starting at offset 0 into songs.csv.
fetch_songs(offset=0, count=10)

In [5]:
# load dataFrame
# Read the songs CSV from the project data directory.
songs_csv_path = '{}/{}.csv'.format(PROJECT_PATH, 'songs')
df = pd.read_csv(songs_csv_path)

In [None]:
# get audio features

ids = df['id'].tolist()

# Track ids are sent to the audio-features endpoint in batches of at most n.
n = 100
x = -(-len(ids) // n)  # ceiling division: number of batches
rg = range(x)
cIds = [ids[start:start + n] for start in range(0, x * n, n)]

features = []

for cId in cIds:
    audio_results = sp.audio_features(cId)
    # Falsy entries (tracks without audio features) are skipped.
    features.extend(
        [af[key] for key in ('id', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo')]
        for af in audio_results
        if af
    )

In [None]:
# One row of audio features per track id.
dfa = pd.DataFrame(
    features,
    columns=[
        'id', 'danceability', 'energy', 'key', 'loudness', 'mode',
        'speechiness', 'acousticness', 'instrumentalness', 'liveness',
        'valence', 'tempo',
    ],
)

# Inner join: only songs that have audio features are kept.
df = df.merge(dfa, how='inner', on='id')

In [6]:
# save dataframe to csv file
def dataframe_to_csv(pDf, name = 'songs'):
    """Write `pDf` to `PROJECT_PATH/<name>.csv`.

    The file is comma-separated, UTF-8 encoded and written without the index
    column.
    """
    pDf.to_csv(
        '{}/{}.csv'.format(PROJECT_PATH, name),
        sep=',',
        encoding='utf-8',
        index=False,
    )

In [None]:
# Persist the current frame to the default songs.csv.
dataframe_to_csv(pDf=df)

In [None]:
# Preview the first rows of the song table.
df.head()

In [7]:
# spacy set language
# Load the 'en' spaCy model and append a LanguageDetector component to its
# pipeline. isTextInEnglish reads doc._.language_scores, which is presumably
# populated by this component (confirm against the spacy_cld documentation).
nlp = spacy.load('en')
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)

def isTextInEnglish(text):
    """Return True when `text` gets an English language score above 0.75.

    The text is run through the `nlp` pipeline and the 'en' entry of
    `doc._.language_scores` is compared with the threshold.

    Returns
    -------
    bool
        False when no usable English score is available for the text.
    """
    doc = nlp(text)
    try:
        return doc._.language_scores['en'] > 0.75
    except (KeyError, TypeError):
        # No 'en' entry (or a non-numeric score): treat as not English rather
        # than hiding every possible error behind a bare except.
        return False

In [12]:
# text cleansing 
def clean_text(text):
    """Normalise quote characters in `text`.

    Straight double quotes become apostrophes; curly double quotes and curly
    single quotes all become straight double quotes.
    """
    # NOTE(review): curly single quotes map to a double quote, not an
    # apostrophe - confirm this is intended.
    quote_map = str.maketrans({
        '"': "'",
        '“': '"',
        '”': '"',
        '’': '"',
        '‘': '"',
    })
    return text.translate(quote_map)

def add_lyrics(row):
    """Return the cleaned lyric text for a song row.

    Reads the file named in `row['file_name']` and normalises its quotes when
    `row['has_lyrics']` is truthy; otherwise returns an empty string.
    """
    if not row['has_lyrics']:
        return ''
    raw_lyrics = read_file(row['file_name'])
    return clean_text(raw_lyrics)

In [8]:
# az data
# Replace everything except ASCII letters and digits with a space.
# The range must be 'A-Z': 'A-z' also spans the punctuation between 'Z' and
# 'a' ([ \ ] ^ _ `), which would then survive into the lyric key.
pattern = r'[^a-z0-9A-Z]'
# regex=True is required: from pandas 2.0 str.replace treats the pattern as a
# literal string by default and would replace nothing.
df['p_name'] = df['name'].str.replace(pattern, ' ', regex=True).str.lower()
df['p_artist'] = df['artist'].str.replace(pattern, ' ', regex=True).str.lower()
# Local lyric file path for every song (get_lyric_name_path also prints each key).
df['file_name'] = df.apply(lambda x: get_lyric_name_path(x['p_artist'], x['p_name'])[1], axis=1)
# Lyrics availability flag, filled in by the fetch cells.
df['has_lyrics'] = 0

acdc/thunderstruck
eltonjohn/dontgobreakingmyheart
aliceinchains/maninthebox
bellbivdevoe/poison
theymightbegiants/istanbul
atribecalledquest/canikickit
extreme/morethanwords
theromantics/whatilikeaboutyou
pantera/cowboysfromhell
mchammer/ucanttouchthis
theblackcrowes/hardtohandle
warrant/cherrypie
digitalunderground/thehumptydance
wilsonphillips/holdon
joediffie/johndeeregreen
thelas/thereshegoes
scorpions/windofchange
depechemode/enjoythesilencesinglemix
beegees/stayinalivefromsaturdaynightfeversoundtrack
joediffie/pickupman
ramjam/blackbetty
therighteousbrothers/unchainedmelody
sinadoconnor/nothingcompares2u
templeofthedog/hungerstrike
keithwhitley/dontcloseyoureyes
juangabriel/abrzamemuyfuerte
pantera/cemeterygates
man/rayandoelsol
joediffie/propmeupbesidethejukeboxifidie
llcoolj/mamasaidknockyouout
megadeth/holywarsthepunishmentdueremastered2004
joediffie/pickupman
tonytoniton/feelsgood
vanillaice/iceicebaby
poison/unskinnybop
heart/alliwannadoismakelovetoyou
envogue/holdon
hifive

janesaddiction/stop
warrant/uncletomscabin
eltonjohn/sadsongssaysomuch
james/sitdown
enigma/meaculpa
megadeth/fivemagicsremastered2004
bennygoodman/stompinatthesavoy
firehouse/allshewrotelive
thebangles/hazyshadeofwinter
losprisioneros/trenalsur
markchesnutt/brotherjukebox
charlestrenet/lamer
marciagriffiths/electricboogie
madonna/crazyforyoueditversion
bettemidler/fromadistance
harryconnickjr/weareinlove
motherlovebone/crownofthorns
davidbowie/heroessingleversion1990remasteredversion
iggypop/candy
pantera/thesleep
dannyelfman/icedance
warrant/isawred
wilsonphillips/youreinlove
thejudds/lovecanbuildabridge
extreme/getthefunkout
tka/louderthanlove
icecube/amerikkkasmostwanted
digitalunderground/freaksoftheindustry
theblackcrowes/seeingthings
extreme/holehearted
sergeiprokofiev/lieutenantkijop60iiromance
fleetwoodmac/loveisdangerous
tesla/signsliveatthetrocadero1990
danzig/devilsplaything
slayer/skeletonsofsociety
gruponiche/buscapordentro
danieljohnston/somethingslastalongtime
petshopbo

In [None]:
# fetch from internet
# Download lyrics song by song, storing each result flag as soon as it is known.
song_fields = zip(df.index, df['p_artist'], df['p_name'], df['artist'], df['name'])
for idx, p_artist, p_name, artist, song_name in song_fields:
    df.loc[idx, 'has_lyrics'] = fetch_lylic_to_file(p_artist, p_name, artist, song_name)

In [10]:
# fetch from local
# Flag every song whose lyric file is already on disk; other rows are left as they are.
for idx, lyric_path in df['file_name'].items():
    if os.path.isfile(lyric_path):
        df.loc[idx, 'has_lyrics'] = True

In [None]:
# Attach the cleaned lyric text ('' when a song has no lyrics) and flag English lyrics.
df['lyrics'] = df.apply(add_lyrics, axis=1)
df['is_english'] = df['lyrics'].map(isTextInEnglish)

In [None]:
# Inspect the first 30 rows, including the new lyrics and is_english columns.
df.head(30)

In [None]:
# spacy entity recognition
def entityRecognition(text):
    """Run the `nlp` pipeline on `text` and collect its named entities.

    Returns
    -------
    dict
        Maps `(ent.label, ent.text)` to `ent.label_` for every entity in
        `doc.ents`; repeated (label, text) pairs collapse into one entry.
    """
    return {(ent.label, ent.text): ent.label_ for ent in nlp(text).ents}

In [None]:
# Entity dictionary for each song's lyrics.
df['t_entities'] = df['lyrics'].map(entityRecognition)

In [None]:
# Preview the first rows with the extracted entities.
df.head()