In [None]:
# system import
import sys
import os
import csv
import config
import random
import time

# data processing
import pandas as pd
import numpy
import spacy
from spacy_cld import LanguageDetector

# data load
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from bs4 import BeautifulSoup
import requests

In [None]:
# set env: data directory and Spotify client, both driven by the local config module
PROJECT_PATH = config.data_directory
client_credentials_manager = SpotifyClientCredentials(
    client_id=config.client_id,
    client_secret=config.client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# try to not get banned: pool of browser User-Agent strings, one of which is
# picked at random for each request made by getFromUrl.
# Every entry must end with a comma -- without it Python silently concatenates
# two adjacent string literals into a single (invalid) user agent.
USER_AGENTS = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36 OPR/58.0.3135.107',
]

def getFromUrl(url, user_agent=True):
    """GET ``url`` after a random pause and return the ``requests`` response.

    url: address to fetch.
    user_agent: when truthy, send a User-Agent header chosen at random from
        USER_AGENTS; otherwise send the request without custom headers.
    """
    # Wait a random whole number of seconds between 1 and 10 before each request.
    time.sleep(random.randint(1,10))
    if not user_agent:
        return requests.get(url)
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    return requests.get(url, headers=headers)

def getLyric(title):
    """Fetch the azlyrics lyrics page whose URL path segment is ``title``.

    ``title`` is inserted verbatim into the URL; the result of getFromUrl is
    returned unchanged.
    """
    return getFromUrl("https://www.azlyrics.com/lyrics/{}.html".format(title))

def seachLyric(title, artist):
    """Search azlyrics for ``title`` and fetch the first hit credited to ``artist``.

    title: song title; it is lower-cased and URL-encoded into the query string.
    artist: text that a ``<b>`` element of the result page must contain exactly;
        the ``<a>`` sibling preceding the first such element supplies the link.

    Returns the response of getFromUrl for that link, or None when no matching
    result (or no usable link) is found.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote_plus

    # quote_plus turns spaces into '+' and escapes characters such as '&' or '#'
    # that would otherwise corrupt the query string.
    url = "https://search.azlyrics.com/search.php?q={}".format(quote_plus(title.lower()))
    c = getFromUrl(url)
    soup = BeautifulSoup(c.content)
    links = soup.find_all('b', string=artist)
    if len(links) == 0:
        return None

    anchor = links[0].find_previous_sibling('a')
    # A matching <b> without a preceding <a href=...> is treated as "not found".
    if anchor is None or not anchor.get('href'):
        return None
    return getFromUrl(anchor['href'])

In [None]:
# file operations
def read_file(file):
    """Return the full text content of ``file``.

    The file is decoded as UTF-8, matching the encoding used when the lyrics
    files are written, and the handle is closed even if reading fails.
    """
    with open('{}'.format(file), "r", encoding='utf-8') as text_file:
        return text_file.read()

def get_lyric_name_path(a, t):
    """Build the lyrics key and local file path for an artist/title pair.

    a: artist name; t: song title. Spaces are removed and both are lower-cased.

    Prints the '<artist>/<title>' key and returns a three-item list:
    [key, file path under PROJECT_PATH, flag], where the flag is True when the
    file does NOT exist yet and False when it is already on disk.
    """
    artist_key = a.replace(' ', '').lower()
    title_key = t.replace(' ', '').lower()
    title = '{}/{}'.format(artist_key, title_key)
    print(title)
    # The on-disk name uses '-' where the key uses '/'.
    file_path = '{}/{}.txt'.format(PROJECT_PATH, title.replace('/','-'))
    is_missing = not os.path.exists(file_path)
    return [title, file_path, is_missing]

def fetch_lylic_to_file(a, t, aa = None, tt = None):
    """Make sure the lyrics for a song are stored locally, downloading if needed.

    a, t: artist and title used to build the azlyrics key and the local path
        (see get_lyric_name_path).
    aa: artist text used to pick a result when falling back to the search page.
    tt: accepted for call compatibility; not used.

    Returns 1 when the lyrics file already exists or was written, 0 when the
    lyrics could not be found.
    """
    title, file_path, is_missing = get_lyric_name_path(a, t)

    # The third element is True when the file is MISSING, so the cached case is
    # the False one. (The previous check was inverted: it re-downloaded cached
    # songs and reported missing ones as already existing.)
    if not is_missing:
        print('File Already Exists')
        return 1

    result = getLyric(title)

    # Direct URL failed: fall back to the site search.
    if result.status_code != requests.codes.ok:
        result = seachLyric(t, aa)

    if result is None:
        return 0

    soup = BeautifulSoup(result.content)
    ringtones = soup.find_all("div", class_="ringtone")
    if len(ringtones) == 0:
        return 0

    # The lyrics are taken from the first <div> sibling after the "ringtone" div.
    html_lyrics = ringtones[0].find_next_sibling('div')
    if html_lyrics is None:
        return 0

    # Flatten to one line; apostrophes are stored as double quotes.
    cleanText = html_lyrics.get_text().replace('\r','').replace('\n',' ').replace("\'",'"')

    with open(file_path, "w", encoding='utf-8') as fo:
        fo.write(cleanText)

    return 1

In [None]:
# spotify interaction
def fetch_songs(offset, count, name = 'songs', limit = 50):
    """Download pages of Spotify track search results into a CSV file.

    offset: search offset of the first page.
    count: number of pages to request.
    name: base name of the CSV file written under PROJECT_PATH.
    limit: number of tracks requested per page.

    The search query is fixed to 'year:1990'. Rows are appended when the file
    already exists; otherwise the file is created and starts with a header row.
    Only the first artist of each track is recorded.
    """
    file_path = '{}/{}.csv'.format(PROJECT_PATH, name)

    # Fetch everything first so a failing request does not leave behind an
    # empty, header-less file.
    rows = []
    # `count` pages starting at `offset`. The previous upper bound
    # (count * limit) ignored the offset and shrank or emptied the range
    # whenever offset > 0.
    for page_offset in range(offset, offset + count * limit, limit):
        track_results = sp.search(q='year:1990', type='track', limit=limit, offset=page_offset)
        for t in track_results['tracks']['items']:
            rows.append([t['id'], t['name'], t['popularity'], t['duration_ms'], t['explicit'], t['artists'][0]['name'], t['artists'][0]['id'],  t['album']['name'], t['album']['id'], t['album']['release_date'], t['album']['release_date_precision'], t['album']['total_tracks']])

    if os.path.exists(file_path):
        mode = 'a'
        tracks = rows
    else:
        mode = 'w'
        header = ['id', 'name', 'popularity', 'duration_ms', 'explicit', 'artist', 'artist_id', 'album', 'album_id', 'album_release_date', 'album_release_date_precision', 'album_total_tracks']
        tracks = [header] + rows

    # The with-block closes the file; no explicit close() is needed.
    with open(file_path, mode, newline = '', encoding='utf-8') as csvFile:
        csv.writer(csvFile).writerows(tracks)

In [None]:
# Request 10 pages of search results starting at offset 0 (default page size and file name).
fetch_songs(0, 10)

In [None]:
# load dataFrame: read songs.csv from PROJECT_PATH (the default file written by fetch_songs)
df = pd.read_csv('{}/{}.csv'.format(PROJECT_PATH, 'songs'))

In [None]:
# get audio features

ids = df['id'].tolist()

# Split the track ids into consecutive chunks of n, one sp.audio_features call per chunk
# (n = 100 is presumably the per-request id cap of that endpoint -- TODO confirm).
n = 100
x = (len(ids) + n - 1) // n
rg = range(x)
cIds = [ids[n * i:n * (i + 1)] for i in rg]

# Fields copied from each audio-features record, in output column order.
AUDIO_FEATURE_KEYS = [
    'id', 'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo',
]

features = []

for cId in cIds:
    audio_results = sp.audio_features(cId)
    for af in audio_results:
        # Falsy entries (no features returned for that id) are skipped.
        if not af:
            continue
        features.append([af[key] for key in AUDIO_FEATURE_KEYS])

In [None]:
# One row per track for which audio features were collected.
dfa = pd.DataFrame(
    features,
    columns=[
        'id', 'danceability', 'energy', 'key', 'loudness', 'mode',
        'speechiness', 'acousticness', 'instrumentalness', 'liveness',
        'valence', 'tempo',
    ],
)

# Inner join on 'id': tracks without a row in dfa are dropped from df.
df = df.merge(dfa, on='id', how='inner')

In [None]:
# save dataframe to csv file
def dataframe_to_csv(pDf, name = 'songs'):
    """Write ``pDf`` to '<PROJECT_PATH>/<name>.csv'.

    The file is comma-separated, UTF-8 encoded and written without the index
    column; an existing file of the same name is replaced.
    """
    pDf.to_csv(
        '{}/{}.csv'.format(PROJECT_PATH, name),
        index=False,
        encoding='utf-8',
        sep=',',
    )

In [None]:
# Persist the merged frame; with the default name this overwrites songs.csv under PROJECT_PATH.
dataframe_to_csv(df)

In [None]:
# Preview the first rows of the merged frame.
df.head()

In [None]:
# Load the spaCy 'en' pipeline and append the spacy_cld language detector to it,
# so documents produced by `nlp` expose the `doc._.languages` / `doc._.language_scores`
# extensions read by isTextInEnglish.
nlp = spacy.load('en')
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)

def isTextInEnglish(text):
    """Return True when the language detector scores ``text`` as English above 0.75.

    text: string run through the module-level ``nlp`` pipeline.

    Returns False when no usable 'en' score is available (best effort).
    """
    doc = nlp(text)
    try:
        return doc._.language_scores['en'] > 0.75
    except Exception:
        # Best effort: a missing or unusable 'en' score means "not English".
        # Catching Exception (rather than a bare except) lets KeyboardInterrupt
        # and SystemExit propagate, so a long run can still be interrupted.
        return False

In [None]:
# text cleansing 
def clean_text(text):
    """Normalise quote characters in ``text``.

    Straight double quotes become single quotes; curly double quotes and curly
    single quotes all become straight double quotes.
    """
    # A single translation pass matches the chained replaces: the straight
    # double quote is remapped first there, so the quotes produced from curly
    # characters are never remapped again.
    quote_map = str.maketrans({
        '"': "'",
        '“': '"',
        '”': '"',
        '’': '"',
        '‘': '"',
    })
    return text.translate(quote_map)

def add_lyrics(row):
    """Return the cleaned lyrics text for ``row``, or '' when it has none.

    row: mapping/Series with 'has_lyrics' (truthy when a lyrics file is
        available) and 'file_name' (path of that file).
    """
    if not row['has_lyrics']:
        return ''
    raw_text = read_file(row['file_name'])
    return clean_text(raw_text)

In [None]:
# az data: build the normalised artist/title columns used for lyrics lookups.
# Every character that is not an ASCII letter or digit becomes a space.
# (The range was previously 'A-z', which also let [ \ ] ^ _ ` through.)
pattern = r'[^a-z0-9A-Z]'
# regex=True is passed explicitly: newer pandas releases treat the pattern as a
# literal string by default, which would silently skip the clean-up.
df['p_name'] = df['name'].str.replace(pattern, ' ', regex=True).str.lower()
df['p_artist'] = df['artist'].str.replace(pattern, ' ', regex=True).str.lower()
# Local lyrics file path for each track (element [1] of get_lyric_name_path's result).
df['file_name'] = df.apply(lambda x: get_lyric_name_path(x['p_artist'], x['p_name'])[1], axis=1)
# Start with "no lyrics" everywhere; later cells update the flag.
df['has_lyrics'] = 0

In [None]:
# fetch from internet: try to obtain lyrics for every track and store the
# 1/0 status returned by fetch_lylic_to_file in has_lyrics.
for index, row in df.iterrows():
    df.loc[index, 'has_lyrics'] = fetch_lylic_to_file(row['p_artist'], row['p_name'], row['artist'], row['name'])

In [10]:
# fetch from local: flag rows whose lyrics file is already on disk.
# Rows without a file keep whatever has_lyrics value they already had.
for index, row in df.iterrows():
    exists = os.path.isfile(row['file_name'])
    if exists:
        df.loc[index, 'has_lyrics'] = exists

In [11]:
# Load the cleaned lyrics text per row ('' when has_lyrics is falsy) ...
df['lyrics'] = df.apply(add_lyrics, axis=1)
# ... then flag the rows whose lyrics are detected as English.
df['is_english'] = df.apply(lambda row: isTextInEnglish(row['lyrics']), axis=1)

In [12]:
# Inspect the first 30 rows, including the new lyrics / is_english columns.
df.head(30)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artist,artist_id,album,album_id,album_release_date,...,instrumentalness,liveness,valence,tempo,p_name,p_artist,file_name,has_lyrics,lyrics,is_english
0,57bgtoPSgt236HzfBOd8kj,Thunderstruck,82,292880,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,...,0.0114,0.217,0.257,133.519,thunderstruck,ac dc,./data/acdc-thunderstruck.txt,True,Thunder [x10] I was caught In the middle of ...,True
1,5pKJtX4wBeby9qIfFhyOJj,Don't Go Breaking My Heart,71,271133,False,Elton John,3PhoLpVuITZKcymswpck5b,To Be Continued...,7iKDBfOFUtg8E8RbuDhiux,1990-11-08,...,1.1e-05,0.0685,0.779,131.312,don t go breaking my heart,elton john,./data/eltonjohn-dontgobreakingmyheart.txt,True,Don't go breaking my heart I couldn't if I t...,True
2,6gZVQvQZOFpzIy3HblJ20F,Man in the Box,72,284426,True,Alice In Chains,64tNsm6TnZe2zpcMVMOoHL,Facelift,5LbHbwejgZXRZAgzVAjkhj,1990,...,8e-06,0.213,0.644,106.463,man in the box,alice in chains,./data/aliceinchains-maninthebox.txt,True,I'm the man in the box [Album version:] Buri...,True
3,6m59VvDUi0UQsB2eZ9wVbH,Poison,68,261853,False,Bell Biv DeVoe,2zFZiWQJFFshzojycnXoTL,Poison,6H5mxGUWguDjtQ4Uzd8veD,1990-01-01,...,0.00432,0.612,0.803,111.848,poison,bell biv devoe,./data/bellbivdevoe-poison.txt,True,"[Michael Bivins:] Yeah, Spiderman and Freeze...",True
4,63vL5oxWrlvaJ0ayNaQnbX,Istanbul,73,153813,False,They Might Be Giants,6zB02lwP6L6ZH32nggQiJT,Flood,7FwAtuhhWivxvK4aPgyyUD,1990-01-02,...,0.00453,0.136,0.892,114.144,istanbul,they might be giants,./data/theymightbegiants-istanbul.txt,True,Istanbul was Constantinople Now it's Istanbu...,True
5,3Ti0GdlrotgwsAVBBugv0I,Can I Kick It?,71,251573,False,A Tribe Called Quest,09hVIj6vWgoCDtT03h8ZCa,People's Instinctive Travels and the Paths of ...,3kV0i1qqudjf0PGawJ4jck,1990-04-17,...,0.000699,0.129,0.744,96.662,can i kick it,a tribe called quest,./data/atribecalledquest-canikickit.txt,True,"[Q-Tip] Can I kick it? (Yes, you can!) [7X]...",True
6,1gVgkQFOKa8Wc1HYsJtPdH,More Than Words,70,334106,False,Extreme,6w7j5wQ5AI5OQYlcM15s2L,Extreme II - Pornograffitti,7DKHQxJTI32UyCdDdGwvRC,1990-01-01,...,0.0,0.114,0.249,91.769,more than words,extreme,./data/extreme-morethanwords.txt,True,Saying 'I love you' Is not the words I want ...,True
7,6NdcSEhpGGAYXNnnhGS2s6,What I Like About You,69,175400,False,The Romantics,3daM7asS0gCFvyLemNx2EE,What I Like About You (And Other Romantic Hits),5ZwUOFZdWQ81RYMwXc4j3B,1990,...,5.3e-05,0.171,0.92,159.81,what i like about you,the romantics,./data/theromantics-whatilikeaboutyou.txt,True,Hey Uh huh huh Hey Uh huh huh What I like a...,True
8,2SgbR6ttzoNlCRGQOKjrop,Cowboys From Hell,71,243533,True,Pantera,14pVkFUHDL207LzLHtSA18,Cowboys From Hell,5szY4sBOSD6IFjFN1RtWTe,1990-07-20,...,0.0345,0.082,0.437,114.816,cowboys from hell,pantera,./data/pantera-cowboysfromhell.txt,True,Under the lights where we stand tall Nobody ...,True
9,1B75hgRqe7A4fwee3g3Wmu,U Can't Touch This,73,257360,False,MC Hammer,2rblp9fJo16ZPTcKDtlmKW,Please Hammer Don't Hurt 'Em,4r1WecJyt5FOhglysp9zhN,1990-02-20,...,0.000339,0.0864,0.866,133.148,u can t touch this,mc hammer,./data/mchammer-ucanttouchthis.txt,True,"You can't touch this [5x] My, my, my, my mu...",True


In [13]:
# spacy entity recognition
def entityRecognition(text):
    """Run ``text`` through the ``nlp`` pipeline and return its entities as a set.

    The set holds the items of ``doc.ents`` themselves, not their text, so
    repeated mentions of the same text can remain as separate members.
    """
    return set(nlp(text).ents)

In [14]:
# Entities recognised in each row's lyrics (a set per row, via entityRecognition).
df['t_entities'] = df.apply(lambda x: entityRecognition(x['lyrics']), axis=1)

In [15]:
# Preview the first rows with the new t_entities column.
df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artist,artist_id,album,album_id,album_release_date,...,liveness,valence,tempo,p_name,p_artist,file_name,has_lyrics,lyrics,is_english,t_entities
0,57bgtoPSgt236HzfBOd8kj,Thunderstruck,82,292880,False,AC/DC,711MCceyCBcFnzjGY4Q7Un,The Razors Edge,4vu7F6h90Br1ZtYYaqfITy,1990-09-24,...,0.217,0.257,133.519,thunderstruck,ac dc,./data/acdc-thunderstruck.txt,True,Thunder [x10] I was caught In the middle of ...,True,"{(Broke), (Beating), (Texas), (Thunderstruck),..."
1,5pKJtX4wBeby9qIfFhyOJj,Don't Go Breaking My Heart,71,271133,False,Elton John,3PhoLpVuITZKcymswpck5b,To Be Continued...,7iKDBfOFUtg8E8RbuDhiux,1990-11-08,...,0.0685,0.779,131.312,don t go breaking my heart,elton john,./data/eltonjohn-dontgobreakingmyheart.txt,True,Don't go breaking my heart I couldn't if I t...,True,{(Baby)}
2,6gZVQvQZOFpzIy3HblJ20F,Man in the Box,72,284426,True,Alice In Chains,64tNsm6TnZe2zpcMVMOoHL,Facelift,5LbHbwejgZXRZAgzVAjkhj,1990,...,0.213,0.644,106.463,man in the box,alice in chains,./data/aliceinchains-maninthebox.txt,True,I'm the man in the box [Album version:] Buri...,True,"{(Jesus, Christ), (Jesus, Christ), (Jesus, Chr..."
3,6m59VvDUi0UQsB2eZ9wVbH,Poison,68,261853,False,Bell Biv DeVoe,2zFZiWQJFFshzojycnXoTL,Poison,6H5mxGUWguDjtQ4Uzd8veD,1990-01-01,...,0.612,0.803,111.848,poison,bell biv devoe,./data/bellbivdevoe-poison.txt,True,"[Michael Bivins:] Yeah, Spiderman and Freeze...",True,"{(Lookin), (YOU'LL), (Ralph, T), (1), (Ronnie,..."
4,63vL5oxWrlvaJ0ayNaQnbX,Istanbul,73,153813,False,They Might Be Giants,6zB02lwP6L6ZH32nggQiJT,Flood,7FwAtuhhWivxvK4aPgyyUD,1990-01-02,...,0.136,0.892,114.144,istanbul,they might be giants,./data/theymightbegiants-istanbul.txt,True,Istanbul was Constantinople Now it's Istanbu...,True,"{(Constantinople, Why), (Turks), (New, Amsterd..."
