In [1]:
import requests
import pandas as pd
import csv
import datetime
import dateutil
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta, FR
import json
from pathlib import Path
from difflib import SequenceMatcher
import operator
import re
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
# Scientific libraries
from numpy import arange,array,ones
from scipy import stats
import spacy
from spacy_langdetect import LanguageDetector
nlp = spacy.load('en')
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
with open('keys.json', 'r') as fp:
    our_keys = json.load(fp)
key = our_keys["musixmatch_key"]
plotly_key = our_keys["plotly_key"]
plotly.tools.set_credentials_file(username='jagluck', api_key=plotly_key)

In [2]:
#save songs
def save_songs(songs):
    if songs != {}:
        with open('data/songs.json', 'w') as fp:
            json.dump(songs, fp)
        
def load_songs():
    #load songs
    with open('data/songs.json', 'r') as fp:
        songs = json.load(fp)
        
    return songs
       
#compare similarity of two strings
def similar(a, b):
    sep = "feat"
    arest = a.split(sep, 1)[0]
    brest = b.split(sep, 1)[0]
    return SequenceMatcher(None, arest, brest).ratio()

#not currently using this
def get_lang(text):
    doc = nlp(text)
    # document level language detection. Think of it like average language of the document!
    if (doc._.language['language'] == 'es'):
        print("es")
        print(doc._.language)
        return 'es'
    elif (doc._.language['language'] == 'en'):
        print("en")
        print(doc._.language)
        return 'en'
  
    return 'X'

#get language/lyric information for a single track
def get_track(song, artist):
    songs = load_songs()
    search_url = 'http://api.musixmatch.com/ws/1.1/track.search'
    params = {'q_track' : song, 'page_size' : '30', 'page' : '1', 's_track_rating' : 'desc', 'apikey': key}
    resp = requests.get(search_url, params=params)
    tracks = json.loads(resp.text)['message']['body']['track_list']

    artist_results = {}

    index = 0
    for track in tracks:
        track_id = track['track']['track_id']
        artist_results[index] = similar(track['track']['artist_name'], artist)
        index = index + 1

    sorted_index = sorted(artist_results.items(), key=operator.itemgetter(1), reverse=True)
    
    if ((len(sorted_index) > 0) and (sorted_index[0][1] > .6)):
        winningIndex = sorted_index[0][0]
        print(tracks[winningIndex]['track'])
        track_id = tracks[winningIndex]['track']['track_id']
        commontrack_id = tracks[winningIndex]['track']['commontrack_id']
        print(track_id)

        search_url = 'http://api.musixmatch.com/ws/1.1/track.lyrics.get'
        params = {'track_id' : track_id, "commontrack_id" : commontrack_id, 'apikey': key}
        resp = requests.get(search_url, params=params)

        body = json.loads(resp.text)['message']['body']
        if ('lyrics' in body):
            lyrics = body['lyrics']
            lyrics_body  = lyrics['lyrics_body']
#             lyrics_language = lyrics['lyrics_language']

            id = (song + " by " + artist)
            song_info = {}
            song_info['song'] = song
            song_info['artist'] = artist
            song_info['lyrics_body'] = lyrics['lyrics_body'].replace('\n', ' ')[:-59]
#             song_info['lyrics_language'] = lyrics['lyrics_language']
            song_info['lyrics_language'] = get_lang(song_info['lyrics_body'])
            songs[id] = song_info
        else:
            print("empty resp")
            id = (song + " by " + artist)
            song_info = {}
            song_info['song'] = song
            song_info['artist'] = artist
            song_info['lyrics_body'] = "no lyrics"
            lg = "X"
            song_info['lyrics_language'] = lg
            songs[id] = song_info
    else:
        id = (song + " by " + artist)
        song_info = {}
        song_info['song'] = song
        song_info['artist'] = artist
        song_info['lyrics_body'] = "no lyrics"
        lg = "X"
        song_info['lyrics_language'] = lg
        songs[id] = song_info
        
    save_songs(songs)
       
#go through every song in a file
def get_songs(df, fileName):
    songs = load_songs()
    languages = []
    lyrics = []
    for songTitle, artist in zip(df['Track Name'], df['Artist']):
        songTitle = str(songTitle)
        id = (songTitle + ' by ' + artist) 
        print(id)
        #if we do not alreay have the songs language and info archived, then query for it
        if id not in songs:
            get_track(songTitle, artist)
        else:
            print("already have")
        songs = load_songs()
        songInfo = songs[id]
        lyrics.append(songInfo['lyrics_body'])
        languages.append(songInfo['lyrics_language'])
        
    #add new info to dataframe
    df['language'] = languages
    df['lyrics'] = lyrics
    
    df.to_csv(fileName)
    
#create urls and download files for a country and the last x weeks
def download_charts(country, weeks):
    
    #find last fridays date
    last_friday = datetime.now() + relativedelta(weekday=FR(-1))

    if (datetime.now().strftime('%Y-%m-%d') == last_friday.strftime('%Y-%m-%d')):
        last_friday = datetime.now() + relativedelta(weekday=FR(-1))

    files = []
    urls = []
    newer_date = last_friday - timedelta(days=7)
    
    #create uls by counting back a friday at a time
    for i in range(weeks):
        older_date = newer_date - timedelta(days=7)
        newer_str = newer_date.strftime('%Y-%m-%d')
        older_str = older_date.strftime('%Y-%m-%d')
        newer_date = older_date
        url = 'https://spotifycharts.com/regional/' + country + '/weekly/' + older_str + '--' + newer_str + '/download'
        file_name = "data/" + country + "/" + country + "_" + older_str + '--' + newer_str + '.csv'
#         print(url)
#         print(file_name)
        urls.append(url)
        files.append(file_name)
        
    #if we do not already have the file download it
    for file, url in zip(files,urls):
        my_file = Path(file)
        if not my_file.is_file():
            resp = requests.get(url)
            with open(file, 'w') as f:
                writer = csv.writer(f)
                reader = csv.reader(resp.text.splitlines())

                for row in reader:
                    writer.writerow(row)
                
    return files

#take file and add language/lyric info if needed
def add_language(files):

    for file in files:
        
        #load and reformat this file
        df = pd.read_csv(file,skiprows=[0])
        if (list(df) != ['Position', 'Track Name', 'Artist', 'Streams', 'URL']):
            df = pd.read_csv(file,skiprows=[])
            if 'Unnamed: 0' in list(df):
                df = df.drop('Unnamed: 0', axis=1)


        df.Artist = df.Artist.astype(str)
        
        #use api and add language/lyrics only if you do not already have them
        if ('language' not in list(df)):
            get_songs(df, file) 
            songs = load_songs()
            print(len(songs))
        
def get_data(country, weeks):
       
    #download chart files
    files = download_charts(country, weeks)   
    
    #add language information to files
    add_language(files)

    #analyze language for every week
    country = []
    week = []
    english_percent = []
    spanish_percent = []

    for file in files:
        df = pd.read_csv(file,skiprows=[0])
        if (list(df) != ['Position', 'Track Name', 'Artist', 'Streams', 'URL']):
            df = pd.read_csv(file,skiprows=[])
            if 'Unnamed: 0' in list(df):
                df = df.drop('Unnamed: 0', axis=1)


        df.Artist = df.Artist.astype(str)
        lang = {}
        for l in df['language']:
            if l in lang:
                lang[l] = lang[l] + 1
            else:
                lang[l] = 1

        eng = 0
        if "en" in lang:
            eng = lang['en']

        esp = 0
        if "es" in lang:
            esp = lang['es']

        total = eng + esp
        english_percent.append(round((eng/total),2))
        spanish_percent.append(round((esp/total),2))
        country.append(file[5:7])
        week.append(file[-14:][0:10])

    data = pd.DataFrame(
            {'country': country,
             'week': week,
             "english_percent": english_percent,
             "spanish_percent": spanish_percent
            })
    
    return data

In [3]:
us = get_data('us', 104)
us.sort_values(by=['week'], ascending=False)
us.head(104)

Unnamed: 0,country,week,english_percent,spanish_percent
0,us,2019-06-28,0.96,0.04
1,us,2019-06-21,0.96,0.04
2,us,2019-06-14,0.96,0.04
3,us,2019-06-07,0.96,0.04
4,us,2019-05-31,0.97,0.03
5,us,2019-05-24,0.96,0.04
6,us,2019-05-17,0.96,0.04
7,us,2019-05-10,0.96,0.04
8,us,2019-05-03,0.96,0.04
9,us,2019-04-26,0.96,0.04


In [4]:
do = get_data('do', 104)
do.sort_values(by=['week'], ascending=False)
do.head(104)

Unnamed: 0,country,week,english_percent,spanish_percent
0,do,2019-06-28,0.11,0.89
1,do,2019-06-21,0.10,0.90
2,do,2019-06-14,0.10,0.90
3,do,2019-06-07,0.10,0.90
4,do,2019-05-31,0.13,0.87
5,do,2019-05-24,0.13,0.87
6,do,2019-05-17,0.15,0.85
7,do,2019-05-10,0.15,0.85
8,do,2019-05-03,0.14,0.86
9,do,2019-04-26,0.14,0.86


In [5]:
mx = get_data('mx', 104)
mx.sort_values(by=['week'], ascending=False)
mx.head(104)

Unnamed: 0,country,week,english_percent,spanish_percent
0,mx,2019-06-28,0.21,0.79
1,mx,2019-06-21,0.21,0.79
2,mx,2019-06-14,0.22,0.78
3,mx,2019-06-07,0.20,0.80
4,mx,2019-05-31,0.19,0.81
5,mx,2019-05-24,0.22,0.78
6,mx,2019-05-17,0.22,0.78
7,mx,2019-05-10,0.23,0.77
8,mx,2019-05-03,0.22,0.78
9,mx,2019-04-26,0.22,0.78


In [6]:
mx = get_data('cr', 104)
mx.sort_values(by=['week'], ascending=False)
mx.head(104)

Unnamed: 0,country,week,english_percent,spanish_percent
0,cr,2019-06-28,0.29,0.71
1,cr,2019-06-21,0.29,0.71
2,cr,2019-06-14,0.29,0.71
3,cr,2019-06-07,0.28,0.72
4,cr,2019-05-31,0.28,0.72
5,cr,2019-05-24,0.30,0.70
6,cr,2019-05-17,0.32,0.68
7,cr,2019-05-10,0.32,0.68
8,cr,2019-05-03,0.33,0.67
9,cr,2019-04-26,0.34,0.66


In [7]:
ar = get_data('ar', 104)
ar.sort_values(by=['week'], ascending=False)
ar.head(104)

Unnamed: 0,country,week,english_percent,spanish_percent
0,ar,2019-06-28,0.16,0.84
1,ar,2019-06-21,0.14,0.86
2,ar,2019-06-14,0.13,0.87
3,ar,2019-06-07,0.15,0.85
4,ar,2019-05-31,0.15,0.85
5,ar,2019-05-24,0.16,0.84
6,ar,2019-05-17,0.17,0.83
7,ar,2019-05-10,0.18,0.82
8,ar,2019-05-03,0.20,0.80
9,ar,2019-04-26,0.21,0.79


In [8]:
gt = get_data('gt', 104)
gt.sort_values(by=['week'], ascending=False)
gt.head(104)

Unnamed: 0,country,week,english_percent,spanish_percent
0,gt,2019-06-28,0.16,0.84
1,gt,2019-06-21,0.17,0.83
2,gt,2019-06-14,0.18,0.82
3,gt,2019-06-07,0.16,0.84
4,gt,2019-05-31,0.18,0.82
5,gt,2019-05-24,0.18,0.82
6,gt,2019-05-17,0.20,0.80
7,gt,2019-05-10,0.18,0.82
8,gt,2019-05-03,0.20,0.80
9,gt,2019-04-26,0.21,0.79


In [9]:
co = get_data('co', 104)
co.sort_values(by=['week'], ascending=False)
co.head(104)

Unnamed: 0,country,week,english_percent,spanish_percent
0,co,2019-06-28,0.13,0.87
1,co,2019-06-21,0.15,0.85
2,co,2019-06-14,0.15,0.85
3,co,2019-06-07,0.13,0.87
4,co,2019-05-31,0.16,0.84
5,co,2019-05-24,0.17,0.83
6,co,2019-05-17,0.19,0.81
7,co,2019-05-10,0.18,0.82
8,co,2019-05-03,0.21,0.79
9,co,2019-04-26,0.23,0.77


In [10]:
glbl = get_data('global', 104)
glbl.sort_values(by=['week'], ascending=False)
glbl.head(104)

Unnamed: 0,country,week,english_percent,spanish_percent
0,gl,2019-06-28,0.81,0.19
1,gl,2019-06-21,0.80,0.20
2,gl,2019-06-14,0.81,0.19
3,gl,2019-06-07,0.82,0.18
4,gl,2019-05-31,0.82,0.18
5,gl,2019-05-24,0.84,0.16
6,gl,2019-05-17,0.82,0.18
7,gl,2019-05-10,0.83,0.17
8,gl,2019-05-03,0.83,0.17
9,gl,2019-04-26,0.84,0.16


In [32]:
import pandas as pd

def get_all_data(countries, go_back):
    #find last fridays date
    last_friday = datetime.now() + relativedelta(weekday=FR(-1))

    if (datetime.now().strftime('%Y-%m-%d') == last_friday.strftime('%Y-%m-%d')):
        last_friday = datetime.now() + relativedelta(weekday=FR(-1))

    all_pds = []
    files = []
    urls = []
    country = []
    week = []
    newer_date = last_friday - timedelta(days=7)

    #create uls by counting back a friday at a time
    for i in range(go_back):
        dfs = []
        first = True
        older_date = newer_date - timedelta(days=7)
        newer_str = newer_date.strftime('%Y-%m-%d')
        older_str = older_date.strftime('%Y-%m-%d')
        newer_date = older_date
        for c in countries:
            file_name = "data/" + c + "/" + c + "_" + older_str + '--' + newer_str + '.csv'
            df = pd.read_csv(file_name,skiprows=[0])
            if (list(df) != ['Position', 'Track Name', 'Artist', 'Streams', 'URL']):
                df = pd.read_csv(file_name,skiprows=[])
                if 'Unnamed: 0' in list(df):
                    df = df.drop('Unnamed: 0', axis=1)


            df.Artist = df.Artist.astype(str)
            dfs.append(df)
            if first:
                first = False
                country.append("all")
                week.append(file_name[-14:][0:10])
        whole=pd.concat(dfs)   
        all_pds.append(whole)  

    #analyze language for every week
    english_percent = []
    spanish_percent = []

    for df in all_pds:
        if (list(df) != ['Position', 'Track Name', 'Artist', 'Streams', 'URL']):
            if 'Unnamed: 0' in list(df):
                df = df.drop('Unnamed: 0', axis=1)

        lang = {}
        for l in df['language']:
            if l in lang:
                lang[l] = lang[l] + 1
            else:
                lang[l] = 1

        eng = 0
        if "en" in lang:
            eng = lang['en']

        esp = 0
        if "es" in lang:
            esp = lang['es']

        total = eng + esp
        english_percent.append(round((eng/total),2))
        spanish_percent.append(round((esp/total),2))

    data = pd.DataFrame(
            {'country': country,
             'week': week,
             "english_percent": english_percent,
             "spanish_percent": spanish_percent
            })
    
    return data

In [33]:
all_countries = get_all_data(["mx","gt","ar","do", "co"],104)
all_countries.head(104)
all_countries.to_csv("data/all_countries.csv")

In [29]:
# Create a trace
trace = go.Scatter(
    x = us['week'],
    y = us['spanish_percent']
)

data = [trace]

layout = dict(title = 'Percent of Spanish songs in Spotifys US weekly top 200 Chart',
              xaxis = dict(title = 'Week'),
              yaxis = dict(title = 'Percent of Spanish songs', tickformat = '.0%'),
              )

fig = dict(data=data, layout=layout)
py.iplot(figure_or_data=fig, filename='basic-line')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~jagluck/0 or inside your plot.ly account where it is named 'basic-line'


In [14]:
# Create a trace
trace = go.Scatter(
    x = do['week'],
    y = do['spanish_percent']
)

data = [trace]

layout = dict(title = 'Percent of Spanish songs in Spotifys Dominican Republic weekly top 200 Chart',
              xaxis = dict(title = 'Week'),
              yaxis = dict(title = 'Percent of Spanish songs', tickformat = '.0%'),
              )

fig = dict(data=data, layout=layout)
py.iplot(figure_or_data=fig, filename='basic-line')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~jagluck/0 or inside your plot.ly account where it is named 'basic-line'


In [15]:
# Create a trace
trace = go.Scatter(
    x = co['week'],
    y = co['spanish_percent']
)

data = [trace]

layout = dict(title = 'Percent of Spanish songs in Spotifys Colombia weekly top 200 Chart',
              xaxis = dict(title = 'Week'),
              yaxis = dict(title = 'Percent of Spanish songs', tickformat = '.0%'),
              )

fig = dict(data=data, layout=layout)
py.iplot(figure_or_data=fig, filename='basic-line')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~jagluck/0 or inside your plot.ly account where it is named 'basic-line'


In [16]:
# Create a trace
trace = go.Scatter(
    x = mx['week'],
    y = mx['spanish_percent']
)

data = [trace]

layout = dict(title = 'Percent of Spanish songs in Spotifys Mexico weekly top 200 Chart',
              xaxis = dict(title = 'Week'),
              yaxis = dict(title = 'Percent of Spanish songs', tickformat = '.0%'),
              )

fig = dict(data=data, layout=layout)
py.iplot(figure_or_data=fig, filename='basic-line')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~jagluck/0 or inside your plot.ly account where it is named 'basic-line'


In [17]:
# Create a trace
trace = go.Scatter(
    x = ar['week'],
    y = ar['spanish_percent']
)

data = [trace]

layout = dict(title = 'Percent of Spanish songs in Spotifys Argentina weekly top 200 Chart',
              xaxis = dict(title = 'Week'),
              yaxis = dict(title = 'Percent of Spanish songs', tickformat = '.0%'),
              )

fig = dict(data=data, layout=layout)
py.iplot(figure_or_data=fig, filename='basic-line')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~jagluck/0 or inside your plot.ly account where it is named 'basic-line'


In [18]:
# Create a trace
trace = go.Scatter(
    x = gt['week'],
    y = gt['spanish_percent']
)

data = [trace]

layout = dict(title = 'Percent of Spanish songs in Spotifys Guatemala weekly top 200 Chart',
              xaxis = dict(title = 'Week'),
              yaxis = dict(title = 'Percent of Spanish songs', tickformat = '.0%'),
              )

fig = dict(data=data, layout=layout)
py.iplot(figure_or_data=fig, filename='basic-line')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~jagluck/0 or inside your plot.ly account where it is named 'basic-line'


In [19]:
# Create a trace
trace = go.Scatter(
    x = glbl['week'],
    y = glbl['spanish_percent']
)

data = [trace]

layout = dict(title = 'Percent of Spanish songs in Spotifys Global weekly top 200 Chart',
              xaxis = dict(title = 'Week'),
              yaxis = dict(title = 'Percent of Spanish songs', tickformat = '.0%'),
              )

fig = dict(data=data, layout=layout)
py.iplot(figure_or_data=fig, filename='basic-line')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~jagluck/0 or inside your plot.ly account where it is named 'basic-line'


In [34]:
print(len(all_countries))

# Create a trace
trace = go.Scatter(
    x = all_countries['week'],
    y = all_countries['spanish_percent'],
    name='Spanish Percent',
)

xi = arange(0,len(all_countries))

# Generated linear fit
slope, intercept, r_value, p_value, std_err = stats.linregress(xi,all_countries['spanish_percent'])
line = slope*xi+intercept

trace2 = go.Scatter(
                  x=all_countries['week'],
                  y=line,
                  mode='lines',
                  marker=go.Marker(color='rgb(31, 119, 180)'),
                  name='Fit',
                  line = dict(
                    color = ('rgb(205, 12, 24)'),
                    width = 2)
                  )

data = [trace, trace2]

layout = dict(title = 'Percent of Spanish songs in Spotifys Combined Spanish Speaking Countries weekly top 200 Chart',
              xaxis = dict(title = ''),
              yaxis = dict(title = 'Percent of Spanish songs', tickformat = '.0%'),
              )

fig = dict(data=data, layout=layout)
py.iplot(figure_or_data=fig, filename='basic-line')

104
High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~jagluck/0 or inside your plot.ly account where it is named 'basic-line'
