# In [1]:  (notebook cell marker, kept as a comment so the file parses as Python)
from io import StringIO
import boto3
from botocore.client import Config
import requests
import base64
import six
import six.moves.urllib.parse as urllibparse
import time
import numpy as np
import pandas as pd
#NOTE: pandas is needed below both to read the artist CSV files and to build the output DataFrames

# Read the Spotify client id (first line) and client secret (second line).
# The context manager closes the file even if reading fails, and strip()
# removes the line terminator from both values: previously the secret kept a
# trailing newline whenever the file ended with one.
with open('spotify_client_credentials.txt', 'r') as credentials:
    id_secret = credentials.readlines()
client_id = id_secret[0].strip()
client_secret = id_secret[1].strip()

#ad-hoc basic: the artist lists are not refreshed continuously for now, so
#they are taken from previously exported csv files
def _load_artist_names(csv_path, column):
    """Return the unique, lower-cased names found in one column of a CSV file.

    Args:
        csv_path: Path of the CSV file to read.
        column: Name of the column holding the artist names.

    Returns:
        A list of lower-cased strings, one per unique non-missing value.
        Missing values are dropped so they do not turn into the literal
        name ``'nan'``.
    """
    names = pd.read_csv(csv_path)[column].dropna().unique()
    return [str(name).lower() for name in names]


# NOTE(review): file names are kept exactly as before; 'TM_ArtistDetails.csv'
# feeding the StubHub list may be a mix-up with Ticketmaster -- confirm.
stubhub_artist_list = _load_artist_names('TM_ArtistDetails.csv', 'Artist')
ticketmaster_artist_list = _load_artist_names('events_perf_2018_09_10.csv', 'performer_name')
seatgeek_artist_list = _load_artist_names('SeatGeek_eventsDF_9_11_2018.csv', 'performer')

# Union of the three sources with duplicates removed (order is not preserved).
artist_names = list(set(stubhub_artist_list + ticketmaster_artist_list + seatgeek_artist_list))

def _make_authorization_headers(client_id, client_secret):
    auth_header = base64.b64encode(six.text_type(client_id + ':' + client_secret).encode('ascii'))
    return 'Basic %s' % auth_header.decode('ascii')

# Request an access token using the 'client_credentials' grant type.
# The timeout keeps the script from hanging forever on a stalled connection.
post_token = requests.post(
    'https://accounts.spotify.com/api/token',
    {'grant_type': 'client_credentials'},
    headers={'Authorization': _make_authorization_headers(client_id, client_secret)},
    timeout=30,
)
# Surface a rejected request as an HTTP error instead of a confusing
# KeyError on 'access_token' below.
post_token.raise_for_status()
#the response is json that includes the access token

token = post_token.json()['access_token']
#extract token from json

def get_artist_ids(artist_names):
    """Look up the Spotify id of the first search hit for each artist name.

    Uses the module-level ``token`` for authorization.

    Args:
        artist_names: Iterable of artist names to search for.

    Returns:
        A list of Spotify artist ids in input order. Names whose request
        fails, whose response is not the expected JSON, or that have no
        search results are skipped, so the list may be shorter than the input.
    """
    artist_ids = []
    for artist in artist_names:
        try:
            response = requests.get(
                'https://api.spotify.com/v1/search',
                headers={'Authorization': 'Bearer ' + token},
                params={'q': artist, 'type': 'artist'},
            )
            # Only the first (top) hit of the search is used.
            artist_id = response.json()['artists']['items'][0]['id']
        except (requests.RequestException, ValueError, KeyError, IndexError):
            # Network failure, non-JSON body, error payload or empty result:
            # skip this name but let real bugs and Ctrl-C propagate.
            continue
        artist_ids.append(artist_id)
    return artist_ids

#resolve every collected artist name to a Spotify artist id

artist_ids = get_artist_ids(artist_names)
#the flat list is split below because only 50 artists can be grabbed at a time

artist_ids = [artist_ids[start:start + 50] for start in range(0, len(artist_ids), 50)]
#artist_ids is now a list of sublists, each holding at most 50 ids

#Spotify API Call to Get Artist Info

token = post_token.json()['access_token']
#NOTE: this re-reads the token from the stored token response; it does not request a fresh one

def get_artist_info(artist_ids):
    """Fetch artist objects from Spotify, one request per sublist of ids.

    The ids of each sublist are sent together as a comma-separated ``ids``
    parameter, so the batching into sublists actually reduces the number of
    API calls (previously one request was issued per single id). Uses the
    module-level ``token`` for authorization.

    Args:
        artist_ids: List of sublists of Spotify artist ids. The endpoint
            accepts at most 50 ids per call, so each sublist should hold
            no more than 50 ids.

    Returns:
        A list of dictionaries. Every artist entry returned by the API is
        wrapped as ``{'artists': [entry]}`` -- the same shape a single-id
        request produces -- so consumers reading ``item['artists'][0]`` keep
        working. A payload without an ``'artists'`` list (for example an
        error body) is appended unchanged. Sublists whose request fails or
        whose body is not JSON are skipped.
    """
    artist_data = []
    for sublist in artist_ids:
        if not sublist:
            continue  # nothing to request for an empty batch
        # Pause between batches (presumably to stay under API rate limits).
        time.sleep(3)
        try:
            payload = requests.get(
                'https://api.spotify.com/v1/artists',
                headers={'Authorization': 'Bearer ' + token},
                params={'ids': ','.join(sublist)},
            ).json()
        except (requests.RequestException, ValueError):
            # Network failure or non-JSON body: skip this batch, but let
            # real bugs and Ctrl-C propagate.
            continue
        artists = payload.get('artists') if isinstance(payload, dict) else None
        if isinstance(artists, list):
            # Re-wrap each artist so the output keeps one entry per artist.
            artist_data.extend({'artists': [artist]} for artist in artists)
        else:
            artist_data.append(payload)
    return artist_data

#pass the list of artist id sublists built above into get_artist_info
#it uses the token to request artist info and returns the parsed JSON as a list of dictionaries

artist_data = get_artist_info(artist_ids)

#Grab what we want from the JSON (and convert to a DataFrame if needed)

#make a list of dictionaries of only the data that we want
def build_artist_df(artist_data):
    """Extract the wanted fields from each Spotify artist response.

    Despite the name, this returns a list of plain dictionaries (ready to be
    passed to ``pd.DataFrame``), not a DataFrame.

    Args:
        artist_data: List of parsed JSON responses; only the first entry of
            each response's ``'artists'`` list is read.

    Returns:
        A list with one dictionary per response, in input order. Each
        dictionary holds whichever of the keys ``url``, ``followers``,
        ``genres``, ``id``, ``image_url``, ``artist`` and ``popularity``
        could be extracted; fields that are absent are simply left out, so a
        response with no usable artist yields an empty dictionary.
    """
    # (output key, path to follow inside the artist object)
    fields = (
        ('url', ('external_urls', 'spotify')),   # spotify URL
        ('followers', ('followers', 'total')),   # spotify followers
        ('genres', ('genres',)),                 # genres
        ('id', ('id',)),                         # spotify id
        ('image_url', ('images', 0, 'url')),     # first image url
        ('artist', ('name',)),                   # artist name
        ('popularity', ('popularity',)),         # spotify popularity index
    )

    output = []

    for response in artist_data:
        row = {}
        for key, path in fields:
            try:
                value = response['artists'][0]
                for step in path:
                    value = value[step]
            except (KeyError, IndexError, TypeError):
                # Missing key, empty list, or a null/non-container value:
                # leave this field out of the row.
                continue
            row[key] = value
        output.append(row)

    return output

spotify_artist_info = build_artist_df(artist_data)

artist_df = pd.DataFrame(spotify_artist_info)

lst_col = 'genres'

# Keep only rows that actually carry a genre list. build_artist_df leaves the
# field out when it cannot be extracted, which shows up here as a missing
# value and previously broke the per-genre expansion.
genre_df = artist_df[['artist', lst_col]]
genre_df = genre_df[genre_df[lst_col].notnull()]

artist_df = artist_df.drop([lst_col], axis=1)

# Create one (artist, genre) row per genre. Artists with an empty genre list
# contribute no rows, and an empty input yields an empty two-column frame.
genre_df = pd.DataFrame(
    [(artist, genre)
     for artist, genres in zip(genre_df['artist'], genre_df[lst_col])
     for genre in genres],
    columns=['artist', lst_col],
)

# Read the S3 credentials. The context manager closes the file even if
# reading fails.
with open('s3_credentials.txt', 'r') as s3_credentials:
    ACCESS_SECRET = s3_credentials.readlines()
# The key is taken from the second line starting at column 20 and the secret
# from the third line starting at column 24 (presumably skipping
# 'aws_access_key_id = ' / 'aws_secret_access_key = ' prefixes -- confirm
# against the credentials file). strip() removes the line terminator from
# both; previously the secret kept a trailing newline when one was present.
ACCESS_KEY = ACCESS_SECRET[1][20:].strip()
SECRET_KEY = ACCESS_SECRET[2][24:].strip()

BUCKET_NAME = 'nycdsa.ta-am'

s3 = boto3.resource(
    's3',
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
    config=Config(signature_version='s3v4')
)

# Serialize each table to CSV in memory and upload it as an object.
artist_csv_buffer = StringIO()
artist_df.to_csv(artist_csv_buffer, index=False)

s3.Bucket(BUCKET_NAME).put_object(Key='Spotify_Artist_Table.csv', Body=artist_csv_buffer.getvalue())

genre_csv_buffer = StringIO()
genre_df.to_csv(genre_csv_buffer, index=False)

s3.Bucket(BUCKET_NAME).put_object(Key='Spotify_Genre_Table.csv', Body=genre_csv_buffer.getvalue())
#add date to filename if copying this code for the other sites

print('Uploaded to S3')

# Output: Uploaded to S3
