In [None]:
import pymysql
import json
import random
from scipy.cluster.vq import vq, kmeans, whiten
from scipy.spatial.distance import cdist

import numpy as np

# AWS location string; not referenced anywhere else in the visible code.
# NOTE(review): 'us-east-1b' looks like an availability zone, not a region
# name ('us-east-1') — confirm before passing it to any AWS client.
region_name = 'us-east-1b'
# Module-level PyMySQL connection shared by openConnection() and
# lambda_handler(); stays None until openConnection() is first called.
conn = None
 
def openConnection():
    '''Open the shared module-level DB connection if it is not usable.

       Sets the global `conn` to a fresh PyMySQL connection when no
       connection exists yet or the existing one has been closed;
       otherwise leaves `conn` untouched. Returns nothing.

       For privacy, username, password, and DB names have been anonymized.
       If you would like to replicate this code, 
       you can fill those in with your own values'''
    
    global conn
    name = 'USERNAME'
    password = 'PASSWORD'
    db_name = 'DB_NAME'
    rds_host = 'DB_HOST_NAME'

    # (Re)connect only when there is no connection yet or the previous one
    # was closed. The host is passed by keyword because PyMySQL >= 1.0
    # no longer accepts positional arguments to connect().
    if conn is None or not conn.open:
        conn = pymysql.connect(
            host=rds_host, user=name, passwd=password, db=db_name,
            connect_timeout=3)

def _closest_to_centroid(vectors, track_ids, limit=11):
    '''Return up to `limit` track ids, nearest to the cluster centroid first.

       vectors:   list of equal-length numeric rows, one per song
       track_ids: list of ids, parallel to `vectors`
       limit:     maximum number of ids to return'''
    
    #nothing to cluster with zero or one song; hand back what we have
    if len(vectors) <= 1:
        return list(track_ids[:limit])

    #np.array always yields a 2-D (songs x features) matrix, so scipy never
    #mistakes the features of a lone row for separate observations
    to_cluster = np.array(vectors, dtype=float)

    #standardize vectors before running kmeans
    to_cluster = whiten(to_cluster)
    #run kmeans with a single cluster to obtain the centroid
    centroid = kmeans(to_cluster, 1, iter=100)[0]
    #and calculate the distance of each point to the centroid
    dist = cdist(to_cluster, centroid).flatten()

    #argsort orders by distance and works for any number of rows, unlike
    #argpartition(dist, 11), which fails when there are exactly 11 rows
    return [track_ids[i] for i in np.argsort(dist)[:limit]]


def lambda_handler(event, context):
    '''Query RDS, perform k-means (n_clusters = 1), find songs closest to centroid
    
       Inputs: event from API Gateway containing queryStringParameters.
               queryStringParameters = python dict with arbitrary number of tags
               (may be missing or None when the request has no query string)
               context is unused.

       Returns: dict with 'statusCode' 200 and 'body' holding a JSON list of
                up to 11 Spotify ids, closest to the centroid first. The list
                is empty when no usable tags were supplied or no songs match.'''
    
    #retrieve tags from event string; tolerate a missing/None query string
    params = event.get('queryStringParameters') or {}
    
    #for testing purposes, can swap in hardcoded tags such as
    ##['pop','90s','rock']
    tags = [x.strip().lower() for x in params.values()]
    #blank tags would create an empty regex alternative, so drop them
    tags = [x for x in tags if x]
    if not tags:
        return {
            'statusCode': 200,
            'body': json.dumps([])
        }
    
    #we're going to be looking up the tags as a string in this format:
    ##'tag_1|tag_2|...|tag_n'
    #NOTE: MySQL still interprets this as a regular expression, so regex
    ##metacharacters inside a tag are not matched literally
    tag_pattern = '|'.join(tags)
    
    #initialize responses from RDS and the spotify_id of songs returned
    ##responses will be LDA vectors from INFERENCE table
    response_l = []
    track_order = []

    #call function to connect to RDS
    openConnection()
    try:
        with conn.cursor() as cur:
            #subquery retrieves all songs with those tags (union of all sets)
            #main query retrieves LDA vectors of those songs
            #for performance purposes, limit to max 1000 random songs
            #the pattern is bound as a parameter so user input is never
            ##spliced into the SQL text
            cur.execute(
                """select t1.*, t2.SPOTIFYID 
            from INFERENCE t1
            INNER JOIN 
                (select SONG_ID, SPOTIFYID 
                from TRACKS_WITH_TAGS 
                WHERE TAGS REGEXP %s AND SPOTIFYID <> ""
                ORDER BY RAND()
                LIMIT 1000) t2
            ON t1.SONG_ID = t2.SONG_ID""",
                (tag_pattern,))
            for row in cur:
                #put responses in the right place
                ##first 24 columns are used as the vector; column 26 is taken
                ##as the id (presumably the trailing SPOTIFYID - verify
                ##against the INFERENCE schema)
                response_l.append(row[:24])
                track_order.append(row[26])
    finally:
        #always release the connection, even if the query fails
        conn.close()
    
    #return the songs back in order of their distance from centroid
    #we will call this the playlist
    playlist = _closest_to_centroid(response_l, track_order)

    return {
        'statusCode': 200,
        'body': json.dumps(playlist)
    }
