# In [2]:
import json
import boto3
import numpy as np
from io import BytesIO
import scipy.sparse
from scipy import optimize
from itertools import combinations
import string

# Translation table that deletes every ASCII punctuation character. Built once
# at import time instead of once per tag.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _error_response(message):
    """Return a 400 response whose JSON body carries *message*."""
    return {
        'statusCode': 400,
        'body': json.dumps({'error': message})
    }


def _clean_tag(raw_tag):
    """Strip ASCII punctuation from *raw_tag* and lower-case it."""
    return raw_tag.translate(_PUNCTUATION_TABLE).lower()


def _load_similarity_data():
    """Download the sparse tag similarity matrix and its label index from S3."""
    client = boto3.client('s3')

    # Full tag cosine-similarity matrix (.npz). This version was built with
    # min_df = 50, so tags seen < 50 times in the MSD are not included.
    obj = client.get_object(Bucket='cs-matrices', Key='inverted_full_cs_matrx_df50.npz')
    array = scipy.sparse.load_npz(BytesIO(obj['Body'].read()))

    # Tag labels (.npy); position in this array is the row/column in `array`.
    # NOTE(security): allow_pickle=True unpickles the payload, which can run
    # arbitrary code. Only safe while the bucket contents are fully trusted.
    tag_npy_obj = client.get_object(Bucket='cs-matrices', Key='inverted_full_index_df50.npy')
    tag_npy = np.load(BytesIO(tag_npy_obj['Body'].read()), allow_pickle=True)
    return array, tag_npy


def _resolve_user_tags(requested_tags, tag_npy):
    """Return (tags, rows) for the requested tags that exist in the index.

    Unknown tags are reported and dropped; repeated tags keep only their
    first occurrence. Order of the surviving tags is preserved.
    """
    tags = []
    rows = []
    for tag in requested_tags:
        matches = np.where(tag_npy == tag)[0]
        if len(matches) == 0:
            print("Tag", tag, "not found. Ignoring")
            continue
        if tag in tags:
            continue
        tags.append(tag)
        rows.append(int(matches[0]))
    return tags, rows


def _select_display_rows(array, user_rows, displaycount):
    """Pick the matrix rows to display: the most similar tags plus all user tags."""
    # Sum the cosine similarity across user tags, giving a score for how close
    # every tag is to the cluster defined by the user's tags.
    joint_similarity = np.asarray(array[user_rows].sum(axis=0)).ravel()

    # Keep the request satisfiable: at least one slot per user tag, and never
    # more slots than there are tags (argpartition raises otherwise).
    displaycount = max(len(user_rows), min(displaycount, joint_similarity.size))
    display_rows = np.argpartition(joint_similarity, -displaycount)[-displaycount:]

    # A user tag can miss the top-k when several tags are combined; swap it in
    # for an entry that is not itself a user tag.
    user_row_set = set(user_rows)
    for user_row in user_rows:
        if user_row in display_rows:
            continue
        for pos in range(displaycount - 1, -1, -1):
            if display_rows[pos] not in user_row_set:
                print('replacing', display_rows[pos], 'with', user_row, 'in tag_index_inclusive')
                display_rows[pos] = user_row
                break
    return display_rows


def _place_user_tags(user_tags, user_dist):
    """Fix the user tags in the plane from their pairwise distances.

    The first tag sits at the origin, the second on the positive x axis and
    the third (if any) in the upper half-plane. `user_dist` maps a sorted
    pair of user-tag positions, e.g. (0, 1), to the distance between them.
    """
    coords = [[user_tags[0], [0, 0]]]
    if len(user_tags) > 1:
        d01 = user_dist[(0, 1)]
        coords.append([user_tags[1], [d01, 0]])
    if len(user_tags) > 2:
        d02 = user_dist[(0, 2)]
        d12 = user_dist[(1, 2)]
        if d01 > 0:
            # Law of cosines gives the signed projection onto the x axis, so
            # an obtuse angle at the origin correctly yields a negative x.
            x = (d01 ** 2 + d02 ** 2 - d12 ** 2) / (2 * d01)
            # 1 - cosine similarity is not a true metric, so the three
            # distances may not form a triangle; clamp to stay real-valued.
            x = max(-d02, min(x, d02))
        else:
            # First two tags coincide: any point at distance d02 will do.
            x = d02
        y = max(d02 ** 2 - x ** 2, 0.0) ** 0.5
        coords.append([user_tags[2], [x, y]])
    return coords


def _fit_adjacent_tags(adjacent_tags, anchors, distances):
    """Place the adjacent tags by least-squares fit to the target distances.

    `anchors` are the already fixed [name, [x, y]] entries of the user tags;
    `distances` holds (name_a, name_b, target_distance) for every pair that
    involves at least one adjacent tag. Returns [name, [x, y]] per adjacent
    tag, in the order of `adjacent_tags`.
    """
    if not adjacent_tags:
        return []

    slot = {tag: k for k, tag in enumerate(adjacent_tags)}
    fixed = {name: xy for name, xy in anchors}

    def locate(name, x):
        # Adjacent tags read their position from the optimisation vector,
        # which is interleaved as [x0, y0, x1, y1, ...].
        k = slot.get(name)
        if k is None:
            return fixed[name]
        return x[2 * k], x[2 * k + 1]

    def cumulative_distance(x):
        # Sum of squared differences between plotted and target distances.
        total = 0.0
        for first, second, target in distances:
            ax, ay = locate(first, x)
            bx, by = locate(second, x)
            total += (((ax - bx) ** 2 + (ay - by) ** 2) ** 0.5 - target) ** 2
        return total

    # Start every adjacent tag at the centroid of the user tags. np.tile keeps
    # the (x, y) interleaving that `locate` expects; np.repeat would not.
    centroid = [
        sum(xy[0] for _, xy in anchors) / len(anchors),
        sum(xy[1] for _, xy in anchors) / len(anchors),
    ]
    x0 = np.tile(centroid, len(adjacent_tags))
    res = optimize.minimize(cumulative_distance, x0, method='BFGS')

    return [
        [tag, [float(res.x[2 * k]), float(res.x[2 * k + 1])]]
        for k, tag in enumerate(adjacent_tags)
    ]


def lambda_handler(event, context):
    '''Use tag-level cosine similarity matrix (computed in SageMaker) to
    1. find tags nearest to those inputted by user and
    2. project both user-inputted and nearest tags in 2-D space

    Inputs: tag-level cosine similarity matrix; user-inputted tags
    queryStringParameters = tagcount=3&displaycount=6&tag1=tag1&tag2=tag2&
    tag3=tag3

    Returns a dict with 'statusCode' and a JSON 'body'. On success (200) the
    body is a flat list: the recognised user tags, then the adjacent tags,
    then one [tag, [x, y]] entry per tag. If none of the supplied tags is
    known the body is an empty list. Malformed requests (missing or
    non-integer counts, tagcount outside 1..3, missing tagN) return 400 with
    an {'error': ...} body and are rejected before any S3 access.
    `context` is unused.'''

    params = (event or {}).get('queryStringParameters') or {}

    # Validate the request before paying for the S3 downloads.
    try:
        tagcount = int(params['tagcount'])
        displaycount = int(params['displaycount'])
    except (KeyError, TypeError, ValueError):
        return _error_response("tagcount and displaycount must be integers")
    if not 1 <= tagcount <= 3:
        print("Please input between one and three tags")
        return _error_response("Please input between one and three tags")
    try:
        requested_tags = [_clean_tag(params['tag%d' % n]) for n in range(1, tagcount + 1)]
    except (KeyError, AttributeError):
        return _error_response("Expected parameters tag1..tag%d" % tagcount)

    array, tag_npy = _load_similarity_data()

    # Drop unknown and repeated tags; what remains are valid, distinct tags.
    user_inputted_tags, user_rows = _resolve_user_tags(requested_tags, tag_npy)

    if user_inputted_tags:
        display_rows = _select_display_rows(array, user_rows, displaycount)

        # Map indices back to labels (plain Python objects for JSON output).
        display_names = tag_npy[display_rows].tolist()
        user_position = {tag: k for k, tag in enumerate(user_inputted_tags)}

        # Adjacent tags in a stable, de-duplicated order.
        adjacent_tags = [
            name for name in dict.fromkeys(display_names)
            if name not in user_position
        ]

        # One dense slice of the displayed tags instead of a sparse lookup
        # per pair.
        similarity = array[display_rows][:, display_rows].toarray()

        # Split pairwise "distances" (1 - cosine similarity) into those
        # between two user tags, which fix the anchors, and the rest, which
        # drive the fit.
        user_dist = {}
        distances = []
        for i, j in combinations(range(len(display_names)), 2):
            first, second = display_names[i], display_names[j]
            gap = float(1 - similarity[i, j])
            if first in user_position and second in user_position:
                key = tuple(sorted((user_position[first], user_position[second])))
                user_dist[key] = gap
            else:
                distances.append((first, second, gap))

        coords = _place_user_tags(user_inputted_tags, user_dist)
        coords.extend(_fit_adjacent_tags(adjacent_tags, coords, distances))

        # Prepare for output
        output = user_inputted_tags + adjacent_tags + coords
    else:
        # Every supplied tag was unknown: nothing to project.
        output = user_inputted_tags

    return {
        'statusCode': 200,
        'body': json.dumps(output)
    }
