In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

import json
from sagemaker.predictor import json_deserializer

<h1>FM Cloud Prediction Invocation Template</h1>
<h4>Invoke SageMaker Prediction Service</h4>

In [2]:
import boto3
import re
from sagemaker import get_execution_role
import sagemaker

In [3]:
# Acquire a realtime endpoint by name.
# NOTE: the endpoint must already be deployed; if it is not, the first predict()
# call fails with "ValidationError ... Endpoint fm-movie-v2 ... not found".
endpoint_name = 'fm-movie-v2'
predictor_sparse = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name)

In [4]:
# Feature dimension = number of unique users + number of unique movies in our dataset.
# Start at 0 so the name is defined before the file is read.
dim_movie = 0

# Overwrite with the value stored in the dimension file used for training
with open(r'ml-latest-small/movie_dimension.txt', 'r') as dimension_file:
    dim_movie = int(dimension_file.read())

In [5]:
# Show the dimension that was read from the file
print(dim_movie)

10334


In [6]:
def fm_sparse_serializer(data, dim=None):
    """Serialize sparse one-hot rows into the JSON request body for the endpoint.

    Each row lists the indices of the active features; every listed index is
    sent with the value 1.

    Args:
        data: Iterable of rows. Each row is a numpy array or any plain
            sequence (list/tuple) of integer feature indices.
        dim: Total width of the feature vector, reported as ``shape``.
            When omitted, the notebook-level ``dim_movie`` is used.

    Returns:
        str: JSON text shaped as
        ``{"instances": [{"data": {"features": {"keys": [...], "shape": [dim], "values": [...]}}}]}``.
    """
    if dim is None:
        # Looked up at call time so the notebook global can be (re)loaded after this def
        dim = dim_movie

    js = {'instances': []}
    for row in data:
        # np.asarray lets plain lists/tuples through as well as ndarrays;
        # .tolist() converts numpy integers to JSON-serializable Python ints.
        column_list = np.asarray(row).tolist()
        value_list = [1] * len(column_list)

        js['instances'].append(
            {'data': {'features': {'keys': column_list, 'shape': [int(dim)], 'values': value_list}}}
        )
    return json.dumps(js)

In [7]:
# Testing: serialize three sample rows of two feature indices each and inspect the JSON
print(fm_sparse_serializer([np.array([341,1416]),np.array([209,2640]),np.array([164,1346])]))

{"instances": [{"data": {"features": {"keys": [341, 1416], "shape": [10334], "values": [1, 1]}}}, {"data": {"features": {"keys": [209, 2640], "shape": [10334], "values": [1, 1]}}}, {"data": {"features": {"keys": [164, 1346], "shape": [10334], "values": [1, 1]}}}]}


In [8]:
# Initialize Predictor with correct configuration

In [9]:
# Requests are sent as JSON text built by fm_sparse_serializer;
# responses are decoded with the SDK's json_deserializer.
predictor_sparse.content_type = 'application/json'
predictor_sparse.serializer =  fm_sparse_serializer
predictor_sparse.deserializer = json_deserializer

In [10]:
# Test with data in libSVM format

In [11]:
# Path to the test file in svm format. Each line looks like '5 341:1 1416:1'
# (rating, then the user and movie one-hot indices, each followed by ':1').
test_file = r'ml-latest-small/user_movie_test.svm'

In [12]:
# Space-separated file; the three fields are named rating, user_index, movie_index
df_test = pd.read_csv(test_file, sep=' ', names=['rating','user_index','movie_index'])

In [13]:
# Preview the raw rows; the index columns still carry the ':1' suffix
df_test.head()

Unnamed: 0,rating,user_index,movie_index
0,4.0,561:1,2822:1
1,3.5,473:1,2600:1
2,4.5,361:1,2548:1
3,1.0,20:1,3582:1
4,5.0,468:1,2867:1


In [14]:
# update column to contain only the one hot encoded index ('561:1' -> 561).
# str() makes this cell safe to re-run: values that were already converted to
# integers pass through unchanged instead of raising AttributeError on .split.
df_test.user_index = df_test.user_index.map(lambda value: int(str(value).split(':')[0]))
df_test.movie_index = df_test.movie_index.map(lambda value: int(str(value).split(':')[0]))

In [15]:
# Preview again: the index columns now hold plain integers
df_test.head()

Unnamed: 0,rating,user_index,movie_index
0,4.0,561,2822
1,3.5,473,2600
2,4.5,361,2548
3,1.0,20,3582
4,5.0,468,2867


In [16]:
# (rows, columns) of the test set
df_test.shape

(30251, 3)

In [17]:
# For a large number of predictions, split the input data into chunks and
# query the prediction service once per chunk.
# array_split is convenient for specifying how many splits are needed.
def get_predictions(predictor, arr_features, num_splits=100):
    """Score feature rows in chunks and return the scores in input order.

    Args:
        predictor: Object whose ``predict(chunk)`` returns a dict with a
            ``'predictions'`` list of ``{'score': ...}`` entries.
        arr_features: Array of feature rows; it is split along the first axis.
        num_splits: Number of chunks passed to ``np.array_split``. Defaults
            to 100, the value that was previously hard-coded.

    Returns:
        list: One score per input row, in the same order as ``arr_features``.
    """
    predictions = []
    for chunk in np.array_split(arr_features, num_splits):
        # array_split yields empty chunks when there are fewer rows than splits
        if chunk.shape[0] == 0:
            continue
        # Progress trace: shape of the chunk about to be sent
        print(chunk.shape, end=' ')
        result = predictor.predict(chunk)
        predictions.extend(values['score'] for values in result['predictions'])
    return predictions

In [18]:
%time predictions = get_predictions(predictor_sparse, df_test[['user_index','movie_index']].as_matrix())

  if __name__ == '__main__':


(303, 2) 

ValidationError: An error occurred (ValidationError) when calling the InvokeEndpoint operation: Endpoint fm-movie-v2 of account 399426528351 not found.

In [19]:
# Attach the predicted scores to the test rows (get_predictions keeps input row order)
df_test['predictions'] = predictions

NameError: name 'predictions' is not defined

In [None]:
# Preview actual ratings next to the predictions
df_test.head()

In [None]:
import sklearn.metrics as metrics

In [None]:
# Root-mean-squared error: square root of the MSE between actual and predicted ratings
print('RMSE: ', metrics.mean_squared_error(df_test.rating, df_test.predictions)**.5)

In [None]:
# Test Data Residuals (df_test holds the test set, not the training data)
residuals = (df_test.predictions - df_test.rating)

# Histogram of the prediction errors
plt.hist(residuals)
plt.grid(True)
plt.xlabel('(Predicted - Actual)')
plt.ylabel('Count')
plt.title('Residuals Distribution')
# Green vertical reference line; axvline defaults to x=0, i.e. zero error
plt.axvline(color='g')

## Get Prediction for a single user and all movies

In [None]:
# Path to the file holding the one-hot encoded index values in svm format.
# NOTE: this reuses (overwrites) the test_file name set earlier in the notebook.
test_file = r'ml-latest-small/one_hot_enc_movies.svm'

In [None]:
# Read the space-separated file; the fields are movieId, user_index, movie_index
df_one_user_test = pd.read_csv(
    test_file,
    sep=' ',
    names=['movieId', 'user_index', 'movie_index'],
)
# Keep only the integer index in front of the ':' in each one-hot token
df_one_user_test['user_index'] = df_one_user_test['user_index'].map(
    lambda token: int(token.split(':')[0])
)
df_one_user_test['movie_index'] = df_one_user_test['movie_index'].map(
    lambda token: int(token.split(':')[0])
)

In [None]:
# Preview the parsed rows
df_one_user_test.head()

In [None]:
# Number of rows that will be scored
df_one_user_test.shape[0]

In [None]:
%time predictions = get_predictions(predictor_sparse, df_one_user_test[['user_index','movie_index']].as_matrix())

In [None]:
# Store the predicted rating for each row (get_predictions keeps input row order)
df_one_user_test['rating_predicted'] = predictions

In [None]:
# Preview the rows with their predicted ratings
df_one_user_test.head()

In [None]:
# Movie metadata; later cells use its movieId, title and genres columns
# and the per-genre flag columns (Action, Comedy, Drama).
df_movies = pd.read_csv(r'ml-latest-small/movies_genre.csv')

In [None]:
# Preview the movie metadata
df_movies.head()

In [None]:
# Join the movie metadata onto the predictions by movieId (merge defaults to an inner join).
# NOTE: this rebinds df_one_user_test; re-running the cell would merge the metadata a second time.
df_one_user_test = df_one_user_test.merge(df_movies, on='movieId')

In [None]:
# Preview the merged frame (predictions plus movie metadata)
df_one_user_test.head()

In [None]:
# Ten highest predicted ratings across all movies
(
    df_one_user_test
    .sort_values(by=['rating_predicted'], ascending=False)
    .loc[:, ['title', 'rating_predicted', 'genres']]
    .head(10)
)

In [None]:
# Any Action Movies? Ten highest predicted ratings among rows flagged Action == 1
(
    df_one_user_test
    .loc[df_one_user_test.Action == 1]
    .sort_values(by=['rating_predicted'], ascending=False)
    .loc[:, ['title', 'rating_predicted', 'genres']]
    .head(10)
)

In [None]:
# What about comedy? Ten highest predicted ratings among rows flagged Comedy == 1
(
    df_one_user_test
    .loc[df_one_user_test.Comedy == 1]
    .sort_values(by=['rating_predicted'], ascending=False)
    .loc[:, ['title', 'rating_predicted', 'genres']]
    .head(10)
)

In [None]:
# And Drama: ten highest predicted ratings among rows flagged Drama == 1
(
    df_one_user_test
    .loc[df_one_user_test.Drama == 1]
    .sort_values(by=['rating_predicted'], ascending=False)
    .loc[:, ['title', 'rating_predicted', 'genres']]
    .head(10)
)

In [None]:
# Re-score every movie for user index 178 and show that user's top ten.
# NOTE: user_index and rating_predicted in df_one_user_test are overwritten in place.
df_one_user_test.user_index = 178
# `.values` replaces the deprecated DataFrame.as_matrix() (removed in pandas 1.0)
predictions = get_predictions(predictor_sparse, df_one_user_test[['user_index','movie_index']].values)
df_one_user_test['rating_predicted'] = predictions
df_one_user_test.sort_values(['rating_predicted'], ascending=False)[['title','rating_predicted','genres']].head(10)

In [None]:
# Re-score every movie for user index 209 and show that user's top ten.
# NOTE: user_index and rating_predicted in df_one_user_test are overwritten in place.
df_one_user_test.user_index = 209
# `.values` replaces the deprecated DataFrame.as_matrix() (removed in pandas 1.0)
predictions = get_predictions(predictor_sparse, df_one_user_test[['user_index','movie_index']].values)
df_one_user_test['rating_predicted'] = predictions
df_one_user_test.sort_values(['rating_predicted'], ascending=False)[['title','rating_predicted','genres']].head(10)