In [30]:
import sys
import random
from rec2vec import graph
from gensim.models import Word2Vec
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from time import time
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error

In [31]:
def predict_rating(model, nodedict, user, movie):
    """
    Predict the rating a user would give a movie.

    Every movie is represented by five movie-rating nodes, looked up in
    ``nodedict`` under the keys ``movie + "_1"`` ... ``movie + "_5"``. The
    prediction is the rating whose node is most similar to the user's node
    according to the embedding model.

    Parameters
    ----------
    model : trained embedding model
        Must provide ``similarity(token_a, token_b)``, either directly or
        through a ``wv`` attribute (as gensim's ``Word2Vec`` does).
    nodedict : mapping
        Maps node keys to node objects exposing an ``id`` attribute; the
        stringified ``id`` is the token used to query the model.
    user : str
        Key of the user node in ``nodedict``.
    movie : str
        Key prefix of the movie; ``"_<rating>"`` is appended to address each
        movie-rating node.

    Returns
    -------
    int
        Rating in 1-5. When several ratings tie for the highest similarity,
        the highest tied rating is returned.

    Raises
    ------
    KeyError
        If the user node or any of the five movie-rating nodes is missing
        from ``nodedict``. Errors raised by the model's own lookup propagate
        unchanged.
    """
    # gensim deprecated Word2Vec.similarity in favour of Word2Vec.wv.similarity;
    # prefer the keyed-vectors object when present, otherwise use the model itself.
    vectors = getattr(model, "wv", model)
    user_token = str(nodedict[user].id)

    best_rating = 1
    best_similarity = None
    for rating in range(1, 6):
        rating_token = str(nodedict[movie + "_" + str(rating)].id)
        similarity = vectors.similarity(user_token, rating_token)
        # ">=" lets a later (higher) rating win ties.
        if best_similarity is None or similarity >= best_similarity:
            best_rating = rating
            best_similarity = similarity

    return best_rating

In [32]:
# Build the node lookup used for prediction. Later cells index it by node key
# ("u<user>", "m<movie>_<rating>") and read each value's `.id` attribute.
nodedict = graph.records_to_graph()

In [33]:
# Load the graph from the adjacency-list file "out.adj", treating edges as undirected.
# NOTE(review): "out.adj" must already exist on disk — confirm which step writes it.
G = graph.load_adjacencylist("out.adj", undirected=True)

In [34]:
# Generate the random-walk corpus fed to Word2Vec. The RNG is seeded with 0 so
# the walks are repeatable.
# NOTE(review): the two positional 2s are presumably the number of walks per
# node and the walk length — confirm against rec2vec.graph.build_deepwalk_corpus.
embed = graph.build_deepwalk_corpus(G, 2, 2, rand=random.Random(0))

In [35]:
# Train node embeddings on the walk corpus: 64-dimensional vectors, context
# window of 5, min_count=0 so no token is dropped for being rare, 4 worker threads.
model = Word2Vec(embed, size=64, window=5, min_count=0, workers=4)

In [36]:
# Score the embedding model on the test ratings file.
with open("./data/test_user_ratings.dat") as fin:
    # Skip the first line (presumably a header row). The builtin next() works
    # on both Python 2.6+ and Python 3, unlike the Python-2-only fin.next().
    next(fin)
    # Keep the first three tab-separated fields of every non-blank record:
    # (user, movie, rating). Blank lines would otherwise fail at g[2] below.
    groundtruth = [line.strip().split("\t")[:3] for line in fin if line.strip()]

# True ratings, rounded to integers so they are comparable with the integer predictions.
tr = [int(round(float(g[2]))) for g in groundtruth]
# Predicted ratings; node keys are the raw ids prefixed with "u" (user) / "m" (movie).
pr = [predict_rating(model, nodedict, "u"+str(g[0]), "m"+str(g[1])) for g in groundtruth]

# Parenthesised single-argument print behaves identically on Python 2 and 3.
print("MSE = %f" % mean_squared_error(tr, pr))
print("accuracy = %f" % accuracy_score(tr, pr))
# Rows are true ratings and columns are predicted ratings, both ordered 1..5.
cm = confusion_matrix(tr, pr, labels=list(range(1, 6)))
print(cm)

MSE = 3.460930
accuracy = 0.197824
[[  4   2   9   3   3]
 [ 13  12  16  12  11]
 [ 36  50  53  60  40]
 [ 79  88  96  85 102]
 [ 56  45  48  42  46]]
