In [44]:
# Importing libraries
import numpy as np
import scipy as sp
import scipy.sparse
import csv
import bisect
import math
import pandas as pd

# Get data from here
# http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

In [45]:
## Helper functions for reading in data
# You don't need to worry about modifying these functions

def get_col_id_from_movie_id(j, movie_ids):
    """
    Convert MovieLens movie id to a column index in observation matrix.
    
    Args:
        j : MovieLens movie id
        movie_ids : List of all of the movie ids from dataset
        
    Returns:
        Position of j in movie_ids (assumed to be sorted).
    """
    res = bisect.bisect_left(movie_ids, j)
    assert movie_ids[res] == j
    return res

def get_movie_ids_names(movie_file):
    """
    Get list of movie ids and movie names from movies.csv.
    
    Args:
        movie_file : movies.csv from MovieLens database
        
    Returns:
        movie_ids : list of the movie ids
        movie_names : list of names of corresponding movies
    """
    movie_ids = []
    movie_names = []
    with open(movie_file, 'rb') as csvfile:
        # Skip the header file
        csvfile.readline()
        reader = csv.reader(csvfile, delimiter=',')
        for row in csv.reader(csvfile):
            movie_ids.append(int(row[0]))
            movie_names.append(row[1])
    return movie_ids, movie_names

def get_ratings_matrix(ratings_file, movie_ids):
    """
    Construct observation matrix.
    
    Args:
        ratings_file : ratings.csv from MovieLens database
        movie_ids : list of movie ids from database
        
    Returns:
        matrix A in CSC format such that A[i,j] is rating of 
        movie i by user j
    """
    i = []
    j = []
    v = []
    with open(ratings_file, 'rb') as csvfile:
        # Skip the header line
        csvfile.readline()
        reader = csv.reader(csvfile, delimiter=',')
        for row in csv.reader(csvfile):
            i.append(int(row[0])-1)
            j.append(get_col_id_from_movie_id(int(row[1]), movie_ids))
            v.append(float(row[2]))
    
    return sp.sparse.csc_matrix((v, (i, j)), shape=(max(i)+1, len(movie_ids)))

def recommend_movies(V, movie_id, movie_ids, movie_names, k):
    """
    Given a movie by MovieLens movie id, return the k most similar
    movies according to cosine similarity.
    
    Args:
        V : movie embedding matrix. Movie i is mapped to a point in
            Euclidean space by taking the i-th row of V.
        movie_id : MovieLens id of movie in question.
        movie_ids : list of all of the MovieLens ids
        k : Number of nearest neighbours to return.
        
    Return:
        Pandas dataframe of most similar movies to movie i.
    """
    col_ix = get_col_id_from_movie_id(movie_id, movie_ids)
    query = V[col_ix,:].T
    query_norm = np.linalg.norm(query)
    v_norms = np.linalg.norm(V,ord=2,axis=1)
    similarities = np.divide(V.dot(query), v_norms)/query_norm
    similarities = map(lambda x: -float('inf') if np.isnan(x) else x, similarities)
    
    sim_sorted = np.argsort(similarities)[::-1][:k]
    
    top_movies = [movie_names[i] for i in sim_sorted]
    top_score = [similarities[i] for i in sim_sorted]
    
    df = pd.DataFrame({'Movie': top_movies, 'Score': top_score})
    
    return df

In [46]:
# Computes RMSE (Root-Mean-Squared-Error) of embedding
#   RMSE^2 = sum_{(i,j) observed} |A_{ij} - U_{i,:}*V_{j,:}'|^2
# Feel free to speed up this function too! (although not required)
def rmse(A, U, V):
    """
    Compute Root-Mean-Squared-Error (RMSE) of movie embeddings (V)
    and row embeddings (U) according to:
        RMSE^2 = sum_{(i,j) observed} |A_{ij} - U_{i,:}*V_{j,:}'|^2
        
    Note: Feel free to speed up this function too! (but not required)
    
    Args:
        A : ratings matrix
        U : row (movie) embedding
        V : column (user) embedding
        
    Returns:
        RMSE as described above
    """
    rmse = 0
    A_coo = sp.sparse.coo_matrix(A)
    for k in range(len(A_coo.data)):
        i = A_coo.row[k]
        j = A_coo.col[k]
        rmse += (A_coo.data[k] - U[i,:].dot(V[j,:].T))**2
    return math.sqrt(rmse/len(A_coo.data))

In [4]:
def update_v_row(A, U, V, reg, j):
    """Update jth row of V"""
    r = U.shape[1]
    Aj = A.getcol(j)

    # Form Grammian U'*diag(Wj)*U + reg*I (see slides)
    ix = Aj.indices
    Usub = U[ix,:]
    UU = Usub.T.dot(Usub) + reg*np.eye(r)
    
    # Form rhs of normal equations
    Ua = U.T.dot(Aj.todense())
    return np.linalg.solve(UU, Ua).flatten()

def update_v(A, U, V, reg):
    """Update every row of V"""
    for j in range(V.shape[0]):
        V[j,:] = update_v_row(A, U, V, reg, j)

def update_u_row(A, U, V, reg, i):
    """Update ith row of U"""
    r = U.shape[1]
    Ai = A.getrow(i)

    # Form Grammian V'*diag(Wi)*V + reg*I (see slides)
    ix = Ai.indices
    Vsub = V[ix,:]
    VV = Vsub.T.dot(Vsub) + reg*np.eye(r)
    
    # Form rhs of normal equations
    Va = V.T.dot(Ai.todense().T)
    return np.linalg.solve(VV, Va).flatten()

def update_u(A, U, V, reg):
    """Updates every row of U"""
    for i in range(m):
        U[i,:] = update_u_row(A, U, V, reg, i)

def wals(A, U, V, reg, its):
    """
    Run the Weighted-Alternating-Least-Squares algorithm to minimize
        min_{U,V} |A - UV^T|_F^2 + (reg/2)*(|U|_F^2 + |V|_F^2)
        
    Args:
        A : ratings matrix (CSC format)
        U : initial U factor
        V : initial V factor
        reg : regularization parameter
        its : number of iterations to run
    """
    for _ in range(its):
        update_u(A, U, V, reg)
        update_v(A, U, V, reg)
    return U, V

In [47]:
## This cell constructs the ratings matrix in CSC format
DATA_DIRECTORY = '/home/restrin/Downloads/ml-latest-small/'
ratings_file = DATA_DIRECTORY + 'ratings.csv'
movie_file = DATA_DIRECTORY + 'movies.csv'

movie_ids, movie_names = get_movie_ids_names(movie_file)
A = get_ratings_matrix(ratings_file, movie_ids)

In [48]:
# Run Alternating Least-Squares
m,n = A.shape
r = 30
U = np.random.rand(m,r)
V = np.random.rand(n,r)
reg = 15.0
num_its = 15

U,V = wals(A, U, V, reg, num_its)
rmse(A, U, V)

0.8053543418799978

In [55]:
# Show nearest neighbours
movie_id = 5218 # Harry Potter and the Sorcerer's Stone
recommend_movies(V, movie_id, movie_ids, movie_names, 10)

Unnamed: 0,Movie,Score
0,Ice Age (2002),1.0
1,Troy (2004),0.935545
2,Rush Hour (1998),0.913945
3,Seabiscuit (2003),0.910433
4,"Hobbit: The Desolation of Smaug, The (2013)",0.90962
5,"Pelican Brief, The (1993)",0.907573
6,"Negotiator, The (1998)",0.906368
7,Star Trek: Insurrection (1998),0.906314
8,Remember the Titans (2000),0.904122
9,"League of Ordinary Gentlemen, A (2004)",0.903532


In [108]:
# Run profiler
import cProfile
cProfile.run('wals(A,U,V,reg,10)')

         20802284 function calls (20704324 primitive calls) in 39.151 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    91250    2.101    0.000   30.235    0.000 <ipython-input-4-593192249059>:1(update_v_row)
       10    0.407    0.041   30.643    3.064 <ipython-input-4-593192249059>:12(update_v)
     6710    0.279    0.000    8.465    0.001 <ipython-input-4-593192249059>:17(update_u_row)
       10    0.043    0.004    8.508    0.851 <ipython-input-4-593192249059>:28(update_u)
        1    0.000    0.000   39.151   39.151 <ipython-input-4-593192249059>:33(wals)
        1    0.000    0.000   39.151   39.151 <string>:1(<module>)
   194740    0.097    0.000    0.803    0.000 _methods.py:25(_amax)
   194740    0.077    0.000    0.390    0.000 _methods.py:28(_amin)
   104670    0.049    0.000    0.076    0.000 base.py:1081(isspmatrix)
   634730    0.318    0.000    0.948    0.000 base.py:181(nnz)
    97960    0.174    0.000  