# CS 155 Miniproject 2

## Karthik Karnik, Anvita Mishra, Kapil Sinha

## Matrix Factorization Visualizations

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)

from scipy.sparse.linalg import svds
from scipy.sparse import csc_matrix

from prob2utils_skeleton import train_model as normal_train_model, get_err as normal_get_err
from prob2utils_global_bias_skeleton import train_model as global_bias_train_model, get_err as global_bias_get_err

In [2]:
# encoding must be latin-1 to handle the format of this data
data = np.loadtxt('data/data.txt', delimiter='\t', encoding='latin-1')
# dtype must be None for the function to infer type based on the column
movies = np.genfromtxt('data/movies.txt', delimiter='\t', encoding='latin-1', dtype=None)

In [3]:
train = np.loadtxt('data/train.txt', delimiter='\t', encoding='latin-1')
test = np.loadtxt('data/test.txt', delimiter='\t', encoding='latin-1')

In [4]:
data_df = pd.DataFrame(data)
data_df.columns = ["User ID", "Movie ID", "Rating"]
movies_df = pd.DataFrame(movies)
movies_df.columns = ["Movie ID", "Movie Title", "Unknown", "Action", "Adventure", "Animation", \
                     "Childrens", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", \
                     "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
train_df = pd.DataFrame(train)
train_df.columns = ["User ID", "Movie ID", "Rating"]
test_df = pd.DataFrame(test)
test_df.columns = ["User ID", "Movie ID", "Rating"]

In [5]:
data_df.head()

Unnamed: 0,User ID,Movie ID,Rating
0,196.0,242.0,3.0
1,186.0,302.0,3.0
2,22.0,377.0,1.0
3,244.0,51.0,2.0
4,166.0,346.0,1.0


In [6]:
movies_df.head()

Unnamed: 0,Movie ID,Movie Title,Unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
# Avoid stupid indexing issues because Y values naturally would have been floats
Y_train = np.array(train, dtype=int)
Y_test = np.array(test, dtype=int)

In [8]:
M = max(max(Y_train[:,0]), max(Y_test[:,0])).astype(int) # users
N = max(max(Y_train[:,1]), max(Y_test[:,1])).astype(int) # movies

In [9]:
# For some reason, dtype=float is required
sparse_Y_train = csc_matrix((Y_train[:, 2], (Y_train[:, 0] - 1, Y_train[:, 1] - 1)), shape=(M, N), dtype=float)
sparse_Y_test = csc_matrix((Y_test[:, 2], (Y_test[:, 0] - 1, Y_test[:, 1] - 1)), shape=(M, N), dtype=float)

## Our Homework Code

In [10]:
def normal_model(Y_train, Y_test):
    M = max(max(Y_train[:,0]), max(Y_test[:,0])).astype(int) # users
    N = max(max(Y_train[:,1]), max(Y_test[:,1])).astype(int) # movies
    print("Factorizing with ", M, " users, ", N, " movies.")

    reg = 0.1 # arbitrary regularization constant
    eta = 0.03 # learning rate

    K = 20
    U,V, err = normal_train_model(M, N, K, eta, reg, Y_train)
    return U, V

In [11]:
Ut_normal, V_normal= normal_model(Y_train, Y_test)

Factorizing with  943  users,  1682  movies.


In [13]:
# you can pass in a reg constant for get_err, but we forgo this for consistency
print ('Error: ' + str(normal_get_err(Ut_normal, V_normal, Y_test)))

Error: 0.449379710529932


## Our Homework Code with global bias terms

In [14]:
def global_bias_model(Y_train, Y_test):
    M = max(max(Y_train[:,0]), max(Y_test[:,0])).astype(int) # users
    N = max(max(Y_train[:,1]), max(Y_test[:,1])).astype(int) # movies
    print("Factorizing with ", M, " users, ", N, " movies.")

    reg = 0.1 # arbitrary regularization constant
    eta = 0.03 # learning rate

    K = 20
    U, V, a, b, err = global_bias_train_model(M, N, K, eta, reg, Y_train)
    return U, V, a, b

In [15]:
Ut_global_bias, V_global_bias, a_bias, b_bias = global_bias_model(Y_train, Y_test)

Factorizing with  943  users,  1682  movies.


In [16]:
print ('Error: ' + str(global_bias_get_err(Ut_global_bias, V_global_bias, Y_test, a_bias, b_bias)))

Error: 0.4786573217055345


# Off the shelf SVD implementation

## SVD with scipy.sparse.linalg.svds

In [17]:
Ut_svds, S, V_svds = svds(sparse_Y_train, k = 20)

In [18]:
print ('Error: ' + str(normal_get_err(Ut_svds, V_svds.transpose(), Y_test)))
print ('Error: ' + str(normal_get_err(Ut_svds, np.matmul(np.diag(S), V_svds).transpose(), Y_test)))

Error: 6.8565008493622805
Error: 3.1310127156459457


In [20]:
A, S2, B = svds(V_svds, k = 2)

In [22]:
V_tilde = np.matmul(A.transpose(), V_svds)

In [23]:
V_tilde.transpose()

array([[-0.06771096, -0.03643044],
       [ 0.04638156, -0.00697047],
       [ 0.03252391, -0.02049579],
       ...,
       [ 0.0002737 ,  0.00022347],
       [-0.00026716, -0.00124309],
       [-0.00079197, -0.0006335 ]])

## Surprise!

In [24]:
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy

read = Reader(line_format=u'user item rating', sep='\t')
model = SVD(n_factors=20)
train_data = Dataset.load_from_file('data/data.txt', read)

test_data = Dataset.load_from_file('data/test.txt', read)

model.fit(train_data.build_full_trainset())

predict = model.test(test_data.build_full_trainset().build_testset())

err = accuracy.rmse(predict) ** 2

V_surprise = model.qi
U_surprise = model.pu

RMSE: 0.8424


In [25]:
print ('Error: ' + str(normal_get_err(U_surprise, V_surprise, Y_test)))

Error: 6.877905043295834
