# CS 155 Miniproject 2

## Karthik Karnik, Anvita Mishra, Kapil Sinha

## Matrix Factorization Visualizations

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)

from scipy.sparse.linalg import svds
from scipy.sparse import csc_matrix

from prob2utils_skeleton import train_model as normal_train_model
from prob2utils_global_bias_skeleton import train_model as global_bias_train_model

In [2]:
# encoding must be latin-1 to handle the format of this data
data = np.loadtxt('data/data.txt', delimiter='\t', encoding='latin-1')
# dtype must be None for the function to infer type based on the column
movies = np.genfromtxt('data/movies.txt', delimiter='\t', encoding='latin-1', dtype=None)

In [3]:
train = np.loadtxt('data/train.txt', delimiter='\t', encoding='latin-1')
test = np.loadtxt('data/test.txt', delimiter='\t', encoding='latin-1')

In [4]:
data_df = pd.DataFrame(data)
data_df.columns = ["User ID", "Movie ID", "Rating"]
movies_df = pd.DataFrame(movies)
movies_df.columns = ["Movie ID", "Movie Title", "Unknown", "Action", "Adventure", "Animation", \
                     "Childrens", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", \
                     "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
train_df = pd.DataFrame(train)
train_df.columns = ["User ID", "Movie ID", "Rating"]
test_df = pd.DataFrame(test)
test_df.columns = ["User ID", "Movie ID", "Rating"]

In [5]:
data_df.head()

Unnamed: 0,User ID,Movie ID,Rating
0,196.0,242.0,3.0
1,186.0,302.0,3.0
2,22.0,377.0,1.0
3,244.0,51.0,2.0
4,166.0,346.0,1.0


In [6]:
movies_df.head()

Unnamed: 0,Movie ID,Movie Title,Unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [8]:
# Avoid stupid indexing issues because Y values naturally would have been floats
Y_train = np.array(train, dtype=int)
Y_test = np.array(test, dtype=int)

In [9]:
M = max(max(Y_train[:,0]), max(Y_test[:,0])).astype(int) # users
N = max(max(Y_train[:,1]), max(Y_test[:,1])).astype(int) # movies

In [10]:
# For some reason, dtype=float is required
sparse_Y_train = csc_matrix((Y_train[:, 2], (Y_train[:, 0] - 1, Y_train[:, 1] - 1)), shape=(M, N), dtype=float)
sparse_Y_test = csc_matrix((Y_test[:, 2], (Y_test[:, 0] - 1, Y_test[:, 1] - 1)), shape=(M, N), dtype=float)

## Our Homework Code

In [11]:
def normal_model(Y_train, Y_test):
    M = max(max(Y_train[:,0]), max(Y_test[:,0])).astype(int) # users
    N = max(max(Y_train[:,1]), max(Y_test[:,1])).astype(int) # movies
    print("Factorizing with ", M, " users, ", N, " movies.")

    reg = 0.1 # arbitrary regularization constant
    eta = 0.03 # learning rate

    K = 20
    U,V, err = normal_train_model(M, N, K, eta, reg, Y_train)
    return U, V

In [12]:
Ut_normal, V_normal= normal_model(Y_train, Y_test)

Factorizing with  943  users,  1682  movies.


In [13]:
X_pred_normal = np.dot(Ut_normal, V_normal.transpose())
print ('Error: ' + str(rmse(X_pred_normal, sparse_Y_test.toarray())))

Error: 0.945746392600535


## Our Homework Code with global bias terms

In [14]:
def global_bias_model(Y_train, Y_test):
    M = max(max(Y_train[:,0]), max(Y_test[:,0])).astype(int) # users
    N = max(max(Y_train[:,1]), max(Y_test[:,1])).astype(int) # movies
    print("Factorizing with ", M, " users, ", N, " movies.")

    reg = 0.1 # arbitrary regularization constant
    eta = 0.03 # learning rate

    K = 20
    U, V, a, b, err = global_bias_train_model(M, N, K, eta, reg, Y_train)
    return U, V, a, b

In [15]:
Ut_global_bias, V_global_bias, a_bias, b_bias = global_bias_model(Y_train, Y_test)

Factorizing with  943  users,  1682  movies.


In [73]:
X_pred_global_bias = np.dot(Ut_global_bias, V_global_bias.transpose())
print ('Error: ' + str(rmse(X_pred_global_bias, sparse_Y_test.toarray())))

Error: 1.0758928657536277


# Off the shelf SVD implementation

## SVD with scipy.sparse.linalg.svds

In [47]:
Ut, S, V = svds(sparse_Y_train, k = 20)

In [58]:
s_diag_matrix=np.diag(S)
X_pred = np.dot(np.dot(Ut, s_diag_matrix), V)
print ('Error: ' + str(rmse(X_pred, sparse_Y_test.toarray())))

Error: 2.50240393048203


In [19]:
Ut

array([[-0.0294572 , -0.02872727,  0.09958267, ...,  0.00532886,
        -0.00646452, -0.06858009],
       [ 0.02126707,  0.01648492,  0.00031876, ..., -0.05162523,
         0.04947412, -0.01456516],
       [-0.01676204, -0.01507433,  0.00460181, ..., -0.02484103,
         0.02738109, -0.00619795],
       ...,
       [ 0.04468681, -0.0022922 ,  0.01187261, ..., -0.00641867,
         0.02807787, -0.00839252],
       [ 0.01665304, -0.02876654,  0.00460046, ..., -0.02251532,
        -0.0081022 , -0.02389851],
       [-0.07882847,  0.00834073, -0.02471898, ...,  0.06118871,
         0.00675117, -0.04082959]])

In [20]:
V

array([[-1.03083330e-02,  9.79796043e-03, -5.17887985e-02, ...,
        -9.21480226e-04,  1.40381080e-03, -3.76491445e-03],
       [ 4.58842540e-02,  3.29048301e-02, -8.13820678e-03, ...,
        -1.33031796e-04, -1.54253189e-03, -1.04975515e-03],
       [ 1.50523556e-02,  2.81282526e-03, -1.10328026e-02, ...,
        -1.09347307e-04,  1.27155574e-03,  1.41332310e-03],
       ...,
       [ 1.47819672e-02,  6.43509404e-02,  1.13746252e-02, ...,
        -5.94434363e-04,  6.60396035e-04,  3.22627259e-04],
       [ 9.26171052e-02,  3.70110538e-03,  2.70581195e-02, ...,
         5.10081390e-04, -8.18333632e-05, -3.12961813e-04],
       [-9.78700296e-02, -3.53071809e-02, -1.92372756e-02, ...,
        -3.50995852e-05, -3.76607079e-04, -3.60137496e-04]])

In [21]:
Ut.shape, S.shape, V.shape

((943, 20), (20,), (20, 1682))

In [22]:
A, S2, B = svds(V, k = 2)

In [23]:
V_tilde = np.matmul(A.transpose(), V)

In [24]:
V_tilde.transpose()

array([[-0.06771096, -0.03643044],
       [ 0.04638156, -0.00697047],
       [ 0.03252391, -0.02049579],
       ...,
       [ 0.0002737 ,  0.00022347],
       [-0.00026716, -0.00124309],
       [-0.00079197, -0.0006335 ]])

# Surprise!

In [24]:
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy

read = Reader(line_format=u'user item rating', sep='\t')
model = SVD(n_factors=20)
train_data = Dataset.load_from_file('data/data.txt', read)

test_data = Dataset.load_from_file('data/test.txt', read)

model.fit(train_data.build_full_trainset())

predict = model.test(test_data.build_full_trainset().build_testset())

err = accuracy.rmse(predict) ** 2

V = model.qi



RMSE: 0.8421


# How do we use Y_test?