# CS 155 Miniproject 2

## Karthik Karnik, Anvita Mishra, Kapil Sinha

## Matrix Factorization Visualizations

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)

from scipy.sparse.linalg import svds
from scipy.sparse import csc_matrix

from prob2utils_skeleton import train_model as normal_train_model
from prob2utils_global_bias_skeleton import train_model as global_bias_train_model

In [2]:
# Both files are tab-separated and have to be decoded as latin-1.
tsv_opts = dict(delimiter='\t', encoding='latin-1')
data = np.loadtxt('data/data.txt', **tsv_opts)
# dtype=None makes genfromtxt infer a type for each column separately.
movies = np.genfromtxt('data/movies.txt', dtype=None, **tsv_opts)

In [3]:
# Train / test rating splits, read with the same tab-separated latin-1 settings.
train, test = (
    np.loadtxt(path, delimiter='\t', encoding='latin-1')
    for path in ('data/train.txt', 'data/test.txt')
)

In [4]:
# The full, train and test rating tables all share one three-column layout.
rating_columns = ["User ID", "Movie ID", "Rating"]
data_df = pd.DataFrame(data, columns=rating_columns)
train_df = pd.DataFrame(train, columns=rating_columns)
test_df = pd.DataFrame(test, columns=rating_columns)

# Movie metadata: ID, title, then one column per genre.
genre_columns = [
    "Unknown", "Action", "Adventure", "Animation", "Childrens", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
# `movies` is a structured array, so the names are assigned after construction
# (passing columns= would select fields instead of renaming them).
movies_df = pd.DataFrame(movies)
movies_df.columns = ["Movie ID", "Movie Title"] + genre_columns

In [18]:
# Preview the ratings table. IDs and ratings display as floats because
# np.loadtxt parsed every column as float.
data_df.head()

Unnamed: 0,User ID,Movie ID,Rating
0,196.0,242.0,3.0
1,186.0,302.0,3.0
2,22.0,377.0,1.0
3,244.0,51.0,2.0
4,166.0,346.0,1.0


In [6]:
# Preview the movie metadata: ID, title and the per-genre columns.
movies_df.head()

Unnamed: 0,Movie ID,Movie Title,Unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [20]:
# loadtxt produced float arrays; cast to int so the user / movie ID columns
# can be used directly as array indices.
Y_train, Y_test = train.astype(int), test.astype(int)

## Our Homework Code

In [8]:
def normal_model(Y_train, Y_test, K=20, eta=0.03, reg=0.1):
    """Factorize the training ratings with the plain homework model.

    Parameters
    ----------
    Y_train, Y_test : array-like of shape (n, 3)
        Rows whose first two columns are the user ID and the movie ID.
        Only ``Y_train`` is handed to the trainer; ``Y_test`` is used solely
        to size the factorization so that IDs occurring only in the test
        split are still covered. Either split may be empty (shape ``(0, 3)``),
        but not both.
    K : int, optional
        Number of latent factors (default 20).
    eta : float, optional
        Learning rate (default 0.03).
    reg : float, optional
        Regularization constant (default 0.1, chosen arbitrarily).

    Returns
    -------
    tuple
        The first two values returned by ``normal_train_model``; its third
        return value (called ``err`` in the original code) is discarded.
    """
    # Largest user / movie ID across both splits. One vectorised reduction
    # replaces the builtin max(), which iterates NumPy arrays element by
    # element in Python.
    ids = np.concatenate([np.asarray(Y_train)[:, :2], np.asarray(Y_test)[:, :2]])
    M, N = (int(v) for v in ids.max(axis=0))  # users, movies
    print("Factorizing with ", M, " users, ", N, " movies.")

    U, V, _ = normal_train_model(M, N, K, eta, reg, Y_train)
    return U, V

In [9]:
# Train the plain model with its default hyper-parameters.
# NOTE(review): the `Ut_` prefix suggests a transposed user matrix; confirm the
# orientation that train_model actually returns.
Ut_normal, V_normal = normal_model(Y_train, Y_test)

Factorizing with  943  users,  1682  movies.


In [12]:
# Inspect the first factor matrix returned by normal_model.
Ut_normal

array([[-0.43026795,  0.34554861, -0.03229387, ..., -0.43249246,
         0.47351649, -0.12387663],
       [-0.55576087,  0.25679424,  0.23652058, ..., -0.48138301,
         0.4283185 ,  0.33536624],
       [-0.30784572, -0.64141794,  0.11347604, ..., -0.06628263,
         0.22891959, -0.42773196],
       ...,
       [-0.69798888,  0.11180515,  0.32679471, ..., -0.41472712,
         0.27336326,  0.05800823],
       [-0.4126295 ,  0.19313172,  0.37289517, ..., -0.60606952,
         0.18533661,  0.22067823],
       [-0.68232312, -0.4387875 ,  0.36343807, ..., -0.40783446,
         0.34441589, -0.09633531]])

In [13]:
# Inspect the second factor matrix returned by normal_model.
V_normal

array([[-0.64821936,  0.21496395,  0.65779388, ..., -0.67507861,
         0.45583782,  0.21585892],
       [-0.43143929,  0.16178595,  0.29910135, ..., -0.53696445,
         0.13753832,  0.19372301],
       [-0.1051763 , -0.1333233 , -0.20901092, ..., -0.74715256,
        -0.08454288, -0.72257973],
       ...,
       [ 0.03494777,  0.25302254, -0.41286688, ..., -0.10436417,
         0.34349404,  0.15035889],
       [-0.00527843, -0.04096055,  0.77073263, ..., -0.25894539,
         0.26408879, -0.08861736],
       [-0.10979314,  0.09654585, -0.17285402, ..., -0.12054226,
         0.37095311, -0.21563747]])

## Our Homework Code with global bias terms

In [10]:
def global_bias_model(Y_train, Y_test, K=20, eta=0.03, reg=0.1):
    """Factorize the training ratings with the global-bias homework model.

    Parameters
    ----------
    Y_train, Y_test : array-like of shape (n, 3)
        Rows whose first two columns are the user ID and the movie ID.
        Only ``Y_train`` is handed to the trainer; ``Y_test`` is used solely
        to size the factorization so that IDs occurring only in the test
        split are still covered. Either split may be empty (shape ``(0, 3)``),
        but not both.
    K : int, optional
        Number of latent factors (default 20).
    eta : float, optional
        Learning rate (default 0.03).
    reg : float, optional
        Regularization constant (default 0.1, chosen arbitrarily).

    Returns
    -------
    tuple
        The first two values returned by ``global_bias_train_model``; its
        third return value (called ``err`` in the original code) is discarded.
    """
    # Largest user / movie ID across both splits. One vectorised reduction
    # replaces the builtin max(), which iterates NumPy arrays element by
    # element in Python.
    ids = np.concatenate([np.asarray(Y_train)[:, :2], np.asarray(Y_test)[:, :2]])
    M, N = (int(v) for v in ids.max(axis=0))  # users, movies
    print("Factorizing with ", M, " users, ", N, " movies.")

    U, V, _ = global_bias_train_model(M, N, K, eta, reg, Y_train)
    return U, V

In [11]:
# Train the global-bias model with its default hyper-parameters.
# NOTE(review): the `Ut_` prefix suggests a transposed user matrix; confirm the
# orientation that train_model actually returns.
Ut_global_bias, V_global_bias = global_bias_model(Y_train, Y_test)

Factorizing with  943  users,  1682  movies.


In [14]:
# Inspect the first factor matrix returned by global_bias_model.
Ut_global_bias

array([[-0.32992777,  1.22748851,  0.25627435, ...,  0.04698692,
         0.33059619, -0.65877357],
       [-0.27688168,  1.48154055, -0.00214309, ...,  0.53295645,
         0.43157264, -1.00154701],
       [-0.37257436,  0.69990856,  0.28980274, ...,  0.1113398 ,
         0.4508133 , -0.45487497],
       ...,
       [-0.37556301,  1.48867554,  0.20777512, ...,  0.06509199,
         0.78686843, -0.93657968],
       [-0.24551388,  1.48086493,  0.3381958 , ...,  0.33963079,
         0.44567329, -0.48851089],
       [-0.28074007,  1.74027183,  0.12111802, ...,  0.51066505,
         0.29378188, -0.38892242]])

In [15]:
# Inspect the second factor matrix returned by global_bias_model.
V_global_bias

array([[-0.61794671,  1.2645151 ,  0.23939471, ...,  0.15275272,
         0.27995885, -0.57358897],
       [-0.26754781,  1.48937659,  0.35240143, ...,  0.13169123,
         0.02692133, -0.01619357],
       [-0.49861157,  1.07477935,  0.18234766, ...,  0.71155125,
         0.34382568, -0.70218723],
       ...,
       [-0.42002348, -0.26244785,  0.43791567, ..., -0.30584044,
         0.47076878, -0.33534995],
       [ 0.30125199,  0.62198818, -0.38120269, ..., -0.31312772,
         0.17118216,  0.24016797],
       [ 0.21907202, -0.04333329,  0.37857935, ..., -0.31807078,
         0.33344849,  0.38833583]])

# Off-the-shelf SVD implementation

## SVD with scipy.sparse.linalg.svds

In [16]:
# Matrix dimensions: the largest user / movie ID seen in either split.
# ndarray.max() does the reduction in NumPy; the builtin max() would iterate
# the columns element by element in Python.
M = int(max(Y_train[:, 0].max(), Y_test[:, 0].max()))  # users
N = int(max(Y_train[:, 1].max(), Y_test[:, 1].max()))  # movies

In [23]:
# Sparse users x movies rating matrix. IDs are 1-based, so shift them to
# 0-based row / column indices.
rows = Y_train[:, 0] - 1
cols = Y_train[:, 1] - 1
ratings = Y_train[:, 2]
# dtype=float is needed because svds only accepts floating-point (or complex)
# matrices, while the ratings were cast to int earlier.
Y = csc_matrix((ratings, (rows, cols)), shape=(M, N), dtype=float)

In [24]:
# Truncated SVD of the ratings matrix keeping 20 singular triplets.
# svds returns (U, s, Vh): per the shapes printed below, `Ut` is (M, 20) with
# one row per user and `V` is (20, N) with one column per movie.
# NOTE(review): despite its name, `Ut` holds svds's U as returned (not transposed).
Ut, S, V = svds(Y, k = 20)

In [29]:
# Inspect the left singular vectors (one row per user).
Ut

array([[-0.0294572 , -0.02872727,  0.09958267, ...,  0.00532886,
        -0.00646452, -0.06858009],
       [ 0.02126707,  0.01648492,  0.00031876, ..., -0.05162523,
         0.04947412, -0.01456516],
       [-0.01676204, -0.01507433,  0.00460181, ..., -0.02484103,
         0.02738109, -0.00619795],
       ...,
       [ 0.04468681, -0.0022922 ,  0.01187261, ..., -0.00641867,
         0.02807787, -0.00839252],
       [ 0.01665304, -0.02876654,  0.00460046, ..., -0.02251532,
        -0.0081022 , -0.02389851],
       [-0.07882847,  0.00834073, -0.02471898, ...,  0.06118871,
         0.00675117, -0.04082959]])

In [30]:
# Inspect the right singular vectors as returned by svds (one column per movie).
V

array([[-1.03083330e-02,  9.79796043e-03, -5.17887985e-02, ...,
        -9.21480226e-04,  1.40381080e-03, -3.76491445e-03],
       [ 4.58842540e-02,  3.29048301e-02, -8.13820678e-03, ...,
        -1.33031796e-04, -1.54253189e-03, -1.04975515e-03],
       [ 1.50523556e-02,  2.81282526e-03, -1.10328026e-02, ...,
        -1.09347307e-04,  1.27155574e-03,  1.41332310e-03],
       ...,
       [ 1.47819672e-02,  6.43509404e-02,  1.13746252e-02, ...,
        -5.94434363e-04,  6.60396035e-04,  3.22627259e-04],
       [ 9.26171052e-02,  3.70110538e-03,  2.70581195e-02, ...,
         5.10081390e-04, -8.18333632e-05, -3.12961813e-04],
       [-9.78700296e-02, -3.53071809e-02, -1.92372756e-02, ...,
        -3.50995852e-05, -3.76607079e-04, -3.60137496e-04]])

In [25]:
# Sanity-check the factor shapes: (M, k), (k,), (k, N).
Ut.shape, S.shape, V.shape

((943, 20), (20,), (20, 1682))

In [26]:
# SVD of the (20, N) movie-factor matrix keeping the 2 largest singular values.
# The two columns of A (20 x 2) are the directions used for the 2-D projection.
A, S2, B = svds(V, k = 2)

In [27]:
# Project the movie factors onto the two directions in A: (2, 20) @ (20, N) -> (2, N).
V_tilde = A.T @ V

In [28]:
# One row per movie: its 2-D coordinates after projection.
V_tilde.transpose()

array([[-0.06771096, -0.03643044],
       [ 0.04638156, -0.00697047],
       [ 0.03252391, -0.02049579],
       ...,
       [ 0.0002737 ,  0.00022347],
       [-0.00026716, -0.00124309],
       [-0.00079197, -0.0006335 ]])

# TODO: How do we use `Y_test`?