<a href="https://colab.research.google.com/github/heathjohn62/CS155-Fake-Deep/blob/main/project2/get_UV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Miniproject 2: MovieLens
Authors: Julia Sloan, Ayooluwa Odemuyiwa, Randall Pulido, John Heath

In [109]:
import numpy as np
import pandas as pd
import urllib.request
import matplotlib.pyplot as plt
from sklearn.utils.extmath import randomized_svd
import bokeh, bokeh.plotting, bokeh.io, bokeh.models
bokeh.io.output_notebook()

Fill in these functions to train the SVD

In [2]:
def grad_U(Ui, Yij, Vj, reg, eta):
    """
    Compute one SGD step for a single user factor.

    Ui is the ith row of U, Yij the observed rating, Vj the jth column of
    V^T, reg the regularization strength (lambda) and eta the learning rate.

    Returns eta times the gradient of the regularized squared loss with
    respect to Ui; the caller subtracts this from Ui.
    """
    # Signed prediction error for this (user, movie) pair.
    residual = Yij - np.dot(Ui.T, Vj)
    return eta * (reg * Ui - residual * Vj)


def grad_V(Vj, Yij, Ui, reg, eta):
    """
    Compute one SGD step for a single movie factor.

    Vj is the jth column of V^T, Yij the observed rating, Ui the ith row of
    U, reg the regularization strength (lambda) and eta the learning rate.

    Returns eta times the gradient of the regularized squared loss with
    respect to Vj; the caller subtracts this from Vj.
    """
    # Signed prediction error for this (user, movie) pair.
    residual = Yij - np.dot(Ui.T, Vj)
    return eta * (reg * Vj - residual * Ui)

def get_err(U, V, Y, reg=0.0):
    """
    Takes as input a matrix Y of triples (i, j, Y_ij) where i is the 1-based
    index of a user, j is the 1-based index of a movie, and Y_ij is user i's
    rating of movie j, together with the user/movie factor matrices U and V
    (one row per user / per movie).

    Returns the mean regularized squared-error of predictions made by
    estimating Y_{ij} as the dot product of the ith row of U and the jth
    row of V:

        ( reg/2 * (||U||_F^2 + ||V||_F^2) + 1/2 * sum (Y_ij - u_i . v_j)^2 ) / len(Y)

    Note that the regularization term is divided by len(Y) as well. With the
    default reg=0.0 the result is the plain (halved) mean squared error.

    Raises ValueError if Y contains no ratings, since the mean is undefined.
    """
    Y = np.asarray(Y)
    if Y.size == 0:
        raise ValueError("Y must contain at least one (i, j, Y_ij) triple")

    U = np.asarray(U)
    V = np.asarray(V)

    # IDs in Y are 1-based; shift them to 0-based row positions.
    user_rows = Y[:, 0].astype(int) - 1
    movie_rows = Y[:, 1].astype(int) - 1
    ratings = Y[:, 2]

    # Row-wise dot products u_i . v_j for every rated pair in a single call,
    # instead of one Python-level iteration per rating.
    predictions = np.einsum("nk,nk->n", U[user_rows], V[movie_rows])
    squared_loss = 0.5 * np.sum((ratings - predictions) ** 2)

    penalty = (reg / 2.0) * (np.linalg.norm(U) ** 2 + np.linalg.norm(V) ** 2)
    return (penalty + squared_loss) / len(Y)


def train_model(M, N, K, eta, reg, Y, eps=0.0001, max_epochs=300):
    """
    Given a training data matrix Y containing rows (i, j, Y_ij)
    where Y_ij is user i's rating on movie j (i and j are 1-based), learns an
    M x K matrix U and N x K matrix V such that rating Y_ij is approximated
    by (UV^T)_ij.

    Uses a learning rate of <eta> and regularization of <reg>. Stops after
    <max_epochs> epochs, or once the magnitude of the change in regularized
    MSE between epochs is no larger than a fraction <eps> of the change in
    regularized MSE over the first epoch. If the first epoch does not change
    the error at all, training stops immediately.

    Returns a tuple (U, V, err) consisting of U, V, and the unregularized MSE
    of the returned model on Y (as computed by get_err with reg=0).
    """
    # Initialize U and V to contain small random numbers between -.5 and .5
    U = np.random.rand(M, K) - 0.5
    V = np.random.rand(N, K) - 0.5

    prev_error = get_err(U, V, Y, reg)
    first_change = None  # |change in error| over the first epoch

    for _ in range(max_epochs):
        # Visit every training point once per epoch, in a fresh random order.
        for m in np.random.permutation(len(Y)):
            i = int(Y[m][0]) - 1
            j = int(Y[m][1]) - 1
            y = Y[m][2]

            # V[j] is updated using the freshly updated U[i].
            U[i] -= grad_U(U[i], y, V[j], reg, eta)
            V[j] -= grad_V(V[j], y, U[i], reg, eta)

        error = get_err(U, V, Y, reg)
        change = np.abs(prev_error - error)
        if first_change is None:
            first_change = change

        # Multiplicative form of change / first_change <= eps, which avoids
        # dividing by zero when the first epoch leaves the error unchanged.
        if change <= eps * first_change:
            break

        prev_error = error

    # Report the error of the matrices actually returned, without the
    # regularization penalty, as the docstring promises.
    return (U, V, get_err(U, V, Y))

Run the cell below to find U and V

In [19]:
# Data columns: User ID, Movie ID, Rating
Y_train = np.loadtxt('https://raw.githubusercontent.com/lakigigar/Caltech-CS155-2021/main/projects/project2/data/train.txt').astype(int)
Y_test = np.loadtxt('https://raw.githubusercontent.com/lakigigar/Caltech-CS155-2021/main/projects/project2/data/test.txt').astype(int)

# IDs are 1-based, so the largest ID seen in either split is the count.
M = int(max(Y_train[:, 0].max(), Y_test[:, 0].max())) # users
N = int(max(Y_train[:, 1].max(), Y_test[:, 1].max())) # movies
print("Factorizing with ", M, " users, ", N, " movies.")

# Use k=20 as specified in project specs
K = 20

reg = 0.10
eta = 0.03 # learning rate

# train_model returns an M x K user matrix and an N x K movie matrix. They are
# named U_T / V_T because later cells transpose them to get K x M / K x N.
U_T, V_T, err = train_model(M, N, K, eta, reg, Y_train)
E_in = err
# Score the matrices that were just trained. `U` and `V` are only defined in a
# later cell (as the transposes), so using them here either raises NameError
# on a fresh run or silently evaluates stale, wrongly-shaped matrices.
E_out = get_err(U_T, V_T, Y_test)
print("E_in: %.3f"%E_in)
print("E_out: %.3f"%E_out)

Factorizing with  943  users,  1682  movies.
E_in: 0.311
E_out: 0.447


Now I will apply SVD to both V and U, keeping only the first two components of each.

In [20]:
# Both factorizations use identical settings: keep the two leading components.
svd_options = dict(n_components=2,
                   n_oversamples=10,
                   n_iter='auto',
                   random_state=None)

# The trained matrices are transposed before being factorized, so A and C
# hold the leading left singular vectors of the transposed factors.
A, Sigma_V, B_T = randomized_svd(V_T.T, **svd_options)
C, Sigma_U, D_T = randomized_svd(U_T.T, **svd_options)

Next, I evaluate the projection of U and V into 2-D. 

In [29]:
# Transpose the trained factors, then project each onto its own two leading
# left singular vectors (C for users, A for movies).
U = U_T.T
V = V_T.T
u_tilde = C.T @ U
v_tilde = A.T @ V

Next, let's package $\tilde{U}$ and $\tilde{V}$ into a pandas dataframe with the other movie data. 

In [40]:
# Stack the train and test splits so per-movie statistics use every rating.
all_data = np.concatenate((Y_train, Y_test), axis=0)
data_df = pd.DataFrame(data=all_data, columns=["User", "Movie", "Rating"])
data_df

Unnamed: 0,User,Movie,Rating
0,196,242,3
1,186,302,3
2,22,377,1
3,166,346,1
4,298,474,4
...,...,...,...
99995,13,858,1
99996,7,79,4
99997,588,1058,2
99998,916,727,4


I'm also going to want to evaluate the average rating and the number of ratings for each movie, and I'll start out by making a dataframe to store this information. 

In [51]:
# Take the counts from the projected factors instead of hard-coding the
# dataset sizes, so the frame always has one row per column of v_tilde.
N = v_tilde.shape[1]  # movies
M = u_tilde.shape[1]  # users
users = list(range(1, M + 1))
movies = list(range(1, N + 1))

# One row per movie ID (1-based) with its 2-D projected coordinates.
movie_df = pd.DataFrame(movies, columns = ["Movie"])
movie_df["x"] = v_tilde[0, :]
movie_df["y"] = v_tilde[1, :]
movie_df

Unnamed: 0,Movie,x,y
0,1,-2.148976,0.361219
1,2,-1.586554,0.544424
2,3,-1.671685,0.058531
3,4,-1.622485,-0.192566
4,5,-1.788037,0.664841
...,...,...,...
1677,1678,0.136724,-0.145023
1678,1679,-0.802910,-0.245996
1679,1680,-0.388831,0.402577
1680,1681,-1.257763,0.148415


Let's evaluate the average rating and the number of ratings for each movie. 

In [54]:
# Aggregate every movie's ratings in one pass rather than filtering the full
# ratings frame once per movie. Reindexing by movie_df["Movie"] aligns the
# statistics with movie_df's row order; a movie with no ratings gets a NaN
# average and a count of 0.
rating_stats = (
    data_df.groupby("Movie")["Rating"]
    .agg(["mean", "count"])
    .reindex(movie_df["Movie"])
)
avg_ratings = rating_stats["mean"].to_numpy()
num_ratings = rating_stats["count"].fillna(0).astype(int).to_numpy()
movie_df["Average Rating"] = avg_ratings
movie_df["Number of Ratings"] = num_ratings
movie_df

Unnamed: 0,Movie,x,y,Average Rating,Number of Ratings
0,1,-2.148976,0.361219,3.878319,452
1,2,-1.586554,0.544424,3.206107,131
2,3,-1.671685,0.058531,3.033333,90
3,4,-1.622485,-0.192566,3.550239,209
4,5,-1.788037,0.664841,3.302326,86
...,...,...,...,...,...
1677,1678,0.136724,-0.145023,1.000000,1
1678,1679,-0.802910,-0.245996,3.000000,1
1679,1680,-0.388831,0.402577,2.000000,1
1680,1681,-1.257763,0.148415,3.000000,1


In [87]:
# Column layout of movies.txt: two identifying fields, then one 0/1 flag
# per genre.
identity_cols = ["Movie Id", "Movie Title"]
genre_cols = [
    "Unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
cols = identity_cols + genre_cols
movie_url ='https://raw.githubusercontent.com/lakigigar/Caltech-CS155-2021/main/projects/project2/data/movies.txt'
# The file is tab-separated, has no header row, and is read as latin-1.
metadata = pd.read_csv(movie_url, sep="\t", header=None,
                       names=cols, encoding='latin-1')
metadata.head(5)

Unnamed: 0,Movie Id,Movie Title,Unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


# Part A
I will choose ten movies and gather their ids. 

In [86]:
ten_choice_movies = ["Seven (Se7en) (1995)",
                     "Twelve Monkeys (1995)",
                     "Pulp Fiction (1994)",
                     "Silence of the Lambs, The (1991)",
                     "Shawshank Redemption, The (1994)",
                     "Free Willy (1993)",
                     "Ace Ventura: Pet Detective (1994)",
                     "Star Wars (1977)",
                     "Free Willy 2: The Adventure Home (1995)",
                     "Lion King, The (1994)"]
# The id is taken as (row position of the first exact title match) + 1.
# NOTE(review): this assumes metadata rows are ordered by Movie Id starting
# at 1 -- confirm against the data file.
all_titles = metadata["Movie Title"].values
ten_choice_ids = np.array(
    [np.flatnonzero(all_titles == title)[0] + 1 for title in ten_choice_movies],
    dtype=int,
)
ten_choice_ids

array([11,  7, 56, 98, 64, 78, 67, 50, 35, 71])

Next, I'll write a general plotting function that can be reused for all of these plots.

In [102]:
def get_plot(movie_ids, fig_title, xlabel="Component 1", ylabel="Component 2"):
    """Returns a bokeh plot in which the movies are plotted onto a 2D projection.

    Each movie is drawn as a marker at its (x, y) coordinates from the global
    ``movie_df`` and labelled with its title from the global ``metadata``.

    movie_ids: sequence (list, tuple or array) of 1-based movie ids. Each id
        is converted to a row position by subtracting 1, for both frames.
    fig_title: title shown above the figure.
    xlabel, ylabel: axis labels.
    """
    # `figure` is the supported factory; the capitalised `Figure` alias is
    # not available in current Bokeh releases.
    fig = bokeh.plotting.figure(width=700,
                                height=500,
                                title=fig_title,
                                x_axis_label=xlabel,
                                y_axis_label=ylabel)

    # Accept any sequence of ids, not only NumPy arrays.
    indices = np.asarray(movie_ids, dtype=int) - 1
    x = movie_df["x"].values[indices]
    y = movie_df["y"].values[indices]
    names = metadata["Movie Title"].values[indices]

    # `scatter` draws circle markers by default and replaces the deprecated
    # use of `circle` for screen-sized markers.
    fig.scatter(x, y)

    # Titles are drawn as a separate text glyph anchored at each point.
    source = bokeh.models.ColumnDataSource(dict(x=x, y=y, text=names))
    glyph = bokeh.models.Text(x="x", y="y", text="text", text_color="black")
    fig.add_glyph(source, glyph)

    return fig

In [110]:
# Draw the ten hand-picked movies in the 2-D projection and display the figure.
plot = get_plot(
    ten_choice_ids,
    fig_title="Visualization of Ten Choice Movies",
    xlabel="Component 1",
    ylabel="Component 2",
)
bokeh.io.show(plot)