<a href="https://colab.research.google.com/github/heathjohn62/CS155-Fake-Deep/blob/main/project2/get_UV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Miniproject 2: MovieLens
Authors: Julia Sloan, Ayooluwa Odemuyiwa, Randall Pulido, John Heath

In [1]:
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 4.7MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1615277 sha256=d149ca61bc0b26caace26ece1dfedd6c18308b262e3bf88134a4a0e68a95db2c
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [2]:
import numpy as np
import pandas as pd
import urllib.request
import matplotlib.pyplot as plt
from sklearn.utils.extmath import randomized_svd
import bokeh, bokeh.plotting, bokeh.io, bokeh.models, bokeh.layouts
from surprise import Reader, Dataset, SVD, accuracy
bokeh.io.output_notebook()
np.random.seed(62)

Fill in these functions to train the SVD

In [3]:
def grad_U(Ui, Yij, Vj, reg, eta):
    """
    Takes as input Ui (the ith row of U), a training point Yij, the column
    vector Vj (jth column of V^T), reg (the regularization parameter lambda),
    and eta (the learning rate).

    Returns the gradient of the regularized loss function with
    respect to Ui multiplied by eta.
    """
    grad = reg * Ui - (Yij - np.dot(Ui.T, Vj)) * Vj
    return eta * grad


def grad_V(Vj, Yij, Ui, reg, eta):
    """
    Takes as input the column vector Vj (jth column of V^T), a training point Yij,
    Ui (the ith row of U), reg (the regularization parameter lambda),
    and eta (the learning rate).

    Returns the gradient of the regularized loss function with
    respect to Vj multiplied by eta.
    """
    grad = reg * Vj - (Yij - np.dot(Ui.T, Vj)) * Ui
    return eta * grad

def get_err(U, V, Y, reg=0.0):
    """
    Takes as input a matrix Y of triples (i, j, Y_ij) where i is the index of a user,
    j is the index of a movie, and Y_ij is user i's rating of movie j and
    user/movie matrices U and V.

    Returns the mean regularized squared-error of predictions made by
    estimating Y_{ij} as the dot product of the ith row of U and the jth column of V^T.
    """
    u_norm = np.linalg.norm(U)
    v_norm = np.linalg.norm(V)
    norms = (reg / 2.0) * ((u_norm ** 2) + (v_norm ** 2))

    error = norms
    for y in Y:
        y_ij = y[2]
        i = y[0] - 1
        j = y[1] - 1

        est = np.dot(U[i], V[j])
        error += .5 * ((y_ij - est) ** 2)

    return error / len(Y)


def train_model(M, N, K, eta, reg, Y, eps=0.0001, max_epochs=300):
    """
    Given a training data matrix Y containing rows (i, j, Y_ij)
    where Y_ij is user i's rating on movie j, learns an
    M x K matrix U and N x K matrix V such that rating Y_ij is approximated
    by (UV^T)_ij.

    Uses a learning rate of <eta> and regularization of <reg>. Stops after
    <max_epochs> epochs, or once the magnitude of the decrease in regularized
    MSE between epochs is smaller than a fraction <eps> of the decrease in
    MSE after the first epoch.

    Returns a tuple (U, V, err) consisting of U, V, and the unregularized MSE
    of the model.
    """
    # Initialize U and V to contain small random numbers between -.5 and .5
    U = np.random.rand(M, K) - 0.5
    V = np.random.rand(N, K) - 0.5

    NUM_EPOCHS = 300
    error0 = get_err(U, V, Y, reg)
    prev_error = error0
    error = prev_error + 5.0    
    
    # Loop over defined number of epochs
    for n in range(NUM_EPOCHS):
        # Loop over all points in Y randomly
        indices = np.random.permutation(len(Y))
        for m in indices:
            i = Y[m][0] - 1
            j = Y[m][1] - 1
            y = Y[m][2]

            U[i] -= grad_U(U[i], y, V[j], reg, eta)
            V[j] -= grad_V(V[j], y, U[i], reg, eta)

        
        error = get_err(U, V, Y, reg)
        if n == 0:
            error1 = error

        if ((np.abs(prev_error - error) / np.abs(error1 - error0)) <= eps):
            return (U, V, prev_error)

        prev_error = error

    return (U, V, prev_error)

Run the cell below to find U and V

In [4]:
# Data columns: User ID, Movie ID, Rating
Y_train = np.loadtxt('https://raw.githubusercontent.com/lakigigar/Caltech-CS155-2021/main/projects/project2/data/train.txt').astype(int)
Y_test = np.loadtxt('https://raw.githubusercontent.com/lakigigar/Caltech-CS155-2021/main/projects/project2/data/test.txt').astype(int)

M = max(max(Y_train[:,0]), max(Y_test[:,0])).astype(int) # users
N = max(max(Y_train[:,1]), max(Y_test[:,1])).astype(int) # movies
print("Factorizing with ", M, " users, ", N, " movies.")

# Use k=20 as specified in project specs
K = 20

reg = 0.10
eta = 0.03 # learning rate
E_in = []
E_out = []

# Use to compute Ein and Eout
U_T,V_T, err = train_model(M, N, K, eta, reg, Y_train)
V = np.transpose(V_T)
E_in = err
E_out = get_err(U_T, V_T, Y_test)
print("E_in: %.3f"%E_in)
print("E_out: %.3f"%E_out)

Factorizing with  943  users,  1682  movies.
E_in: 0.280
E_out: 0.439


We also want to find U and V using methods (2) and (3) in the problem description. Let's do this using the suprise package. |

In [5]:
def off_the_shelf(trn_data, biased=False):
  # Matrix factorization using SVD, the surprise package
  # Input: traning data (in form of a Dataframe)
  # Output: the training dataset, the algorithm, factorized matrix (U, V)
  reader = Reader()
  trn = Dataset.load_from_df(trn_data, reader)
  trn = trn.build_full_trainset()
  # Change bias on or off
  alg = SVD(n_factors=20, biased=biased, random_state=62)
  alg.fit(trn)
  return trn, alg, alg.pu, alg.qi


def test_alg(alg, tst_data, trn_data):
  # Test a surpirse algorithm
  # Input: an algorithm, test dataset
  # Output: accuracy of the algorithm
  pred2 = alg.test(trn_data)
  pred = alg.test(tst_data)

  # We multiply by 1/2 here so that the error measure is consistent between
  # method 1 and this method. 
  print("Training Error: %.3f."% (0.5 * accuracy.mse(pred2, verbose=False)))
  print("Testing Error: %.3f."% (0.5 * accuracy.mse(pred, verbose=False)))
  

# Read the data into dataframes
trn_data = pd.read_csv(
    'https://raw.githubusercontent.com/lakigigar/Caltech-CS155-2021/main/projects/project2/data/train.txt',
    delimiter = '\t', header = None, names = ["User", "Movie", "Rating"])
tst_data = pd.read_csv(
    'https://raw.githubusercontent.com/lakigigar/Caltech-CS155-2021/main/projects/project2/data/test.txt',
    delimiter = '\t', header = None, names = ["User", "Movie", "Rating"])

# Train the model and find U and V
trn, alg2, U2_bad_index, V2_bad_index = off_the_shelf(trn_data, biased=True)
trn, alg3, U3_bad_index, V3_bad_index = off_the_shelf(trn_data, biased=False)


print("With bias:")
test_alg(alg2, tst_data.to_numpy(), trn_data.to_numpy())
print("No bias:")
test_alg(alg3, tst_data.to_numpy(), trn_data.to_numpy())

With bias:
Training Error: 0.356.
Testing Error: 0.434.
No bias:
Training Error: 0.347.
Testing Error: 0.439.


The suprise matrices are in a very wierd format in comparison to the matrices we really want. I will write a function to re-index them properly. 

In [6]:
def convert_suprise_matrix(V_bad_index, U_bad_index, trn):
  """Accepts the matrices output by Suprise's NMF algorithmm, as well as a training
  set object, and returns correctly indexed U and V matrices. Note that if a user 
  or a movie is not in the training set, then I kept the corresponding matrix 
  column zero. """
  V = np.zeros([K, N])
  U = np.zeros([K, M])
  for i in range(1, N + 1):
    try:
      id = trn.to_inner_iid(i)
      V[:, i-1] = V_bad_index[id]
    except ValueError:
      V[:, i-1] = np.zeros(K)
  for i in range(1, M + 1):
    try:
      id = trn.to_inner_uid(i)
      U[:, i-1] = U_bad_index[id]
    except ValueError:
      U[:, i-1] = np.zeros(K)
  return U, V

# Convert the matrices.
U2, V2 = convert_suprise_matrix(V2_bad_index, U2_bad_index, trn)
U3, V3 = convert_suprise_matrix(V3_bad_index, U3_bad_index, trn)

# Find the transpose matrix
U2_T = np.transpose(U2)
V2_T = np.transpose(V2)
U3_T = np.transpose(U3)
V3_T = np.transpose(V3)

# Now, as a sanity check, let's recalculate the error using the function we used
# in method #1. 
E_in = get_err(U3_T, V3_T, Y_train)
E_out = get_err(U3_T, V3_T, Y_test)
print("No Bias:")
print("E_in: %.3f"%E_in)
print("E_out: %.3f"%E_out)

No Bias:
E_in: 0.348
E_out: 0.444


This looks about right. Now I will apply SVD to V for all three methods.

In [7]:
A, _, _ = randomized_svd( np.transpose(V_T),
                          n_components=2,
                          n_oversamples=10,
                          n_iter='auto',
                          random_state=62)
A2, _, _ = randomized_svd(V2,
                          n_components=2,
                          n_oversamples=10,
                          n_iter='auto',
                          random_state=62)
A3, _, _ = randomized_svd(V3,
                          n_components=2,
                          n_oversamples=10,
                          n_iter='auto',
                          random_state=62)

Next, I evaluate the projection of V into 2-D. 

In [8]:
v_tilde = np.matmul(np.transpose(A), V)
v2_tilde = np.matmul(np.transpose(A2), V2)
v3_tilde = np.matmul(np.transpose(A3), V3)

Next, let's package $\tilde{U}$ and $\tilde{V}$ into a pandas dataframe with the other movie data. 

In [9]:
all_data = np.concatenate([Y_train, Y_test])
data_df = pd.DataFrame(all_data, columns = ["User", "Movie", "Rating"])
data_df

Unnamed: 0,User,Movie,Rating
0,196,242,3
1,186,302,3
2,22,377,1
3,166,346,1
4,298,474,4
...,...,...,...
99995,13,858,1
99996,7,79,4
99997,588,1058,2
99998,916,727,4


I'm also going to want to evaluate the average rating and the number of ratings for each movie, and I'll start out by making a dataframe to store this information. 

In [10]:
N = 1682
M = 943
users = list(range(1, M + 1))
movies = list(range(1, N + 1))

movie_df = pd.DataFrame(movies, columns = ["Movie"])
movie_df["x"] = v_tilde[0, :]
movie_df["y"] = v_tilde[1, :]
movie_df["x2"] = v2_tilde[0, :]
movie_df["y2"] = v2_tilde[1, :]
movie_df["x3"] = v3_tilde[0, :]
movie_df["y3"] = v3_tilde[1, :]
movie_df

Unnamed: 0,Movie,x,y,x2,y2,x3,y3
0,1,-2.013017,-0.286566,0.118467,0.221348,-1.952675,-0.344396
1,2,-1.740218,-0.520055,0.064167,-0.113319,-1.659021,-0.263836
2,3,-1.495850,-0.067501,-0.218448,-0.232450,-1.549681,-0.017890
3,4,-1.862405,0.162890,0.086272,0.238187,-1.762672,-0.001423
4,5,-1.731287,-0.614927,0.414083,-0.000404,-1.692985,-0.578998
...,...,...,...,...,...,...,...
1677,1678,0.258241,-0.072192,0.000000,0.000000,0.000000,0.000000
1678,1679,-0.983659,-0.147659,-0.107811,-0.064935,-0.420683,0.126325
1679,1680,-0.924068,-0.025866,0.010306,0.177098,-0.245842,0.076767
1680,1681,-1.328897,0.269510,-0.070143,0.115342,-0.458186,0.058283


Let's evaluate the average rating and the number of ratings for each movie. 

In [11]:
avg_ratings = np.zeros(N)
num_ratings = np.zeros(N, dtype = int)
for i in range(N):
  mov = i + 1
  df_mov = data_df[data_df["Movie"] == mov]
  num_ratings[i] = len(df_mov)
  avg_ratings[i] = np.mean(df_mov["Rating"].values)
movie_df["Average Rating"] = avg_ratings
movie_df["Number of Ratings"] = num_ratings
movie_df

Unnamed: 0,Movie,x,y,x2,y2,x3,y3,Average Rating,Number of Ratings
0,1,-2.013017,-0.286566,0.118467,0.221348,-1.952675,-0.344396,3.878319,452
1,2,-1.740218,-0.520055,0.064167,-0.113319,-1.659021,-0.263836,3.206107,131
2,3,-1.495850,-0.067501,-0.218448,-0.232450,-1.549681,-0.017890,3.033333,90
3,4,-1.862405,0.162890,0.086272,0.238187,-1.762672,-0.001423,3.550239,209
4,5,-1.731287,-0.614927,0.414083,-0.000404,-1.692985,-0.578998,3.302326,86
...,...,...,...,...,...,...,...,...,...
1677,1678,0.258241,-0.072192,0.000000,0.000000,0.000000,0.000000,1.000000,1
1678,1679,-0.983659,-0.147659,-0.107811,-0.064935,-0.420683,0.126325,3.000000,1
1679,1680,-0.924068,-0.025866,0.010306,0.177098,-0.245842,0.076767,2.000000,1
1680,1681,-1.328897,0.269510,-0.070143,0.115342,-0.458186,0.058283,3.000000,1


In [12]:
cols = ["Movie Id", "Movie Title", "Unknown", "Action", "Adventure", 
        "Animation", "Children's", "Comedy", "Crime", "Documentary",
        "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", 
        "Romance", "Sci-Fi", "Thriller", "War", "Western"]
movie_url ='https://raw.githubusercontent.com/lakigigar/Caltech-CS155-2021/main/projects/project2/data/movies.txt'
metadata = pd.read_csv(movie_url, delimiter="\t", header=None, 
                       encoding = 'latin-1', names = cols)
metadata.head(5)

Unnamed: 0,Movie Id,Movie Title,Unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


# Part A

I'm going to try and write a general plotting function that will work for all of these plots. 

In [13]:
def get_plot(movie_ids, 
             fig_title, 
             xlabel="Component 1", 
             ylabel="Component 2",
             y_offsets = None,
             x_offsets = None,
             component_1 = "x",
             component_2 = "y",
             x_range=None,
             y_range=None):
  """Returns a bokeh plot in which the movies are plotted onto a 2D projection. 
  Each movie is represented by its title on the plot."""
  # If we didn't pass offsets to the labels, we assume label positions do not
  # require adjusting and so set the adjustments to zero. 
  if type(y_offsets) == type(None):
    y_offsets = np.zeros(len(movie_ids))
  if type(x_offsets) == type(None):
    x_offsets = np.zeros(len(movie_ids))

  # Generate the figure
  fig = bokeh.plotting.Figure(width = 400,
                              height = 600,
                              title = fig_title,
                              x_axis_label = xlabel,
                              y_axis_label = ylabel,
                              x_range=x_range,
                              y_range=y_range)
  
  # Obtain the coordinates and the names of the chosen movies. 
  indices = movie_ids - 1
  x = np.zeros(len(movie_ids))
  y = np.zeros(len(movie_ids))
  num_rat = np.zeros(len(movie_ids))
  names = np.zeros(len(movie_ids), dtype=object)
  i = 0
  for index in indices:
    x[i] = movie_df[component_1].values[index]
    y[i] = movie_df[component_2].values[index]
    num_rat[i] = movie_df["Number of Ratings"].values[index]
    names[i] = metadata["Movie Title"].values[index]
    i += 1
  
  num_rat = num_rat / np.max(num_rat) * 15
  # Plot a dot corresponding to each movie
  fig.circle(x, y, size = num_rat, color="red")

  # Also plot the movie name next to the dot. 
  # I adjust the positions of the labels so that they do not overlap. 
  x = x + x_offsets
  y = y + y_offsets
  source = bokeh.models.ColumnDataSource(dict(x=x, y=y, text=names))
  glyph = bokeh.models.Text(x="x", y="y", text="text", text_color="black",
                            text_font_size = "8pt")
  fig.add_glyph(source, glyph)
  
  return fig

Next, I will choose ten movies and gather their ids. 

In [14]:
ten_choice_movies = ["Seven (Se7en) (1995)",
                     "Twelve Monkeys (1995)",
                     "Pulp Fiction (1994)",
                     "Silence of the Lambs, The (1991)",
                     "Shawshank Redemption, The (1994)",
                     "Free Willy (1993)",
                     "Ace Ventura: Pet Detective (1994)",
                     "Star Wars (1977)",
                     "Free Willy 2: The Adventure Home (1995)",
                     "Lion King, The (1994)"]
ten_choice_ids = np.zeros(10, dtype=int)
for i in range(10):
  index = np.where(metadata["Movie Title"].values == ten_choice_movies[i])[0][0]
  ten_choice_ids[i] = index + 1

Now I will produce the plots, manually adjusting the label positions.

In [15]:
# Method 1
x_offsets_1 = np.array([0, 0, 0, 0, 0, -0.05, -1,  0.02, -1.22, 0]) + 0.02
y_offsets_1 = np.array([0, 0, 0, 0, 0,  0.01,  0, -0.02,     0, 0])
plot1 = get_plot(ten_choice_ids, "Visualization of ten movies using homework 5 code.", 
                x_offsets = x_offsets_1, y_offsets = y_offsets_1)
# Method 2
x_offsets_2 = np.array([0, 0,  0.01,  0.01,  -0.64, 0, -0.6, 0, 0, 0]) + 0.02
y_offsets_2 = np.array([0, 0, -0.015, -0.01,     0, 0,    0, 0, 0, 0])
plot2 = get_plot(ten_choice_ids, "Visualization of ten movies using SVD with bias.", 
                x_offsets = x_offsets_2, y_offsets = y_offsets_2, 
                component_1 = "x2", component_2 = "y2")
# Method 3
x_offsets_3 = np.array([0, 0, 0, 0, 0,  -0.32, -0.68, 0, -0.82, 0]) + 0.02
y_offsets_3 = np.array([0, 0, 0, 0, 0, -0.04,    0, 0,     0, 0])
plot3 = get_plot(ten_choice_ids, "Visualization of ten movies using SVD without bias.",
                x_offsets = x_offsets_3, y_offsets = y_offsets_3, 
                component_1 = "x3", component_2 = "y3")
row = bokeh.layouts.row([plot1, plot2, plot3])
bokeh.io.show(row)

# Part B
Next I will visualize the top ten most popular movies. 

In [16]:
sorted_df = movie_df.sort_values("Number of Ratings", ascending=False)
pop_indices = sorted_df["Movie"].values[:10] - 1
for index in pop_indices:
  print(metadata["Movie Title"].values[index])

Star Wars (1977)
Contact (1997)
Fargo (1996)
Return of the Jedi (1983)
Liar Liar (1997)
English Patient, The (1996)
Scream (1996)
Toy Story (1995)
Air Force One (1997)
Independence Day (ID4) (1996)


In [73]:
# Method 1
x_offsets_1 = np.array([0.013, 0.02, 0.01, 0.025, -0.18, -0.31, -0.19, -0.2, -0.25, -0.37])
y_offsets_1 = np.array([0, 0, 0.017, 0, -0.025, -0.03, -0.02, -0.02, -0.03, -0.02])
plot1 = get_plot(pop_indices + 1, "Viz. of ten most popular movies using homework 5 code.", 
                x_offsets = x_offsets_1, y_offsets = y_offsets_1)
# Method 2
x_offsets_2 =  np.array([-0.43, 0.05, 0.01, -0.5, -0.05, -0.4, -0.4, -0.35, -0.4, -0.7])
y_offsets_2 = np.array([0.02, -0.02, 0.017, -0.05, 0.02, -0.04, -0.035, 0.02, 0.02, -0.04])
plot2 = get_plot(pop_indices + 1, "Viz. of ten most popular movies using SVD with bias.", 
                x_offsets = x_offsets_2, y_offsets = y_offsets_2, 
                component_1 = "x2", component_2 = "y2")
# Method 3
x_offsets_3 = np.array([0.013, 0.02,0.01,0.025,-0.16, -0.26, -0.16,-0.18, -0.21, -0.31])
y_offsets_3 = np.array([0, 0, 0.017, -0.04, -0.025, -0.03, 0.03, 0.02, -0.03, -0.02])
plot3 = get_plot(pop_indices + 1, "Viz. of ten most popular movies using SVD without bias.",
                x_offsets = x_offsets_3, y_offsets = y_offsets_3, 
                component_1 = "x3", component_2 = "y3")
row = bokeh.layouts.row([plot1, plot2, plot3])
bokeh.io.show(row)

# Part C
Next I will create visualizations for the ten highest rated movies. 

In [18]:
sorted_df = movie_df.sort_values("Average Rating", ascending=False)
top_indices = sorted_df["Movie"].values[:10] - 1
for index in top_indices:
  print(metadata["Movie Title"].values[index])

Great Day in Harlem, A (1994)
Someone Else's America (1995)
Marlene Dietrich: Shadow and Light (1996) 
They Made Me a Criminal (1939)
Entertaining Angels: The Dorothy Day Story (1996)
Star Kid (1997)
Santa with Muscles (1996)
Prefontaine (1997)
Aiqing wansui (1994)
Saint of Fort Washington, The (1993)


In [83]:
x_offsets = np.array(
    [0, 
     -0.36, 
     0, 
     0, 
     -0.7, 
     0, 
     0, 
     0, 
     -0.24, 
     0])+0.01
y_offsets = np.array(
    [-0.04, 
     0, 
     0, 
     0, 
     0, 
     0, 
     0, 
     0, 
     0, 
     0])+0.03
plot = get_plot(top_indices + 1, "Visualization of Ten Most Highly Rated Movies", 
                x_offsets = x_offsets, y_offsets = y_offsets)
bokeh.io.show(plot)

# Part D
Visualization of 10 Animation movies

In [20]:
# Get indices of animation movies
anim_df = metadata[metadata["Animation"].values == 1]
anim_ids = np.array([x[0] for x in np.array(anim_df)])[:10]

for index in anim_ids:
    print(metadata["Movie Title"].values[index - 1])

Toy Story (1995)
Lion King, The (1994)
Aladdin (1992)
Snow White and the Seven Dwarfs (1937)
Heavy Metal (1981)
Aristocats, The (1970)
All Dogs Go to Heaven 2 (1996)
Wallace & Gromit: The Best of Aardman Animation (1996)
Wrong Trousers, The (1993)
Grand Day Out, A (1992)


In [21]:
x_offsets = np.array([0, 0.018, -0.29, 0, 0, 0, -0.61, 0, 0, 0])
y_offsets = np.array([0, -0.038, 0, 0, 0, 0, -0.041, 0, 0, 0])

anim_plot = get_plot(anim_ids, "Visualization of Ten Animation Movies", 
                     "Component 1", "Component 2",
                     x_offsets = x_offsets, y_offsets = y_offsets)
bokeh.io.show(anim_plot)

Visualization of 10 action movies

In [22]:
# Get indices of action movies
action_df = metadata[metadata["Action"].values == 1]
action_ids = np.array([x[0] for x in np.array(action_df)])[:10]

for index in action_ids:
    print(metadata["Movie Title"].values[index - 1])

GoldenEye (1995)
Get Shorty (1995)
From Dusk Till Dawn (1996)
Muppet Treasure Island (1996)
Braveheart (1995)
Rumble in the Bronx (1995)
Bad Boys (1995)
Apollo 13 (1995)
Batman Forever (1995)
Desperado (1995)


In [23]:
x_offsets = np.array([0, 0, -0.265, -0.28, 0, -0.26, 0, 0, -0.22, 0])
y_offsets = np.array([0, 0, 0, -0.042, 0, 0, 0, 0, 0, 0])

anim_plot = get_plot(action_ids, "Visualization of Ten Action Movies", 
                     "Component 1", "Component 2",
                     x_offsets = x_offsets, y_offsets = y_offsets)
bokeh.io.show(anim_plot)

Visualization of 10 crime movies

In [24]:
# Get indices of crime movies
crime_df = metadata[metadata["Crime"].values == 1]
crime_ids = np.array([x[0] for x in np.array(crime_df)])[:10]

for index in crime_ids:
    print(metadata["Movie Title"].values[index - 1])

Copycat (1995)
Seven (Se7en) (1995)
Usual Suspects, The (1995)
From Dusk Till Dawn (1996)
Rumble in the Bronx (1995)
Batman Forever (1995)
Strange Days (1995)
Professional, The (1994)
Pulp Fiction (1994)
Mask, The (1994)


In [25]:
x_offsets = np.array([0, 0, 0, -0.36, -0.35, -0.3, 0, -0.32, 0, 0])
y_offsets = np.array([0, -0.05, -0.05, 0, -0.05, 0, -0.05, 0.01, 0, 0])

anim_plot = get_plot(crime_ids, "Visualization of Ten Crime Movies", 
                     "Component 1", "Component 2",
                     x_offsets = x_offsets, y_offsets = y_offsets)
bokeh.io.show(anim_plot)