# Setup

In [None]:
# this mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


FOLDERNAME = 'cs229_proj/'


assert FOLDERNAME is not None, "[!] Enter the foldername."

# now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

%cd drive/My\ Drive/$FOLDERNAME/

Mounted at /content/drive
/content/drive/My Drive/cs229_proj


In [None]:
# Sanity check: print the current working directory and list its contents.
!pwd
!ls

In [None]:
from IPython.display import Image
import json
import random
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math

import torch
from torch import nn

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from sklearn.impute import SimpleImputer

# Load Files

In [None]:
# Every input CSV sits in the same Drive folder; build the paths from it.
_data_dir = '/content/drive/My Drive/cs229_proj/'

movies_full_path = _data_dir + 'movies_full.csv'
user_ratings_full_path = _data_dir + 'user_ratings_full.csv'
movies_embeddings_full_path = _data_dir + 'movies_embeddings_full.csv'
movies_svd_path = _data_dir + 'movies_svd_full.csv'

# Read each table into its own DataFrame (same order as the paths above).
movies_full_df = pd.read_csv(movies_full_path)
ratings_full_df = pd.read_csv(user_ratings_full_path)
movies_embeddings_df = pd.read_csv(movies_embeddings_full_path)
movies_svd_df = pd.read_csv(movies_svd_path)

In [None]:
# Report (rows, columns) of each loaded table, in load order.
for _frame in (movies_full_df, ratings_full_df, movies_embeddings_df, movies_svd_df):
    print(_frame.shape)

(2220, 55)
(392551, 3)
(2220, 1567)


In [None]:
# print(movies_full_df.columns)

# Column names, in order (55 entries).
# NOTE(review): presumably mirrors movies_full_df.columns - confirm.
all_columns = [
    'movie_id', 'poster_path', 'title', 'year',
    # columns literally named '1' .. '10'
    *(str(k) for k in range(1, 11)),
    'tmdb_budget', 'imdb_budget', 'tmdb_revenue', 'imdb_revenue',
    'tmdb_vote_average', 'tmdb_vote_count',
    'imdb_vote_average', 'imdb_vote_count',
    'tmdb_popularity', 'tmdb_runtime', 'imdb_runtime',
    'main_genre', 'genres', 'director', 'writer', 'main_actor',
    'mpaa_rating', 'overview', 'tagline', 'keywords', 'release_date',
    'main_prod_company', 'production_companies',
    'country', 'production_countries',
    'original_language', 'spoken_languages',
    'rt_info', 'critics_consensus', 'actors',
    'tm_status', 'tm_rating', 'tm_count',
    'audience_status', 'audience_rating', 'audience_count',
    'tm_top_critics_count', 'tm_fresh_critics_count', 'tm_rotten_critics_count',
    'ebert_rating', 'boxd_vote_average',
]
print(len(all_columns))

55


# Features (and data preprocessing)

In [None]:
# All features split up into many categories

# Numeric columns.
regression_features = [
    'year',
    *(str(k) for k in range(1, 11)),
    'tmdb_budget', 'imdb_budget', 'tmdb_revenue', 'imdb_revenue',
    'tmdb_vote_average', 'tmdb_vote_count',
    'imdb_vote_average', 'imdb_vote_count',
    'tmdb_popularity', 'tmdb_runtime', 'imdb_runtime',
    'tm_rating', 'tm_count',
    'audience_rating', 'audience_count',
    'tm_top_critics_count', 'tm_fresh_critics_count', 'tm_rotten_critics_count',
    'ebert_rating', 'boxd_vote_average',
]

# Single-label categorical columns.
classification_features = [
    'main_genre', 'mpaa_rating', 'country', 'main_prod_company',
    'tm_status', 'audience_status', 'original_language',
]

# Columns that need dedicated processing before they can be used.
complex_features = [
    'genres', 'director', 'writer', 'main_actor',
    'overview', 'tagline', 'keywords', 'release_date',
    'production_companies', 'production_countries', 'spoken_languages',
    'rt_info', 'critics_consensus', 'actors',
]

# Remaining columns not assigned to any of the groups above.
other_features = ['movie_id', 'poster_path', 'title']

# The four groups together should account for every column.
print(sum(len(_group) for _group in (regression_features, classification_features,
                                     complex_features, other_features)))

55


In [None]:
# Remove rows that have at least one NaN value (to make LinReg easier)
def remove_missing_values(X):
    """Drop every row of ``X`` that contains at least one missing value.

    Accepts a 2-D NumPy array or a pandas DataFrame and prints a short
    summary of how many rows were removed and how many remain.

    Args:
        X: 2-D array or DataFrame whose incomplete rows should be dropped.

    Returns:
        The rows of ``X`` that contain no missing value (same container type).
    """
    old_len = X.shape[0]
    # pd.isna, unlike np.isnan, also accepts object/string columns.
    X = X[~pd.isna(X).any(axis=1)]
    new_len = X.shape[0]
    removed = old_len - new_len
    # Empty input: report 0% instead of dividing by zero.
    percent_removed = removed / old_len * 100 if old_len else 0.0
    print("Removed " + str(removed) + " out of " + str(old_len) + " elements or " + str(percent_removed) + "%")
    print(str(new_len) + " elements remain")
    return X

In [None]:
def get_num_missing(X):
    """Count the rows of ``X`` that contain at least one missing value.

    Args:
        X: 2-D NumPy array or pandas DataFrame.

    Returns:
        int: number of rows with one or more missing entries.
    """
    # Count on the boolean mask directly instead of building a filtered copy;
    # pd.isna (unlike np.isnan) also accepts object/string columns.
    return int(pd.isna(X).any(axis=1).sum())

In [None]:
def train_test_split(X, y, frac=0.8, verbose=False):
    """Split ``X`` and ``y`` into leading (train) and trailing (test) parts.

    No shuffling happens here: the first ``int(len(X) * frac)`` rows become
    the training set and the remainder the test set.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        y: targets, sliced along its first axis with the same cut point.
        frac: fraction of rows assigned to the training set.
        verbose: when True, print the shape of each of the four pieces.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``.
    """
    cut = int(X.shape[0] * frac)
    X_train, X_test = X[:cut, :], X[cut:, :]
    y_train, y_test = y[:cut], y[cut:]
    if verbose:
        for label, part in (("X_train:", X_train), ("y_train:", y_train),
                            ("X_test:", X_test), ("y_test:", y_test)):
            print(label, part.shape)
    return X_train, y_train, X_test, y_test

# Models (for predicting average Letterboxd rating)


## Multi-Layer Perceptron (Regression Features Only)

In [None]:
# No embeddings

# Numeric predictors for the baseline MLP (13 columns, no embeddings).
feature_lst = [
    'year',
    'tmdb_budget', 'imdb_budget',
    'tmdb_revenue', 'imdb_revenue',
    'tmdb_runtime', 'imdb_runtime',
    'tm_rating', 'tm_count',
    'tm_top_critics_count', 'tm_fresh_critics_count', 'tm_rotten_critics_count',
    'ebert_rating',
]

output_label = 'boxd_vote_average'

# Keep the target beside the features so that a row is dropped when either
# a predictor or the target is missing.
X_all = movies_full_df[[*feature_lst, output_label]]

print(get_num_missing(X_all))

X_noembed = remove_missing_values(X_all)

print(X_noembed.shape)

In [None]:
from sklearn.utils import shuffle

# Separate predictors and target, then shuffle both with one fixed seed so
# rows stay aligned and the ordering is reproducible.
features_noembed, outputs_noembed = (
    X_noembed[feature_lst].to_numpy(),
    X_noembed[output_label].to_numpy(),
)
X_ne, y_ne = shuffle(features_noembed, outputs_noembed, random_state=1)
for _arr in (X_ne, y_ne):
    print(_arr.shape)


In [None]:
# 90/10 hold-out split of the shuffled baseline data (shapes are printed).
X_ne_train, y_ne_train, X_ne_test, y_ne_test = train_test_split(
    X_ne,
    y_ne,
    frac=0.9,
    verbose=True,
)

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler

def get_rmse(y_pred, y_true):
    """Return the root-mean-square error between predictions and targets.

    Both arguments are combined element-wise, so they should have the same
    shape (mismatched shapes would broadcast).
    """
    residual = y_pred - y_true
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)

#this model learns to minimize MAE
def mae_loss(y_pred, y_true):
    """Mean absolute error between two tensors, returned as a scalar tensor."""
    return (y_true - y_pred).abs().mean()

#this model learns to minimize RMSE
def rmse_loss(y_pred, y_true):
    """Root-mean-square error between two tensors, as a scalar tensor."""
    squared_error = (y_pred - y_true) ** 2
    return squared_error.mean().sqrt()

def get_train_test_datasets(X_train, y_train, X_test, y_test):
    """Bundle the training data as a dataset and the test data as tensors.

    Returns:
        Tuple ``(train_dataset, X_train, y_train, X_test_tensor, y_test_tensor)``
        where ``train_dataset`` is a ``MovieDataset`` built from the training
        arrays, ``X_train`` / ``y_train`` are passed through unchanged, and the
        test arrays are converted with ``torch.tensor``.
    """
    train_dataset = MovieDataset(X_train, y_train)
    X_test_tensor = torch.tensor(X_test)
    y_test_tensor = torch.tensor(y_test)
    return train_dataset, X_train, y_train, X_test_tensor, y_test_tensor

In [None]:
class MovieDataset(torch.utils.data.Dataset):
  '''
  Prepare the Movie dataset for regression.

  Holds a feature matrix and a target vector as tensors so that a DataLoader
  can iterate over (features, target) pairs.

  Args:
    X: 2-D features, as a NumPy array / array-like or a torch tensor.
    y: targets aligned with the rows of X, array-like or torch tensor.
    scale_data: when True, standardize X with a StandardScaler fitted on X.

  Attributes:
    X: feature tensor.
    y: target tensor.
    scaler: the fitted StandardScaler, or None when scale_data is False.
  '''

  def __init__(self, X, y, scale_data=True):
    # Tensor inputs are handled as well, so self.X / self.y are always set.
    self.scaler = None
    if scale_data:
      # StandardScaler works on arrays, so unwrap tensors first.
      X_np = X.detach().cpu().numpy() if torch.is_tensor(X) else X
      # Keep the fitted scaler so the same statistics can be reused later.
      self.scaler = StandardScaler()
      X = self.scaler.fit_transform(X_np)
    self.X = X if torch.is_tensor(X) else torch.from_numpy(np.asarray(X))
    self.y = y if torch.is_tensor(y) else torch.from_numpy(np.asarray(y))

  def __len__(self):
    '''Number of samples (rows of X).'''
    return len(self.X)

  def __getitem__(self, i):
    '''Return the (features, target) pair at index i.'''
    return self.X[i], self.y[i]

In [None]:
class MLP(nn.Module):
  '''
    Multilayer Perceptron for regression.

    Architecture: Linear -> Dropout -> ReLU -> Linear -> ReLU -> Linear(1).

    Args:
      in_features: number of input features (default 13).
      hidden1: width of the first hidden layer (default 64).
      hidden2: width of the second hidden layer (default 32).
  '''
  def __init__(self, in_features=13, hidden1=64, hidden2=32):
    super().__init__()
    # Sizes are parameters so the same class can serve other feature sets;
    # the defaults reproduce the previously hard-coded network.
    self.layers = nn.Sequential(
      nn.Linear(in_features, hidden1),
      nn.Dropout(),
      nn.ReLU(),
      nn.Linear(hidden1, hidden2),
      nn.ReLU(),
      nn.Linear(hidden2, 1)
    )


  def forward(self, x):
    '''
      Forward pass: map a (batch, in_features) tensor to (batch, 1).
    '''
    return self.layers(x)

In [None]:
# Set fixed random number seed
torch.manual_seed(42)

# Prepare dataset
train_data, X_train, y_train, X_test, y_test = get_train_test_datasets(X_ne_train, y_ne_train, X_ne_test, y_ne_test)
trainloader = torch.utils.data.DataLoader(train_data, batch_size=4, shuffle=True, num_workers=1)

# Initialize the MLP
mlp = MLP()

# Define the loss function and optimizer.
# NOTE: the loop below optimizes `mae_loss`; `loss_function` (nn.L1Loss) is
# also a mean-absolute-error criterion and is kept so the name stays defined.
loss_function = nn.L1Loss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-4, weight_decay=1e-5)

# Make sure dropout is active even if this cell is re-run after evaluation.
mlp.train()

# Run the training loop
for epoch in range(0, 200): # 200 epochs

  # Print epoch
  print(f'Starting epoch {epoch+1}')

  # Running loss and the number of batches it was accumulated over
  current_loss = 0.0
  batches_since_print = 0

  # Iterate over the DataLoader for training data
  for i, data in enumerate(trainloader, 0):

    # Get and prepare inputs
    inputs, targets = data
    inputs, targets = inputs.float(), targets.float()
    # Match the (batch, 1) shape of the network output.
    targets = targets.reshape((targets.shape[0], 1))

    # Zero the gradients
    optimizer.zero_grad()

    # Perform forward pass
    outputs = mlp(inputs)

    # Compute loss
    loss = mae_loss(outputs, targets)

    # Perform backward pass
    loss.backward()

    # Perform optimization
    optimizer.step()

    # Print statistics: average over the batches actually accumulated since
    # the last report (1 at the start of an epoch, 100 afterwards) instead
    # of a fixed divisor.
    current_loss += loss.item()
    batches_since_print += 1
    if i % 100 == 0:
        print('Loss after mini-batch %5d: %.3f' %
              (i + 1, current_loss / batches_since_print))
        current_loss = 0.0
        batches_since_print = 0

# Process is complete.
print('Training process has finished.')

Starting epoch 1
Loss after mini-batch     1: 0.013
Loss after mini-batch   101: 1.271
Loss after mini-batch   201: 1.242
Starting epoch 2
Loss after mini-batch     1: 0.011
Loss after mini-batch   101: 1.093
Loss after mini-batch   201: 0.954
Starting epoch 3
Loss after mini-batch     1: 0.005
Loss after mini-batch   101: 0.662
Loss after mini-batch   201: 0.503
Starting epoch 4
Loss after mini-batch     1: 0.006
Loss after mini-batch   101: 0.399
Loss after mini-batch   201: 0.376
Starting epoch 5
Loss after mini-batch     1: 0.006
Loss after mini-batch   101: 0.348
Loss after mini-batch   201: 0.348
Starting epoch 6
Loss after mini-batch     1: 0.006
Loss after mini-batch   101: 0.341
Loss after mini-batch   201: 0.303
Starting epoch 7
Loss after mini-batch     1: 0.005
Loss after mini-batch   101: 0.301
Loss after mini-batch   201: 0.295
Starting epoch 8
Loss after mini-batch     1: 0.004
Loss after mini-batch   101: 0.310
Loss after mini-batch   201: 0.309
Starting epoch 9
Loss af

In [None]:
# Train RMSE

# Switch to inference mode (disables dropout) before scoring.
mlp.eval()
# Standardizing X_train with a fresh scaler applies the same per-column
# statistics that MovieDataset used for the training data.
_train_inputs = torch.tensor(StandardScaler().fit_transform(X_train)).float()
y_preds = mlp(_train_inputs).detach().numpy()
# Flatten (n, 1) predictions to (n,) so they line up with y_train.
y_preds = y_preds.reshape(y_preds.shape[0])
print(y_preds.shape)
print(torch.tensor(y_train).float())
print(get_rmse(y_preds, y_train))

(1161,)
tensor([6.7758, 7.3979, 7.4826,  ..., 5.4723, 5.7397, 6.6798])
0.5663755457230754


In [None]:
# Test RMSE

# Inference mode (disables dropout); harmless if already set.
mlp.eval()

# Scale the test features with statistics fitted on the TRAINING features.
# The network was trained on inputs standardized with X_train's mean/std, so
# fitting a new scaler on X_test would feed it differently scaled inputs.
# NOTE: RMSE values recorded with the earlier per-test-set scaling will differ.
_test_scaler = StandardScaler().fit(X_train)
_X_test_scaled = _test_scaler.transform(np.asarray(X_test))

# No gradients are needed for evaluation.
with torch.no_grad():
    y_pred_test = mlp(torch.tensor(_X_test_scaled).float())
# Flatten (n, 1) predictions to (n,) so they line up with y_test.
y_pred_test = y_pred_test.detach().numpy().reshape(y_pred_test.shape[0])
print(y_pred_test.shape)
print(get_rmse(y_pred_test, y_test.numpy()))

(129,)
0.5352834947772126


### Best - No embeddings

Train RMSE = 0.5663755457230754

Test RMSE = 0.5352834947772126

In [None]:
# Best settings below 

# Record of the best configuration for the no-embedding MLP.
# Stored as plain data so the cell runs cleanly; the previous free-form
# snippet ended in a `for` statement without a body and could not be executed.
BEST_NOEMBED_SETTINGS = {
    'layers': [
        'nn.Linear(13, 64)',  # 11
        'nn.Dropout()',
        'nn.ReLU()',
        'nn.Linear(64, 32)',
        'nn.ReLU()',
        'nn.Linear(32, 1)',
    ],
    'loss': 'mae_loss',
    # DataLoader arguments
    'batch_size': 4,
    'shuffle': True,
    'num_workers': 1,
    # torch.optim.Adam arguments
    'optimizer': 'Adam',
    'lr': 1e-4,
    'weight_decay': 1e-5,
    # for epoch in range(0, 200)
    'epochs': 200,
}

## Hybrid Approach (MLP + CNN Embeddings)

In [None]:
# ALL EMBEDDINGS
# Number of raw embedding columns per model.
resnet_ct = 1000
clip_ct = 512

# Tabular predictors shared by both embedding variants.
_tabular_features = [
    'year', 'tmdb_budget', 'tmdb_revenue', 'imdb_revenue',
    'tmdb_runtime', 'imdb_runtime',
    'tm_rating', 'tm_count',
    'tm_top_critics_count', 'tm_fresh_critics_count', 'tm_rotten_critics_count',
    'ebert_rating', 'imdb_budget',
]

# Append the embedding columns 'resnet-1'..'resnet-N' / 'clip-1'..'clip-M'.
feature_resnet_lst = _tabular_features + ['resnet-' + str(i) for i in range(1, resnet_ct + 1)]
feature_clip_lst = _tabular_features + ['clip-' + str(i) for i in range(1, clip_ct + 1)]

output_label = 'boxd_vote_average'

# The target is extracted together with the features so incomplete rows are
# dropped from both at once.
extract_resnet_lst = [*feature_resnet_lst, output_label]
extract_clip_lst = [*feature_clip_lst, output_label]

X_resnet_all = movies_embeddings_df[extract_resnet_lst]
X_clip_all = movies_embeddings_df[extract_clip_lst]

print(get_num_missing(X_resnet_all))

X_resnet = remove_missing_values(X_resnet_all)
X_clip = remove_missing_values(X_clip_all)

print(X_resnet.shape)

In [None]:
# SVD -- Dimension reduction
# Number of SVD components kept per embedding model.
resnet_svd_ct = 25 # from 1000
clip_svd_ct = 100 # from 512

# Tabular predictors shared by both embedding variants.
_tabular_features = [
    'year', 'tmdb_budget', 'tmdb_revenue', 'imdb_revenue',
    'tmdb_runtime', 'imdb_runtime',
    'tm_rating', 'tm_count',
    'tm_top_critics_count', 'tm_fresh_critics_count', 'tm_rotten_critics_count',
    'ebert_rating', 'imdb_budget',
]

# Append the reduced columns 'resnet-svd-1'..'resnet-svd-N' / 'clip-svd-1'..'clip-svd-M'.
feature_resnet_lst = _tabular_features + ['resnet-svd-' + str(i) for i in range(1, resnet_svd_ct + 1)]
feature_clip_lst = _tabular_features + ['clip-svd-' + str(i) for i in range(1, clip_svd_ct + 1)]

output_label = 'boxd_vote_average'

# The target is extracted together with the features so incomplete rows are
# dropped from both at once.
extract_resnet_lst = [*feature_resnet_lst, output_label]
extract_clip_lst = [*feature_clip_lst, output_label]

X_resnet_all = movies_svd_df[extract_resnet_lst]
X_clip_all = movies_svd_df[extract_clip_lst]

print(get_num_missing(X_resnet_all))

X_resnet = remove_missing_values(X_resnet_all)
X_clip = remove_missing_values(X_clip_all)

print(X_resnet.shape)
print(X_clip.shape)
print(X_clip.shape)

In [None]:
from sklearn.utils import shuffle

# ResNet variant: split into predictors / target and shuffle with a fixed seed.
features_resnet, outputs_resnet = (
    X_resnet[feature_resnet_lst].to_numpy(),
    X_resnet[output_label].to_numpy(),
)
X_rn, y_rn = shuffle(features_resnet, outputs_resnet, random_state=0)
for _arr in (X_rn, y_rn):
    print(_arr.shape)

# CLIP variant: same procedure and same seed.
features_clip, outputs_clip = (
    X_clip[feature_clip_lst].to_numpy(),
    X_clip[output_label].to_numpy(),
)
X_cp, y_cp = shuffle(features_clip, outputs_clip, random_state=0)
for _arr in (X_cp, y_cp):
    print(_arr.shape)

In [None]:
# 90/10 hold-out split for each embedding variant (shapes are printed).
X_rn_train, y_rn_train, X_rn_test, y_rn_test = train_test_split(
    X_rn,
    y_rn,
    frac=0.9,
    verbose=True,
)
X_cp_train, y_cp_train, X_cp_test, y_cp_test = train_test_split(
    X_cp,
    y_cp,
    frac=0.9,
    verbose=True,
)

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler

def get_rmse(y_pred, y_true):
    """Return the root-mean-square error between predictions and targets.

    Both arguments are combined element-wise, so they should have the same
    shape (mismatched shapes would broadcast).
    """
    residual = y_pred - y_true
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)

#this model learns to minimize MAE
def mae_loss(y_pred, y_true):
    """Mean absolute error between two tensors, returned as a scalar tensor."""
    return (y_true - y_pred).abs().mean()

#this model learns to minimize RMSE
def rmse_loss(y_pred, y_true):
    """Root-mean-square error between two tensors, as a scalar tensor."""
    squared_error = (y_pred - y_true) ** 2
    return squared_error.mean().sqrt()

In [None]:
def get_train_test_datasets(X_train, y_train, X_test, y_test):
    """Bundle the training data as a dataset and the test data as tensors.

    Returns:
        Tuple ``(train_dataset, X_train, y_train, X_test_tensor, y_test_tensor)``
        where ``train_dataset`` is a ``MovieDataset`` built from the training
        arrays, ``X_train`` / ``y_train`` are passed through unchanged, and the
        test arrays are converted with ``torch.tensor``.
    """
    train_dataset = MovieDataset(X_train, y_train)
    X_test_tensor = torch.tensor(X_test)
    y_test_tensor = torch.tensor(y_test)
    return train_dataset, X_train, y_train, X_test_tensor, y_test_tensor

In [None]:
class MovieDataset(torch.utils.data.Dataset):
  '''
  Prepare the Movie dataset for regression.

  Holds a feature matrix and a target vector as tensors so that a DataLoader
  can iterate over (features, target) pairs.

  Args:
    X: 2-D features, as a NumPy array / array-like or a torch tensor.
    y: targets aligned with the rows of X, array-like or torch tensor.
    scale_data: when True, standardize X with a StandardScaler fitted on X.

  Attributes:
    X: feature tensor.
    y: target tensor.
    scaler: the fitted StandardScaler, or None when scale_data is False.
  '''

  def __init__(self, X, y, scale_data=True):
    # Tensor inputs are handled as well, so self.X / self.y are always set.
    self.scaler = None
    if scale_data:
      # StandardScaler works on arrays, so unwrap tensors first.
      X_np = X.detach().cpu().numpy() if torch.is_tensor(X) else X
      # Keep the fitted scaler so the same statistics can be reused later.
      self.scaler = StandardScaler()
      X = self.scaler.fit_transform(X_np)
    self.X = X if torch.is_tensor(X) else torch.from_numpy(np.asarray(X))
    self.y = y if torch.is_tensor(y) else torch.from_numpy(np.asarray(y))

  def __len__(self):
    '''Number of samples (rows of X).'''
    return len(self.X)

  def __getitem__(self, i):
    '''Return the (features, target) pair at index i.'''
    return self.X[i], self.y[i]

In [None]:
class MLP(nn.Module):
  '''
    Multilayer Perceptron for regression.

    Architecture: Linear -> Dropout -> ReLU -> Linear -> ReLU -> Linear(1).

    Args:
      in_features: number of input features (default 38).
      hidden1: width of the first hidden layer (default 128).
      hidden2: width of the second hidden layer (default 64).
  '''
  def __init__(self, in_features=38, hidden1=128, hidden2=64):
    super().__init__()
    # Sizes are parameters so switching feature sets no longer requires
    # editing the class; the defaults reproduce the hard-coded network.
    self.layers = nn.Sequential(
      nn.Linear(in_features, hidden1), # 1011 vs 523 OR 36 vs 111
      nn.Dropout(),
      nn.ReLU(),
      nn.Linear(hidden1, hidden2),
      nn.ReLU(),
      nn.Linear(hidden2, 1)
    )

  def forward(self, x):
    '''
      Forward pass: map a (batch, in_features) tensor to (batch, 1).
    '''
    return self.layers(x)

In [None]:
# Set fixed random number seed
torch.manual_seed(42)

# Flip between ResNet and ClipNet embeddings
train_data, X_train, y_train, X_test, y_test = get_train_test_datasets(X_rn_train, y_rn_train, X_rn_test, y_rn_test)
# train_data, X_train, y_train, X_test, y_test = get_train_test_datasets(X_cp_train, y_cp_train, X_cp_test, y_cp_test)
trainloader = torch.utils.data.DataLoader(train_data, batch_size=4, shuffle=True, num_workers=1)

# Initialize the MLP
mlp = MLP()

# Define the loss function and optimizer.
# NOTE: the loop below optimizes `mae_loss`; `loss_function` (nn.L1Loss) is
# also a mean-absolute-error criterion and is kept so the name stays defined.
loss_function = nn.L1Loss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-4, weight_decay=1e-5)

# Make sure dropout is active even if this cell is re-run after evaluation.
mlp.train()

# Run the training loop
for epoch in range(0, 200): # 200 epochs

  # Print epoch
  print(f'Starting epoch {epoch+1}')

  # Running loss and the number of batches it was accumulated over
  current_loss = 0.0
  batches_since_print = 0

  # Iterate over the DataLoader for training data
  for i, data in enumerate(trainloader, 0):

    # Get and prepare inputs
    inputs, targets = data
    inputs, targets = inputs.float(), targets.float()
    # Match the (batch, 1) shape of the network output.
    targets = targets.reshape((targets.shape[0], 1))

    # Zero the gradients
    optimizer.zero_grad()

    # Perform forward pass
    outputs = mlp(inputs)

    # Compute loss
    loss = mae_loss(outputs, targets)

    # Perform backward pass
    loss.backward()

    # Perform optimization
    optimizer.step()

    # Print statistics: average over the batches actually accumulated since
    # the last report (1 at the start of an epoch, 100 afterwards) instead
    # of a fixed divisor.
    current_loss += loss.item()
    batches_since_print += 1
    if i % 100 == 0:
        print('Loss after mini-batch %5d: %.3f' %
              (i + 1, current_loss / batches_since_print))
        current_loss = 0.0
        batches_since_print = 0

# Process is complete.
print('Training process has finished.')

Starting epoch 1
Loss after mini-batch     1: 0.014
Loss after mini-batch   101: 1.296
Loss after mini-batch   201: 1.184
Starting epoch 2
Loss after mini-batch     1: 0.009
Loss after mini-batch   101: 0.841
Loss after mini-batch   201: 0.574
Starting epoch 3
Loss after mini-batch     1: 0.003
Loss after mini-batch   101: 0.274
Loss after mini-batch   201: 0.255
Starting epoch 4
Loss after mini-batch     1: 0.002
Loss after mini-batch   101: 0.241
Loss after mini-batch   201: 0.235
Starting epoch 5
Loss after mini-batch     1: 0.004
Loss after mini-batch   101: 0.228
Loss after mini-batch   201: 0.234
Starting epoch 6
Loss after mini-batch     1: 0.002
Loss after mini-batch   101: 0.214
Loss after mini-batch   201: 0.228
Starting epoch 7
Loss after mini-batch     1: 0.002
Loss after mini-batch   101: 0.224
Loss after mini-batch   201: 0.207
Starting epoch 8
Loss after mini-batch     1: 0.001
Loss after mini-batch   101: 0.209
Loss after mini-batch   201: 0.222
Starting epoch 9
Loss af

In [None]:
# Train RMSE

# Switch to inference mode (disables dropout) before scoring.
mlp.eval()
# Standardizing X_train with a fresh scaler applies the same per-column
# statistics that MovieDataset used for the training data.
_train_inputs = torch.tensor(StandardScaler().fit_transform(X_train)).float()
y_preds = mlp(_train_inputs).detach().numpy()
# Flatten (n, 1) predictions to (n,) so they line up with y_train.
y_preds = y_preds.reshape(y_preds.shape[0])
print(y_preds.shape)
print(torch.tensor(y_train).float())
print(get_rmse(y_preds, y_train))

In [None]:
# Test RMSE

# Inference mode (disables dropout); harmless if already set.
mlp.eval()

# Scale the test features with statistics fitted on the TRAINING features.
# The network was trained on inputs standardized with X_train's mean/std, so
# fitting a new scaler on X_test would feed it differently scaled inputs.
# NOTE: RMSE values recorded with the earlier per-test-set scaling will differ.
_test_scaler = StandardScaler().fit(X_train)
_X_test_scaled = _test_scaler.transform(np.asarray(X_test))

# No gradients are needed for evaluation.
with torch.no_grad():
    y_pred_test = mlp(torch.tensor(_X_test_scaled).float())
# Flatten (n, 1) predictions to (n,) so they line up with y_test.
y_pred_test = y_pred_test.detach().numpy().reshape(y_pred_test.shape[0])
print(y_pred_test.shape)
print(get_rmse(y_pred_test, y_test.numpy()))

### Best - ResNet-50 (SVD of Embeddings = 25 components)

Train RMSE = 0.4218106118942377

Test RMSE = 0.48053026270642585

In [None]:
# Best settings

# Record of the best configuration for ResNet-50 with 25 SVD components.
# Stored as plain data so the cell runs cleanly; the previous free-form
# snippet (top-level `self`, `for` without a body) could not be executed.
BEST_RESNET_SVD_SETTINGS = {
    'layers': [
        'nn.Linear(38, 128)',  # 1011 vs 523 OR 36 vs 111
        'nn.Dropout()',
        'nn.ReLU()',
        'nn.Linear(128, 64)',
        'nn.ReLU()',
        'nn.Linear(64, 1)',
    ],
    'loss': 'mae_loss',
    # DataLoader arguments
    'batch_size': 4,
    'shuffle': True,
    'num_workers': 1,
    # torch.optim.Adam arguments
    'optimizer': 'Adam',
    'lr': 1e-4,
    'weight_decay': 1e-5,
    # for epoch in range(0, 200)
    'epochs': 200,
}

### Best - CLIP (SVD of Embeddings = 100 components)

Train RMSE = 0.29143314082098626

Test RMSE = 0.5397942532279681

In [None]:
# Best settings

# Record of the best configuration for CLIP with 100 SVD components.
# Stored as plain data so the cell runs cleanly; the previous free-form
# snippet (top-level `self`, `for` without a body) could not be executed.
BEST_CLIP_SVD_SETTINGS = {
    'layers': [
        'nn.Linear(113, 128)',  # 36 vs 111
        'nn.Dropout()',
        'nn.ReLU()',
        'nn.Linear(128, 32)',
        'nn.ReLU()',
        'nn.Linear(32, 1)',
    ],
    'loss': 'mae_loss',
    # DataLoader arguments
    'batch_size': 4,
    'shuffle': True,
    'num_workers': 1,
    # torch.optim.Adam arguments
    'optimizer': 'Adam',
    'lr': 1e-4,
    'weight_decay': 1e-5,
    # for epoch in range(0, 200)
    'epochs': 200,
}

### Best - ResNet-50 (1000 embeddings)

Train RMSE = 0.5604313187795152

Test RMSE = 0.6657604022378522

In [None]:
# Best settings

# Record of the best configuration for ResNet-50 with all 1000 embeddings.
# Stored as plain data so the cell runs cleanly; the previous free-form
# snippet (top-level `self`, `for` without a body) could not be executed.
# The loss function was not written down in the original notes for this run.
BEST_RESNET_FULL_SETTINGS = {
    'layers': [
        'nn.Linear(1013, 64)',  # 1011 vs 523
        'nn.Dropout()',
        'nn.ReLU()',
        'nn.Linear(64, 32)',
        'nn.ReLU()',
        'nn.Linear(32, 1)',
    ],
    # DataLoader arguments
    'batch_size': 4,
    'shuffle': True,
    'num_workers': 1,
    # torch.optim.Adam arguments
    'optimizer': 'Adam',
    'lr': 1e-4,
    'weight_decay': 1e-5,
    # for epoch in range(0, 100)
    'epochs': 100,
}

### Best - CLIP (512 embeddings)

Train RMSE = 0.3406810939517073

Test RMSE = 0.5907930360256579

In [None]:
# Best settings

# Record of the best configuration for CLIP with all 512 embeddings.
# Stored as plain data so the cell runs cleanly; the previous free-form
# snippet (top-level `self`, `for` without a body) could not be executed.
# The loss function was not written down in the original notes for this run.
BEST_CLIP_FULL_SETTINGS = {
    'layers': [
        'nn.Linear(525, 64)',  # 1011 vs 523
        'nn.Dropout()',
        'nn.ReLU()',
        'nn.Linear(64, 32)',
        'nn.ReLU()',
        'nn.Linear(32, 1)',
    ],
    # DataLoader arguments
    'batch_size': 4,
    'shuffle': True,
    'num_workers': 1,
    # torch.optim.Adam arguments
    'optimizer': 'Adam',
    'lr': 1e-4,
    'weight_decay': 1e-5,
    # for epoch in range(0, 100)
    'epochs': 100,
}