<a href="https://colab.research.google.com/github/jaroorhmodi/recommendation-systems-projects/blob/main/recommendation_systems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Recommendation Systems

In this notebook I will follow the lesson in [this colab from Google](https://colab.research.google.com/github/google/eng-edu/blob/main/ml/recommendation-systems/recommendation-systems.ipynb?utm_source=ss-recommendation-systems&utm_campaign=colab-external&utm_medium=referral&utm_content=recommendation-systems#scrollTo=Le0Z54X68_iq) to learn modern methods in Recommendation Systems. To add a layer of challenge and better internalize the information, instead of copying it I will reproduce the work in Pytorch.

#Setup

In [1]:
# @title Imports

#General Utilities
import os
import json

import collections
import numpy as np
import pandas as pd
import sklearn
import sklearn.manifold
import tqdm

#Visualization tools
from matplotlib import pyplot as plt
import altair as alt

alt.data_transformers.enable('default', max_rows=None)
alt.renderers.enable('colab')

#Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

#Select device, cuda if available
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

#These utility functions are defined in the referenced colab
#Defining here in case this is necessary
mask = lambda df, key, function: df[function(df[key])]
def flatten_cols(df):
  df.columns = [' '.join(col).strip() for col in df.columns.values]
  return df

pd.DataFrame.mask = mask
pd.DataFrame.flatten_cols = flatten_cols


In [2]:
# @title Download MovieLens
from urllib.request import urlretrieve
import zipfile

#pick a size: 100k, 1m, 10m, 20m, 25m, 32m
ML_INDEX = ['100k', '1m', '10m', '20m', '25m', '32m']
ML_SIZE = ML_INDEX[1]

DS_URL = f"http://files.grouplens.org/datasets/movielens/ml-{ML_SIZE}.zip"
DS_LOC = f"movielens-{ML_SIZE}.zip"

if not os.path.isfile(DS_LOC):
  print(f"DOWNLOADING {DS_LOC}".upper())
  urlretrieve(DS_URL, DS_LOC)

zip_ref = zipfile.ZipFile(DS_LOC, 'r')
zip_ref.extractall()
print("Done. Dataset contains:")
if ML_SIZE == ML_INDEX[0]:
  print(zip_ref.read(f'ml-{ML_SIZE}/u.info'))

zip_ref.close()

DOWNLOADING MOVIELENS-1M.ZIP
Done. Dataset contains:


In [3]:
#The data loading part in the Colab is almost identical because it's just simple
#pandas csv reading. We can get an idea of the schema at the link below:
#http://files.grouplens.org/datasets/movielens/ml-25m-README.html

def get_filenames():
  #Returns filenames and seperators for user, ratings, movie
  folder = f"ml-{ML_SIZE.upper()}100K" if ML_SIZE == ML_INDEX[2] else f"ml-{ML_SIZE}"
  if ML_SIZE == ML_INDEX[0]:
    user = (os.path.join(folder, 'u.user'), '|')
    ratings = (os.path.join(folder, 'u.data'), '\t')
    movies = (os.path.join(folder,'u.item'), '|')
    # return user, ratings, movies
  if ML_SIZE == ML_INDEX[1] or ML_SIZE == ML_INDEX[2]:
    user = (os.path.join(folder, 'users.dat'), '::')
    ratings = (os.path.join(folder, 'ratings.dat'), '::')
    movies = (os.path.join(folder,'movies.dat'), '::')
  if ML_SIZE == ML_INDEX[3] or ML_SIZE == ML_INDEX[4] or ML_SIZE == ML_INDEX[5]:
    user = (os.path.join(folder, 'users.csv'), '::')
    ratings = (os.path.join(folder, 'ratings.csv'), '::')
    movies = (os.path.join(folder,'movies.csv'), '::')
  return user, ratings, movies

u, r, m = get_filenames()
ufile, usep = u
rfile, rsep = r
mfile, msep = m

#I have altered it to be able to handle other MovieLens datasets.

# USERS_FILENAME =

# Load each data set (users, movies, and ratings).
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code'] if ML_SIZE == ML_INDEX[0] else ['user_id', 'sex', 'age', 'occupation', 'zip_code']
users = pd.read_csv(
    ufile, sep=usep, names=users_cols, encoding='latin-1')

ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    rfile, sep=rsep, names=ratings_cols, encoding='latin-1')

# The movies file contains a binary feature for each genre.
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
] if ML_SIZE == ML_INDEX[0] else ['genres']
movies_base_cols = [
    'movie_id', 'title', 'release_date', "video_release_date", "imdb_url"
] if ML_SIZE == ML_INDEX[0] else ['movie_id', 'title']
movies_cols =  movies_base_cols + genre_cols
movies = pd.read_csv(
    mfile, sep=msep, names=movies_cols, encoding='latin-1')

# Since the ids start at 1, we shift them to start at 0.
users["user_id"] = users["user_id"].apply(lambda x: int(x-1))
movies["movie_id"] = movies["movie_id"].apply(lambda x: int(x-1))
if ML_SIZE == ML_INDEX[0]:
  movies["year"] = movies['release_date'].apply(lambda x: str(x).split('-')[-1])
ratings["movie_id"] = ratings["movie_id"].apply(lambda x: int(x-1))
ratings["user_id"] = ratings["user_id"].apply(lambda x: int(x-1))
ratings["rating"] = ratings["rating"].apply(lambda x: float(x))


  users = pd.read_csv(
  ratings = pd.read_csv(
  movies = pd.read_csv(


In [4]:
occupation_dict = {
    0:  "other",
    1:  "academic/educator",
    2:  "artist",
    3:  "clerical/admin",
    4:  "college/grad student",
    5:  "customer service",
    6:  "doctor/health care",
    7:  "executive/managerial",
    8:  "farmer",
    9:  "homemaker",
    10:  "K-12 student",
    11:  "lawyer",
    12:  "programmer",
    13:  "retired",
    14:  "sales/marketing",
    15:  "scientist",
    16:  "self-employed",
    17:  "technician/engineer",
    18:  "tradesman/craftsman",
    19:  "unemployed",
    20:  "writer"
}
def occupation_mapping(occupation):
  if occupation in occupation_dict:
    return occupation_dict[occupation]
  return occupation

In [5]:
users['occupation'] = users['occupation'].apply(lambda x: occupation_mapping(x))

In [6]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,0,Toy Story (1995),Animation|Children's|Comedy
1,1,Jumanji (1995),Adventure|Children's|Fantasy
2,2,Grumpier Old Men (1995),Comedy|Romance
3,3,Waiting to Exhale (1995),Comedy|Drama
4,4,Father of the Bride Part II (1995),Comedy


In [7]:
# @title Preparing our Dataframes

#In the colab, the data is processed so we create a string of all genres
#a film belongs to (since some movies belong to more than one) and then
#create a randomly sampled "genre" column to provide a single genre label
#for each movie.
def add_genres_100k(movies, genres):
  def all_genres(row):
    active = [genre for genre, g in zip(genres, row) if g==1]
    if len(active) == 0:
      return 'Other'
    return '|'.join(active)
  def random_sample_genre(row):
    active = [genre for genre, g in zip(genres, row) if g==1]
    if len(active) == 0:
      return 'Other'
    return np.random.choice(active)
  movies['genre'] = [
      random_sample_genre(gs) for gs in zip(*[movies[genre] for genre in genres])]
  movies['all_genres'] = [
      all_genres(gs) for gs in zip(*[movies[genre] for genre in genres])]

def add_genres(movies):
  def random_sample_genre(genres):
    active = genres.split('|')
    if len(active) == 0:
      return 'other'
    return np.random.choice(active)
  movies['genre'] = movies['genres'].apply(lambda x: random_sample_genre(x))

if ML_SIZE == ML_INDEX[0]:
  add_genres_100k(movies, genre_cols)
else:
  add_genres(movies)

def train_test_val(df, val=True, holdout=0.1, random_state=None):
  #returns train, test, val with test, val being half of the holdout each
  #if val = False then train is the size of the holdout fraction
  rs = random_state if random_state else np.random.randint(0, 100000)
  if val:
    m1 = 1-holdout/2
    m2 = 1-holdout
    return np.split(df.sample(frac=1, random_state=rs), [int(m1*len(df)), int(m2*len(df))])
  else:
    return np.split(df.sample(frac=1, random_state=rs), [int((1-holdout)*len(df))]), None

In [8]:
movies.head()

Unnamed: 0,movie_id,title,genres,genre
0,0,Toy Story (1995),Animation|Children's|Comedy,Comedy
1,1,Jumanji (1995),Adventure|Children's|Fantasy,Adventure
2,2,Grumpier Old Men (1995),Comedy|Romance,Comedy
3,3,Waiting to Exhale (1995),Comedy|Drama,Comedy
4,4,Father of the Bride Part II (1995),Comedy,Comedy


#Data Exploration

## Ratings

In [9]:
ratings.describe()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
count,1000209.0,1000209.0,1000209.0,1000209.0
mean,3023.512,1864.54,3.581564,972243700.0
std,1728.413,1096.041,1.117102,12152560.0
min,0.0,0.0,1.0,956703900.0
25%,1505.0,1029.0,3.0,965302600.0
50%,3069.0,1834.0,4.0,973018000.0
75%,4475.0,2769.0,4.0,975220900.0
max,6039.0,3951.0,5.0,1046455000.0


In [10]:
#We sort by user and timestamps so we can make a
#validation dataset by picking latest ratings per user
ratings = ratings.sort_values(by=['user_id', 'unix_timestamp'])
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
31,0,3185,4.0,978300019
22,0,1269,5.0,978300055
27,0,1720,4.0,978300055
37,0,1021,5.0,978300055
24,0,2339,3.0,978300103


In [11]:
#Check how many ratings per user, min, max, etc
ratings_breakdown = ratings.groupby('user_id', as_index = False).agg({'rating':['count', 'mean']}).flatten_cols()

#Percentile rank of each user by how many reviews they have made
ratings_breakdown['presence'] = ratings_breakdown['rating count'].rank(pct=True)*100

#Round it to the nearest multiple of BASE for visualization
BASE = 5
ratings_breakdown['presence'] = ratings_breakdown['presence'].apply(lambda x: int(BASE*round(x/BASE)))

ratings_breakdown.head()


Unnamed: 0,user_id,rating count,rating mean,presence
0,0,53,4.188679,30
1,1,129,3.713178,60
2,2,51,3.901961,30
3,3,21,4.190476,0
4,4,198,3.146465,75


So we see that there are at minimum 20 ratings per user and a median of 65 ratings per user.

In [12]:
#@title Altair Visualization
percentile_filter = alt.selection_multi(fields=['presence'])

rating_means = alt.Chart().mark_bar().encode(
    x="presence:N", #specify x
    y=alt.Y("median(rating mean)"), #specify means as y
    color=alt.condition(
        percentile_filter, #condition on presence click
        alt.Color("presence:N", scale=alt.Scale(scheme='category20')), #if clicked color it
        alt.value("lightgray")), #if not clicked gray it out
).properties(width=300, height=300, selection=percentile_filter)

rating_counts = alt.Chart().mark_bar().encode(
    x="presence:N",
    y="mean(rating count)",
    color=alt.condition(
        percentile_filter, #condition on presence click
        alt.Color("presence:N", scale=alt.Scale(scheme='category20')), #if clicked color it
        alt.value("lightgray")), #if not clicked gray it out
).properties(width=300, height=300, selection=percentile_filter)

alt.hconcat(
    rating_means,
    rating_counts,
    data=ratings_breakdown
) #adding data after the fact

We see that the users who give the greatest number of ratings are slightly more critical than the rest.

##Users

In [13]:
users.describe()

Unnamed: 0,user_id,age
count,6040.0,6040.0
mean,3019.5,30.639238
std,1743.742145,12.895962
min,0.0,1.0
25%,1509.75,25.0
50%,3019.5,25.0
75%,4529.25,35.0
max,6039.0,56.0


In [14]:
users.describe(include=[object])


Unnamed: 0,sex,occupation,zip_code
count,6040,6040,6040
unique,2,21,3439
top,M,college/grad student,48104
freq,4331,759,19


We see that there are 21 unique occupations in the data, 2 unique sexes and 795 unique zip codes. Based on our knowledge of the world, occupation might have a useful link to the preferences of users (as would sex and to a lesser degree zip code). Let's visualize.

In [15]:
# @title Altair Visualization
#This is great, I want to practice visualization and understand Altair better.
users_ratings = ( #get count and mean of ratings by user
    ratings
    .groupby('user_id', as_index=False)
    .agg({'rating': ['count', 'mean']})
    .flatten_cols()
    .merge(users, on='user_id')
)
#This specifies an event where a click occurs on one of the occupations
occupation_filter = alt.selection_multi(fields=['occupation'])
#You can add the data later too!
occupation_chart = alt.Chart().mark_bar().encode(
    x="count()", #specify count of entries
    y=alt.Y("occupation:N"), #specify occupations a
    color=alt.condition(
        occupation_filter, #condition on occupation click
        alt.Color("occupation:N", scale=alt.Scale(scheme='category20')), #if clicked color it
        alt.value("lightgray")), #if not clicked gray it out
).properties(width=300, height=300, selection=occupation_filter)

occupation_chart.properties(data=users_ratings) #adding data after the fact

In [16]:
users_ratings.sort_values(by='rating count', ascending=False).head()

Unnamed: 0,user_id,rating count,rating mean,sex,age,occupation,zip_code
4168,4168,2314,3.551858,M,50,other,66048
1679,1679,1850,3.555676,M,25,writer,95380
4276,4276,1743,4.134825,M,35,self-employed,98133
1940,1940,1595,3.054545,M,35,technician/engineer,94550
1180,1180,1521,2.815911,M,35,executive/managerial,20716


Notice that students and educators dominate the data. This is important to note if we want to customize weights for each entry like we can do with **Weighted Matrix Factorization**.

In [17]:
#Taken directly from the tutorial colab, commented as an explainer
# A function that generates a histogram of filtered data.
def filtered_hist(field, label, filter):
  """Creates a layered chart of histograms.
  The first layer (light gray) contains the histogram of the full data, and the
  second contains the histogram of the filtered data.
  Args:
    field: the field for which to generate the histogram.
    label: String label of the histogram.
    filter: an alt.Selection object to be used to filter the data.
  """
  #The filter passed in is any logical filter
  #https://altair-viz.github.io/user_guide/generated/core/altair.FilterTransform.html#altair.FilterTransform
  base = alt.Chart().mark_bar().encode( #we will add the data later!
      x=alt.X(field, bin=alt.Bin(maxbins=20), title=label), #10 bins
      y="count()", #count items
  ).properties(
      width=300,
  )
  return alt.layer(
      base.transform_filter(filter),
      base.encode(color=alt.value('lightgray'), opacity=alt.value(.7)),
  ).resolve_scale(y='independent')

#We can even concat multiple charts!
alt.hconcat(
    filtered_hist('rating count', '# ratings / user', occupation_filter),
    filtered_hist('rating mean', 'mean user rating', occupation_filter),
    occupation_chart,
    data=users_ratings)


From **chart 1**: a small subset of users are overrepresented in the ratings data, which means their opinions will be overweighted in the data if not accounted for.

From **chart 2**: ratings have a slightly positive skew from the "totally average" rating of a 3.0.

We already mentioned chart 3 above. But an added note, when we click through the charts to see breakdowns of the mean ratings and ratings per user: we see that some occupations are more critical of movies than others are.

##Movies

We need to develop an understanding of the movies data as well. Let's see how review data is distributed along the various genres of movies.

In [18]:
movies.describe()

Unnamed: 0,movie_id
count,3883.0
mean,1985.049446
std,1146.778349
min,0.0
25%,981.5
50%,2009.0
75%,2979.5
max,3951.0


In [19]:
len(movies['movie_id'].unique()) == max(movies.movie_id)+1

False

In [20]:
movies.describe(include=[object])

Unnamed: 0,title,genres,genre
count,3883,3883,3883
unique,3883,301,18
top,Toy Story (1995),Drama,Drama
freq,1,843,1176


In [21]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
31,0,3185,4.0,978300019
22,0,1269,5.0,978300055
27,0,1720,4.0,978300055
37,0,1021,5.0,978300055
24,0,2339,3.0,978300103


In [22]:
movie_ratings = movies.merge( #get count and mean of ratings by movie
    ratings
    .groupby("movie_id", as_index = False)
    .agg({"rating":["count", "mean"]})
    .flatten_cols(),
    on="movie_id"
)

In [23]:
# @title Visualize by Genre
movies_ratings = movies.merge(
    ratings
    .groupby('movie_id', as_index=False)
    .agg({'rating': ['count', 'mean']})
    .flatten_cols(),
    on='movie_id')

genre_filter = alt.selection_multi(fields=['genre'])
genre_chart = alt.Chart().mark_bar().encode(
    x="count()",
    y=alt.Y('genre'),
    color=alt.condition(
        genre_filter,
        alt.Color("genre:N"),
        alt.value('lightgray'))
).properties(height=300, selection=genre_filter)

In [24]:
(movies_ratings[['title', 'rating count', 'rating mean']]
 .sort_values('rating count', ascending=False)
 .head(10))

Unnamed: 0,title,rating count,rating mean
2651,American Beauty (1999),3428,4.317386
253,Star Wars: Episode IV - A New Hope (1977),2991,4.453694
1106,Star Wars: Episode V - The Empire Strikes Back...,2990,4.292977
1120,Star Wars: Episode VI - Return of the Jedi (1983),2883,4.022893
466,Jurassic Park (1993),2672,3.763847
1848,Saving Private Ryan (1998),2653,4.337354
575,Terminator 2: Judgment Day (1991),2649,4.058513
2374,"Matrix, The (1999)",2590,4.31583
1178,Back to the Future (1985),2583,3.990321
579,"Silence of the Lambs, The (1991)",2578,4.351823


In [25]:
(movies_ratings[['title', 'rating count', 'rating mean']]
 .mask('rating count', lambda x: x > 20)
 .sort_values('rating mean', ascending=False)
 .head(10))

Unnamed: 0,title,rating count,rating mean
2698,Sanjuro (1962),69,4.608696
1839,Seven Samurai (The Magnificent Seven) (Shichin...,628,4.56051
309,"Shawshank Redemption, The (1994)",2227,4.554558
802,"Godfather, The (1972)",2223,4.524966
708,"Close Shave, A (1995)",657,4.520548
49,"Usual Suspects, The (1995)",1783,4.517106
513,Schindler's List (1993),2304,4.510417
1066,"Wrong Trousers, The (1993)",882,4.507937
861,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),470,4.491489
1108,Raiders of the Lost Ark (1981),2514,4.477725


In [26]:
alt.hconcat(
    filtered_hist('rating count', '# ratings / movie', genre_filter),
    filtered_hist('rating mean', 'mean movie rating', genre_filter),
    genre_chart,
    data=movies_ratings)

When clicking through the charts, we see that different genres have very different distributions of movie ratings and frequency of reviews. Like documentaries have a lot of 5 and 4.5 star mean ratings while comedies (which are a large cohort) have mostly 2-3 star mean ratings.

Also note that dramas and comedies dominate the genre distribution.

#Preliminaries

In [27]:
#merge all three dataframes into one
movielens = ratings.merge(movies, on='movie_id').merge(users, on='user_id')
movielens = movielens.sort_values(by=['user_id', 'unix_timestamp']) #for train-test splits
movielens.describe()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,age
count,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0
mean,3023.512,1864.54,3.581564,972243700.0,29.73831
std,1728.413,1096.041,1.117102,12152560.0,11.75198
min,0.0,0.0,1.0,956703900.0,1.0
25%,1505.0,1029.0,3.0,965302600.0,25.0
50%,3069.0,1834.0,4.0,973018000.0,25.0
75%,4475.0,2769.0,4.0,975220900.0,35.0
max,6039.0,3951.0,5.0,1046455000.0,56.0


In [28]:
movielens.describe(include = [object])

Unnamed: 0,title,genres,genre,sex,occupation,zip_code
count,1000209,1000209,1000209,1000209,1000209,1000209
unique,3706,301,18,2,21,3439
top,American Beauty (1999),Comedy,Comedy,M,college/grad student,94110
freq,3428,116883,233470,753769,131032,3802


In [29]:
#select up until last 5 timestamps per user for train, rest for test
select_cols = ['user_id', 'movie_id', 'rating'] #for now, these are the only cols we use
movielens_train = movielens.groupby('user_id').head(-5).reset_index(drop=True)
movielens_test = movielens.groupby('user_id').tail(5).reset_index(drop=True)


In [30]:
movielens_train = movielens_train[select_cols]
movielens_train.describe()

Unnamed: 0,user_id,movie_id,rating
count,970009.0,970009.0,970009.0
mean,3023.637267,1858.203633,3.581729
std,1727.937634,1093.598873,1.115827
min,0.0,0.0,1.0
25%,1505.0,1027.0,3.0
50%,3072.0,1815.0,4.0
75%,4472.0,2761.0,4.0
max,6039.0,3951.0,5.0


In [31]:
movielens_test = movielens_test[select_cols]
movielens_test.describe()

Unnamed: 0,user_id,movie_id,rating
count,30200.0,30200.0,30200.0
mean,3019.5,2068.057583,3.576291
std,1743.626657,1153.41813,1.15732
min,0.0,0.0,1.0
25%,1509.75,1147.0,3.0
50%,3019.5,2101.0,4.0
75%,4529.25,3080.0,4.0
max,6039.0,3951.0,5.0


In [32]:
#@title Creating a Dataset for Our Model

class MovielensDataset(Dataset):
  def __init__(self, df):
    self.len = len(df)
    self.X = list(zip(df.user_id.values, df.movie_id.values))
    self.y = df['rating'].values

  def __len__(self):
    return self.len

  def __getitem__(self, index):
    #get example with user_id and movie_id
    return self.X[index], self.y[index]

train_dataset = MovielensDataset(movielens_train)
test_dataset = MovielensDataset(movielens_test)


In [33]:
#@title Sparse Rating Tensor (UNUSED)
#In the tutorial it wants a function to create tf.SparseTensor objects
#to hold the datasets, since we am doing this in pytorch we will adjust accordingly.

def build_ratings_sparse_tensor(ratings_df):
  indices = torch.Tensor(ratings_df[['user_id', 'movie_id']].values).transpose(0,-1)
  values = torch.Tensor(ratings_df['rating'].values).transpose(0,-1)
  shape = (users.shape[0], movies.shape[0])
  # shape = (942,1681)
  print(shape)
  # print(f"{indices.shape=}")
  # print(f"{values.shape=}")
  # print(f"{shape=}")
  return torch.sparse_coo_tensor(indices, values, shape)



In [34]:
#we got the sparse tensor to work!
sp_ratings = build_ratings_sparse_tensor(ratings).coalesce()
sp_ratings

(6040, 3883)


tensor(indices=tensor([[   0,    0,    0,  ..., 6039, 6039, 6039],
                       [   0,   47,  149,  ..., 3734, 3750, 3818]]),
       values=tensor([5., 5., 5.,  ..., 4., 4., 5.]),
       size=(6040, 3883), nnz=999699, layout=torch.sparse_coo)

In [35]:
#@title Create Sparse Loss Function (MSE) (UNUSED)

#helper for MSE
def create_mask_from_indices(sp_tensor):
  indices = sp_tensor.indices()
  mask = torch.zeros(sp_tensor.shape, dtype=torch.bool)
  mask[indices[0, :], indices[1, :]] = True
  return mask

#MSE
def sparse_mean_square_loss(sparse_ratings, user_embeddings, movie_embeddings):
  #Get predictions from Matmul of embeddings
  #The embeddings are the learned output of our matrix factorization model.
  prediction_mask = create_mask_from_indices(sparse_ratings)
  predictions = torch.matmul(user_embeddings, movie_embeddings.transpose(0,-1)).masked_select(prediction_mask)
  return F.mse_loss(predictions, sparse_ratings.values())


In [36]:
#test if masked select works as I expected
#Check if our mask selects in the same order
# #of examples as the values in the sparse tensor
# selection_mask = create_mask_from_indices(sp_ratings)
# dense_ratings = sp_ratings.to_dense()
# selection = dense_ratings.masked_select(selection_mask)
# torch.sum(selection-sp_ratings.values()) == 0

#Training a Matrix Factorization Model

Since PyTorch and Tensorflow differ so much in how models are defined, we will have to change how we set up the CF model here quite a bit from the tutorial colab I am following.

In [37]:
#@title Model Definition
class CollaborativeFilteringModel(nn.Module):
  def __init__(self, num_queries, num_items, embedding_dim):
    super().__init__()
    self.embedding_dim = embedding_dim
    self.user_embedding = nn.Embedding(num_queries, embedding_dim)
    self.movie_embedding = nn.Embedding(num_items, embedding_dim)

  def forward(self, query, item):
    #we only need to compute product per user and item
    #unsqueeze so shape is (BATCH_SIZE, 1, EMBEDDING_DIM)
    query_embedding = self.user_embedding(query).unsqueeze(1)
    item_embedding = self.movie_embedding(item).unsqueeze(1)
    # print(f'{query_embedding=}')
    # print(f'{item_embedding=}')
    #batch matrix multiplication
    return torch.matmul(query_embedding, item_embedding.transpose(-2,-1)).squeeze()



In [38]:
#@title Trainer Class
class DummyScheduler():
  def __init__(self):
    self.lr = 0
  def step(epoch=0, value=0):
    return

class CFTrainer():
  def __init__(self, model, train_dataloader, test_dataloader, loss_fn, optimizer, scheduler=None, log_freq = 1):
    self.model = model
    self.train_dataloader = train_dataloader
    self.test_dataloader = test_dataloader
    self.loss_fn = loss_fn
    self.optimizer = optimizer
    self.scheduler = scheduler if scheduler is not None else DummyScheduler()
    self.train_losses = []
    self.test_losses = []
    self.log_freq = log_freq

  def train(self, num_epochs):
    for epoch in range(num_epochs):
      self.train_epoch(epoch)
      self.test_epoch(epoch)

  def train_epoch(self, epoch):
    running_loss = 0
    data_iter = tqdm.tqdm(
      enumerate(self.train_dataloader),
      desc=f"EPOCH_TRAIN:{epoch}",
      total=len(self.train_dataloader),
      bar_format="{l_bar}{bar}{r_bar}"
    )
    for i, batch in data_iter:
      self.model.train()
      X, y = (batch[0][0].to(DEVICE, dtype=torch.long), batch[0][1].to(DEVICE, dtype=torch.long)), batch[1].to(DEVICE, dtype=torch.float)

      #forward pass
      prediction = self.model(X[0], X[1])

      # print(f"{prediction=}")
      # print(f"{y=}")

      #calculate loss
      loss = self.loss_fn(prediction, y)

      #zero out gradient
      self.optimizer.zero_grad()

      #backward pass
      loss.backward()

      #update weights
      self.optimizer.step()

      running_loss+=loss.item()
    self.train_losses.append(running_loss/len(data_iter))
    post_fix = {
        "epoch_train": epoch,
        "iter": i,
        "avg_loss": self.train_losses[-1],
    }
    if i % self.log_freq == 0:
      data_iter.write(str(post_fix))


  def test_epoch(self, epoch):
    running_loss = 0
    data_iter = tqdm.tqdm(
      enumerate(self.test_dataloader),
      desc=f"EPOCH_TEST:{epoch}",
      total=len(self.test_dataloader),
      bar_format="{l_bar}{bar}{r_bar}"
    )
    for i, batch in data_iter:
      self.model.eval()
      X, y = (batch[0][0].to(DEVICE, dtype=torch.long), batch[0][1].to(DEVICE, dtype=torch.long)), batch[1].to(DEVICE, dtype=torch.float)

      with torch.no_grad():#disable gradient calculation
        prediction = self.model(X[0], X[1])

        loss = self.loss_fn(prediction, y)

        running_loss += loss.item()

    self.test_losses.append(running_loss/len(data_iter))
    #we will use ReduceLRonPlateau scheduler with validation loss
    self.scheduler.step(self.test_losses[-1])
    post_fix = {
        "epoch_test": epoch,
        "iter": i,
        "avg_loss": self.test_losses[-1],
    }
    if i % self.log_freq == 0:
      data_iter.write(str(post_fix)+'\n')




In [39]:
movielens.describe()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,age
count,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0
mean,3023.512,1864.54,3.581564,972243700.0,29.73831
std,1728.413,1096.041,1.117102,12152560.0,11.75198
min,0.0,0.0,1.0,956703900.0,1.0
25%,1505.0,1029.0,3.0,965302600.0,25.0
50%,3069.0,1834.0,4.0,973018000.0,25.0
75%,4475.0,2769.0,4.0,975220900.0,35.0
max,6039.0,3951.0,5.0,1046455000.0,56.0


In [40]:
print(max(movielens['movie_id']))
print(len(movielens['movie_id'].unique()))


3951
3706


In [41]:
print(len(movielens['rating']))
print(len(ratings['rating']))
print('---------------')
print(len(movielens['movie_id'].unique()))
print(len(ratings['movie_id'].unique()))
print('---------------')
print(max(movielens['movie_id']))
print(max(ratings['movie_id']))
print('---------------')
print(len(movielens['user_id'].unique()))
print(len(ratings['user_id'].unique()))
print('---------------')
print(max(movielens['user_id']))
print(max(ratings['user_id']))


1000209
1000209
---------------
3706
3706
---------------
3951
3951
---------------
6040
6040
---------------
6039
6039


In [42]:
BATCH_SIZE = 64
NUM_EPOCHS = 300
LR = 0.01
EMBEDDING_DIM = 8
MOMENTUM = 0.9

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

NUM_QUERIES = max(movielens.user_id)+1
NUM_ITEMS = max(movielens.movie_id)+1

model = CollaborativeFilteringModel(NUM_QUERIES, NUM_ITEMS, EMBEDDING_DIM).to(DEVICE)

loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=MOMENTUM)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True)

trainer = CFTrainer(model, train_dataloader, test_dataloader, loss_fn, optimizer, scheduler)

In [43]:
TRAIN_NEW_MODEL = False

if TRAIN_NEW_MODEL:
  trainer.train(NUM_EPOCHS)

In [44]:
HYPERPARAM_DICT = {
    'BATCH_SIZE' : BATCH_SIZE,
    'NUM_EPOCHS' : NUM_EPOCHS,
    'LR' : LR,
    'EMBEDDING_DIM' : EMBEDDING_DIM,
    'MOMENTUM' : MOMENTUM,
    'TRAIN_LOSSES' : trainer.train_losses,
    'TEST_LOSSES' : trainer.test_losses
}

In [45]:
PERSIST_LOC_DRIVE = '/content/drive/MyDrive/Colab Notebooks/Git Notebooks/recommendation-systems-files'

#UPDATE MODEL VERSION
MODEL_VERSION = '1_1'

#SAVE MODEL
MODEL_PATH = os.path.join(PERSIST_LOC_DRIVE, f'model_ml{ML_SIZE}_v{MODEL_VERSION}.pth')
OPTIM_PATH = os.path.join(PERSIST_LOC_DRIVE, f'optim_ml{ML_SIZE}_v{MODEL_VERSION}.pth')
SCHEDULER_PATH = os.path.join(PERSIST_LOC_DRIVE, f'scheduler_ml{ML_SIZE}_v{MODEL_VERSION}.pth')
HYPERPARAM_PATH = os.path.join(PERSIST_LOC_DRIVE, f'hyperparams_ml{ML_SIZE}_v{MODEL_VERSION}.json')

if TRAIN_NEW_MODEL:
  torch.save(trainer.model.state_dict(), MODEL_PATH)
  torch.save(trainer.optimizer.state_dict(), OPTIM_PATH)
  torch.save(trainer.scheduler.state_dict(), SCHEDULER_PATH)
  with open(HYPERPARAM_PATH, 'w') as f:
    json.dump(HYPERPARAM_DICT, f, indent=4)


After a few tests I learned that on the 100k dataset, the model with `EMBEDDING_DIM = 8` has `train_loss = 0.6091` with `test_loss = 1.6098` while the model with `EMBEDDING_DIM = 4` has `train_loss = 0.6891` and `test_loss = 1.2625` with all else being equal. This indicates that while both are overfitting, the choice of 8 latent features is overfitting far more on this dataset.

Perhaps more latent features would be useful when we work on a larger dataset.

In [48]:
#@title Load Model

#UPDATE MODEL VERSION
CHOOSE_VERSION = '1'

#SAVE MODEL
CHOOSE_MODEL_PATH = os.path.join(PERSIST_LOC_DRIVE, f'model_ml{ML_SIZE}_v{CHOOSE_VERSION}.pth')
CHOOSE_HYPERPARAM_PATH = os.path.join(PERSIST_LOC_DRIVE, f'hyperparams_ml{ML_SIZE}_v{CHOOSE_VERSION}.json')

model_hp = {}
with open(CHOOSE_HYPERPARAM_PATH, 'r') as f:
  model_hp = json.load(f)

model_hp['NUM_QUERIES'] = NUM_QUERIES
model_hp['NUM_ITEMS'] = NUM_ITEMS

loaded_model = CollaborativeFilteringModel(model_hp['NUM_QUERIES'], model_hp['NUM_ITEMS'], model_hp['EMBEDDING_DIM']).to(DEVICE)
loaded_model.load_state_dict(torch.load(CHOOSE_MODEL_PATH, map_location=torch.device(DEVICE)))
loaded_model.eval()



CollaborativeFilteringModel(
  (user_embedding): Embedding(6040, 8)
  (movie_embedding): Embedding(3952, 8)
)

#Similarity Scores and Inference

Now we create a function to infer the score a user would give a movie based on the embeddings we learned.

There are two ways to compute similarity:
- dot product: the score of item j is $\langle u, V_j \rangle$.
- cosine: the score of item j is $\frac{\langle u, V_j \rangle}{\|u\|\|V_j\|}$.

In [106]:
dot = 'dot'
cosine = 'cosine'

def compute_scores(query_embedding, item_embeddings, measure = dot):
  u = query_embedding
  V = item_embeddings
  if measure == cosine:
    u = u/torch.linalg.norm(u)
    V = V/torch.linalg.norm(V, dim = 0)
  scores = torch.matmul(u, V.transpose(0,-1))
  return scores



In [66]:
user_embeddings = loaded_model.state_dict()['user_embedding.weight']
user_embeddings

tensor([[ 1.2855,  0.6378,  0.0209,  ..., -1.2941,  0.8058,  0.7680],
        [ 0.7280,  1.5683,  1.0441,  ..., -0.9626, -0.8971, -0.2082],
        [ 0.5603,  2.2786,  0.3552,  ..., -1.1104, -0.7154,  0.5382],
        ...,
        [ 0.7379,  0.4430, -1.0387,  ..., -1.5558,  0.6545,  1.8255],
        [ 0.5253,  1.3903,  0.0396,  ..., -0.8550,  0.0321,  0.5688],
        [ 0.1291,  1.0161,  0.0436,  ..., -1.6876,  0.6001, -1.2039]])

In [65]:
movie_embeddings = loaded_model.state_dict()['movie_embedding.weight']
movie_embeddings

tensor([[ 1.3713,  0.5763,  1.0702,  ..., -1.6291, -0.0270,  0.2540],
        [ 1.0396,  0.3996,  0.7620,  ..., -1.1655, -0.2196,  0.5768],
        [ 1.0549,  0.5294,  1.1182,  ..., -0.8763, -0.1840,  0.4400],
        ...,
        [ 1.0174,  0.5763,  0.1680,  ..., -1.7936, -0.1162,  0.2742],
        [ 1.9244, -0.0039,  1.9682,  ..., -0.7860,  0.1677, -1.0445],
        [ 1.0383,  0.8426,  0.9882,  ..., -1.4601,  0.0818,  0.2383]])

In [108]:
test_user_scores = compute_scores(user_embeddings[4168,:], movie_embeddings, dot)
print(test_user_scores)
cos_test_user_scores = compute_scores(user_embeddings[4168,:], movie_embeddings, cosine)
print(cos_test_user_scores)


tensor([4.4605, 3.0760, 2.9029,  ..., 4.0559, 4.4248, 4.0174])
tensor([0.0363, 0.0240, 0.0234,  ..., 0.0327, 0.0380, 0.0336])


In [110]:
movies_ratings.sort_values(by = 'rating count', ascending = False).head()

Unnamed: 0,movie_id,title,genres,genre,rating count,rating mean
2651,2857,American Beauty (1999),Comedy|Drama,Comedy,3428,4.317386
253,259,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi,Adventure,2991,4.453694
1106,1195,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War,Drama,2990,4.292977
1120,1209,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War,Sci-Fi,2883,4.022893
466,479,Jurassic Park (1993),Action|Adventure|Sci-Fi,Sci-Fi,2672,3.763847


In [117]:
#SANITY CHECK TO MAKE SURE THE MOVIE ID LINES UP WITH PREDICTION INDEX CORRECTLY
print(f'SOUTHPARK: {torch.mean(torch.matmul(user_embeddings, movie_embeddings[2699, :].transpose(0, -1)))}')
print(f'WWW: {torch.mean(torch.matmul(user_embeddings, movie_embeddings[2700, :].transpose(0, -1)))}')
print(f'SUMMEROFSAM: {torch.mean(torch.matmul(user_embeddings, movie_embeddings[2701, :].transpose(0, -1)))}')


SOUTHPARK: 3.616091251373291
WWW: 2.0203795433044434
SUMMEROFSAM: 3.1637587547302246


In [115]:
movies_ratings[movies_ratings['movie_id'].between(2695, 2705)].head(10)

Unnamed: 0,movie_id,title,genres,genre,rating count,rating mean
2492,2695,"Dinner Game, The (Le Dîner de cons) (1998)",Comedy,Comedy,81,3.691358
2493,2696,My Son the Fanatic (1998),Comedy|Drama|Romance,Romance,48,3.729167
2494,2698,Arachnophobia (1990),Action|Comedy|Sci-Fi|Thriller,Comedy,1367,3.002926
2495,2699,"South Park: Bigger, Longer and Uncut (1999)",Animation|Comedy,Comedy,1269,3.760441
2496,2700,Wild Wild West (1999),Action|Sci-Fi|Western,Action,902,2.158537
2497,2701,Summer of Sam (1999),Drama,Drama,395,3.070886
2498,2702,Broken Vessels (1998),Drama,Drama,1,3.0
2499,2703,"Lovers on the Bridge, The (Les Amants du Pont-...",Drama|Romance,Romance,41,3.073171
2500,2704,"Late August, Early September (Fin août, début ...",Drama,Drama,7,2.714286
2501,2705,American Pie (1999),Comedy,Comedy,1389,3.709863


In [114]:
movies_ratings[movies_ratings['rating count']>500].sort_values(by = 'rating mean', ascending = True).head(10)


Unnamed: 0,movie_id,title,genres,genre,rating count,rating mean
2496,2700,Wild Wild West (1999),Action|Sci-Fi|Western,Action,902,2.158537
154,159,Congo (1995),Action|Adventure|Mystery|Sci-Fi,Mystery,565,2.238938
1435,1561,Batman & Robin (1997),Action|Adventure|Crime,Action,606,2.257426
167,172,Judge Dredd (1995),Action|Adventure|Sci-Fi,Sci-Fi,564,2.308511
2440,2641,Superman III (1983),Action|Adventure|Sci-Fi,Sci-Fi,511,2.336595
796,848,Escape from L.A. (1996),Action|Adventure|Sci-Fi|Thriller,Sci-Fi,511,2.510763
1653,1830,Lost in Space (1998),Action|Sci-Fi|Thriller,Action,667,2.584708
3126,3353,Mission to Mars (2000),Sci-Fi,Sci-Fi,793,2.595208
421,434,Coneheads (1993),Comedy|Sci-Fi,Sci-Fi,533,2.606004
651,672,Space Jam (1996),Adventure|Animation|Children's|Comedy|Fantasy,Animation,563,2.619893


In [118]:
movies_ratings.sort_values(by = 'rating count', ascending = False).head(10)

Unnamed: 0,movie_id,title,genres,genre,rating count,rating mean
2651,2857,American Beauty (1999),Comedy|Drama,Comedy,3428,4.317386
253,259,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi,Adventure,2991,4.453694
1106,1195,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War,Drama,2990,4.292977
1120,1209,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War,Sci-Fi,2883,4.022893
466,479,Jurassic Park (1993),Action|Adventure|Sci-Fi,Sci-Fi,2672,3.763847
1848,2027,Saving Private Ryan (1998),Action|Drama|War,War,2653,4.337354
575,588,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller,Sci-Fi,2649,4.058513
2374,2570,"Matrix, The (1999)",Action|Sci-Fi|Thriller,Sci-Fi,2590,4.31583
1178,1269,Back to the Future (1985),Comedy|Sci-Fi,Sci-Fi,2583,3.990321
579,592,"Silence of the Lambs, The (1991)",Drama|Thriller,Thriller,2578,4.351823


In [126]:
def get_similar_movies(movie_embeddings, movie_index, measure = dot, k=10):
  #get k nearest neighbors of movie
  queried_movie = movie_embeddings[movie_index, :]
  scores = compute_scores(queried_movie, movie_embeddings, measure)
  df = movies.sort_values('movie_id', ascending = True)
  scores_list = []
  for movie_id in df['movie_id'].values:
    scores_list.append(scores[movie_id])
  df['score'] = scores_list
  df = df.sort_values(by = 'score', ascending = False)
  return df.head(k)

dot_new_hope = get_similar_movies(movie_embeddings, 259, dot)
cos_new_hope = get_similar_movies(movie_embeddings, 259, cosine)

In [128]:
dot_new_hope.head(10)

Unnamed: 0,movie_id,title,genres,genre,score
1298,1317,Blue Juice (1995),Comedy|Drama,Drama,tensor(8.2552)
1101,1116,"Eighth Day, The (Le Huitième jour ) (1996)",Drama,Drama,tensor(7.7248)
2434,2502,"Apple, The (Sib) (1998)",Drama,Drama,tensor(7.4729)
315,317,"Shawshank Redemption, The (1994)",Drama,Drama,tensor(7.4463)
523,526,Schindler's List (1993),Drama|War,Drama,tensor(7.4110)
3023,3091,Chushingura (1962),Drama,Drama,tensor(7.4110)
1180,1197,Raiders of the Lost Ark (1981),Action|Adventure,Action,tensor(7.3829)
1365,1385,Terror in a Texas Town (1958),Western,Western,tensor(7.3766)
1810,1878,"Hanging Garden, The (1997)",Drama,Drama,tensor(7.3653)
847,857,"Godfather, The (1972)",Action|Crime|Drama,Drama,tensor(7.3346)


In [129]:
dot_new_hope.head(10)

Unnamed: 0,movie_id,title,genres,genre,score
1298,1317,Blue Juice (1995),Comedy|Drama,Drama,tensor(8.2552)
1101,1116,"Eighth Day, The (Le Huitième jour ) (1996)",Drama,Drama,tensor(7.7248)
2434,2502,"Apple, The (Sib) (1998)",Drama,Drama,tensor(7.4729)
315,317,"Shawshank Redemption, The (1994)",Drama,Drama,tensor(7.4463)
523,526,Schindler's List (1993),Drama|War,Drama,tensor(7.4110)
3023,3091,Chushingura (1962),Drama,Drama,tensor(7.4110)
1180,1197,Raiders of the Lost Ark (1981),Action|Adventure,Action,tensor(7.3829)
1365,1385,Terror in a Texas Town (1958),Western,Western,tensor(7.3766)
1810,1878,"Hanging Garden, The (1997)",Drama,Drama,tensor(7.3653)
847,857,"Godfather, The (1972)",Action|Crime|Drama,Drama,tensor(7.3346)


In [131]:
cos_new_hope.head(10)

Unnamed: 0,movie_id,title,genres,genre,score
1298,1317,Blue Juice (1995),Comedy|Drama,Drama,tensor(0.0481)
3023,3091,Chushingura (1962),Drama,Drama,tensor(0.0465)
1101,1116,"Eighth Day, The (Le Huitième jour ) (1996)",Drama,Drama,tensor(0.0463)
3386,3454,Buddy Boy (1999),Drama|Thriller,Drama,tensor(0.0456)
2128,2196,Firelight (1997),Drama,Drama,tensor(0.0447)
1810,1878,"Hanging Garden, The (1997)",Drama,Drama,tensor(0.0427)
119,120,"Boys of St. Vincent, The (1993)",Drama,Drama,tensor(0.0423)
315,317,"Shawshank Redemption, The (1994)",Drama,Drama,tensor(0.0414)
1180,1197,Raiders of the Lost Ark (1981),Action|Adventure,Action,tensor(0.0411)
523,526,Schindler's List (1993),Drama|War,Drama,tensor(0.0411)


In [134]:
dot_congo = get_similar_movies(movie_embeddings, 159, dot)
cos_congo = get_similar_movies(movie_embeddings, 159, cosine)

In [135]:
dot_congo.head(10)

Unnamed: 0,movie_id,title,genres,genre,score
3341,3409,Soft Fruit (1999),Comedy|Drama,Comedy,tensor(5.1868)
643,648,Cold Fever (Á köldum klaka) (1994),Comedy|Drama,Drama,tensor(4.9755)
594,597,Window to Paris (1994),Comedy,Comedy,tensor(4.5925)
1497,1533,"Bonheur, Le (1965)",Drama,Drama,tensor(4.4926)
989,1001,Ed's Next Move (1996),Comedy,Comedy,tensor(4.4516)
1613,1658,Hurricane Streets (1998),Drama,Drama,tensor(4.4245)
2510,2578,Following (1998),Drama,Drama,tensor(4.4171)
499,502,"New Age, The (1994)",Drama,Drama,tensor(4.3827)
2943,3011,Battling Butler (1926),Comedy,Comedy,tensor(4.2258)
2604,2672,Eternity and a Day (Mia eoniotita ke mia mera ...,Drama,Drama,tensor(4.2032)


In [136]:
cos_congo.head(10)

Unnamed: 0,movie_id,title,genres,genre,score
594,597,Window to Paris (1994),Comedy,Comedy,tensor(0.0537)
1497,1533,"Bonheur, Le (1965)",Drama,Drama,tensor(0.0537)
3341,3409,Soft Fruit (1999),Comedy|Drama,Comedy,tensor(0.0535)
643,648,Cold Fever (Á köldum klaka) (1994),Comedy|Drama,Drama,tensor(0.0529)
1476,1509,"Brother's Kiss, A (1997)",Drama,Drama,tensor(0.0521)
1613,1658,Hurricane Streets (1998),Drama,Drama,tensor(0.0521)
2418,2486,"Blood, Guts, Bullets and Octane (1998)",Action|Comedy,Comedy,tensor(0.0498)
989,1001,Ed's Next Move (1996),Comedy,Comedy,tensor(0.0490)
3055,3123,Agnes Browne (1999),Comedy|Drama,Drama,tensor(0.0478)
717,725,Last Dance (1996),Drama,Drama,tensor(0.0455)
