
Lesson: https://youtu.be/p4ZZq0736Po?feature=shared&t=3770

Dataset: https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset

Dataset citation:
>F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages. DOI=http://dx.doi.org/10.1145/2827872

In [4]:
!pip install kagglehub
!pip install fastai

[0m

In [19]:
import numpy as np
import pandas as pd
import torch
import kagglehub

# Homebrew
#from kaggle_util import KaggleUtil

In [36]:
from fastai.collab import (
    untar_data, CollabDataLoaders,
    one_hot
)

from fastai.tabular.all import (
    URLs, set_seed
)

In [21]:


# This 20m dataset is too large for our purpose. Let's use the dataset from fastai.
#path = kagglehub.dataset_download("grouplens/movielens-20m-dataset")
#print("Path to dataset files:", path)

#ratings = pd.read_csv(path, #header=None,
#                        #names=['user_id','movie_id','rating','timestamp'],
#                        low_memory=False
#                     )
#display(ratings.head())

In [22]:
# JH uses untar_data from fastai
path = untar_data(URLs.ML_100k)
ratings = pd.read_csv(path/'u.data', delimiter='\t', header=None,
                        names=['userId','movieId','rating','timestamp']
                     )
display(ratings.head())

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


If we rearrange into a cross-tab format, it could look like this.

In [23]:
cross = pd.crosstab(ratings['userId'], ratings['movieId'],values=ratings['rating'], aggfunc=np.mean)
display(cross.head())
print(cross.shape)

  cross = pd.crosstab(ratings['userId'], ratings['movieId'],values=ratings['rating'], aggfunc=np.mean)


movieId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


(943, 1682)


There are a lot of NaN's. Let's see what we can do about those...

In [24]:
movies = pd.read_csv(path/'u.item', delimiter='|', encoding='latin-1',
                     usecols=(0,1),names=('movie','title'), header=None
                    )
display(movies.head())

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [26]:
drop_cols = ['movie','timestamp']
mrg_ratings = ratings.merge(movies, left_on='movieId', right_on='movie').drop(drop_cols,axis=1)
display(mrg_ratings.head())

Unnamed: 0,userId,movieId,rating,title
0,196,242,3,Kolya (1996)
1,186,302,3,L.A. Confidential (1997)
2,22,377,1,Heavyweights (1994)
3,244,51,2,Legends of the Fall (1994)
4,166,346,1,Jackie Brown (1997)


In [31]:
#Create the data loader
dls = CollabDataLoaders.from_df(mrg_ratings, item_name='title', bs=64)
dls.show_batch()

Unnamed: 0,userId,title,rating
0,796,Grease (1978),5
1,710,Snow White and the Seven Dwarfs (1937),4
2,648,Tales From the Crypt Presents: Demon Knight (1995),1
3,173,In & Out (1997),5
4,222,Tales from the Crypt Presents: Bordello of Blood (1996),3
5,209,"Full Monty, The (1997)",2
6,115,Star Trek III: The Search for Spock (1984),3
7,345,Indiana Jones and the Last Crusade (1989),4
8,881,Barb Wire (1996),1
9,256,"Program, The (1993)",4


In [48]:
n_users = len(dls.classes['userId'])
n_movies = len(dls.classes['title'])
n_factors = 5

user_factors = torch.randn(n_users, n_factors)
print(user_factors.t().shape)
print(user_factors.t())
movie_factors = torch.randn(n_movies, n_factors)

torch.Size([5, 944])
tensor([[-2.7299,  0.8703,  0.1197,  ..., -0.7043,  0.7329,  0.9216],
        [-0.4825, -2.7075,  0.4175,  ..., -0.2253, -1.0185, -1.3266],
        [ 3.5403,  1.2361,  0.5858,  ...,  0.5949, -0.5532,  0.0861],
        [ 0.4696, -2.7921,  0.7853,  ..., -1.7375,  1.0445,  1.4808],
        [-0.5206,  1.2285, -0.2528,  ..., -1.7355, -0.2933,  0.1073]])


In [47]:
#Creates a 1D tensor/vector that has all 0's except for at user inex 3.
one_hot_3 = one_hot(3, n_users).float()
print(one_hot_3.shape)
print(one_hot_3[:5])

torch.Size([944])
tensor([0., 0., 0., 1., 0.])


In [44]:
print(user_factors.t().shape)
user_factors.t() @ one_hot_3

torch.Size([5, 944])


tensor([ 0.9638, -0.4569,  0.7485, -1.8326,  1.2591])