In [2]:
# Data Citation:
# F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on
# Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>

! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  1974k      0 --:--:-- --:--:-- --:--:-- 1977k


In [3]:
import zipfile
# Unpack the downloaded archive into ./data; the context manager closes the file handle.
with zipfile.ZipFile(file='ml-latest-small.zip', mode='r') as archive:
    archive.extractall(path='data')

In [4]:
# import the dataset
import pandas as pd
# Read the movie catalogue and the ratings table from the extracted archive.
movies_df, ratings_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ml-latest-small/movies.csv',
        'data/ml-latest-small/ratings.csv',
    )
)

In [5]:
# Report (rows, columns) for both tables, one per line.
print(
    'The dimensions of movies dataframe are:', movies_df.shape,
    '\nThe dimensions of ratings dataframe are:', ratings_df.shape,
)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [6]:
# Preview the first five rows of movies_df
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Preview the first five rows of ratings_df
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
# Movie ID to movie name mapping
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Size of the full (users x movies) rating matrix versus the number of ratings we actually have.
n_users = ratings_df['userId'].nunique()
n_items = ratings_df['movieId'].nunique()
n_cells = n_users * n_items

print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_cells, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))
print("Therefore: ", len(ratings_df) / n_cells * 100, '% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here.")
# The matrix has one cell per (user, movie) pair, so its size is the product of the two counts.
print("And... as you can imagine, as the number of users and products grow, the number of elements grows with their product (n_users * n_items)")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")

Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [9]:
# Work on a deep copy so ratings_df keeps the raw MovieLens IDs.
ratings = ratings_df.copy(deep=True)

In [10]:
# Distinct raw IDs present in the ratings table.
users = ratings_df['userId'].unique()
movies = ratings_df['movieId'].unique()

# --- Contiguous 0-based indices for users and movies ---

# raw ID -> index
userid2idx = dict(zip(users, range(len(users))))
movieid2idx = dict(zip(movies, range(len(movies))))
# index -> raw ID (inverse lookups)
idx2userid = {idx: raw_id for raw_id, idx in userid2idx.items()}
idx2movieid = {idx: raw_id for raw_id, idx in movieid2idx.items()}

# Replace the raw IDs in the working copy with their indices.
ratings['movieId'] = ratings_df['movieId'].map(movieid2idx)
ratings['userId'] = ratings_df['userId'].map(userid2idx)

# Inputs: every column except rating/timestamp; target: the rating itself.
x = ratings.drop(columns=['rating', 'timestamp']).to_numpy()
y = ratings['rating'].to_numpy()

In [11]:
# Display x: the ratings table with 'rating' and 'timestamp' dropped, i.e. the remapped userId / movieId columns
x

array([[   0,    0],
       [   0,    1],
       [   0,    2],
       ...,
       [ 609, 3121],
       [ 609, 1392],
       [ 609, 2873]])

In [27]:
import torch
import numpy as np
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization recommender.

    A predicted rating is the dot product of a learned user vector and a
    learned item vector.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: length of every latent vector.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # Embedding layers act as lookup tables: integer index -> latent vector.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Overwrite the default initialisation with small values in [0, 0.05).
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: integer tensor read as ``data[:, 0]`` (user indices) and
                ``data[:, 1]`` (item indices).

        Returns:
            Tensor with one predicted rating per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        # Row-wise dot product of the looked-up user and item vectors.
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Predict ratings for paired user and item indices.

        ``forward`` takes a single two-column tensor, so the two index
        arguments are paired up into that layout before delegating.

        Args:
            user: user index or indices (int, sequence, or tensor).
            item: item index or indices, same length as ``user``.

        Returns:
            Tensor with one predicted rating per (user, item) pair.
        """
        device = self.user_factors.weight.device
        user = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        item = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        return self.forward(torch.stack((user, item), dim=1))

In [25]:
import torch

class RecommendationModel(torch.nn.Module):
    """Neural recommender: user and movie embeddings fed through a small MLP.

    Args:
        num_users: number of rows in the user embedding table.
        num_movies: number of rows in the movie embedding table.
        n_factors: length of each embedding vector.
    """

    def __init__(self, num_users, num_movies, n_factors=20):
        super().__init__()

        self.user_embeddings = torch.nn.Embedding(num_users, n_factors)
        self.movie_embeddings = torch.nn.Embedding(num_movies, n_factors)

        # forward() concatenates one user and one movie vector, so the first
        # layer must accept 2 * n_factors inputs (256 only when n_factors=128).
        self.fc1 = torch.nn.Linear(2 * n_factors, 64)
        self.fc2 = torch.nn.Linear(64, 1)

    def forward(self, data):
        """Score a batch of (user index, movie index) pairs.

        Args:
            data: integer tensor read as ``data[:, 0]`` (user indices) and
                ``data[:, 1]`` (movie indices).

        Returns:
            Output of the final linear layer, one row (of width 1) per input row.
        """
        user_ids, movie_ids = data[:, 0], data[:, 1]
        user_embeddings = self.user_embeddings(user_ids)
        movie_embeddings = self.movie_embeddings(movie_ids)
        x = torch.cat([user_embeddings, movie_embeddings], dim=1)

        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)

        return x

In [14]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

# Note: This isn't 'good' practice, in a MLops sense but we'll roll with this since the data is already loaded in memory.
class Loader(Dataset):
    """Torch dataset of (user index, movie index) -> rating samples.

    Raw user/movie IDs are remapped to contiguous 0-based indices so they can
    be used directly as embedding-table rows.

    Args:
        ratings: DataFrame with 'userId', 'movieId', 'rating' and 'timestamp'
            columns. Defaults to the notebook-level ``ratings_df`` so that
            existing ``Loader()`` calls keep working.

    Attributes:
        userid2idx / movieid2idx: raw ID -> index.
        idx2userid / idx2movieid: index -> raw ID.
        x: tensor of [user index, movie index] rows.
        y: tensor of ratings, aligned with ``x``.
    """

    def __init__(self, ratings=None):
        if ratings is None:
            ratings = ratings_df

        # Extract all user IDs and movie IDs
        users = ratings['userId'].unique()
        movies = ratings['movieId'].unique()

        # --- Producing new continuous IDs for users and movies ---
        # raw ID -> index
        self.userid2idx = {raw_id: idx for idx, raw_id in enumerate(users)}
        self.movieid2idx = {raw_id: idx for idx, raw_id in enumerate(movies)}
        # index -> raw ID
        self.idx2userid = {idx: raw_id for raw_id, idx in self.userid2idx.items()}
        self.idx2movieid = {idx: raw_id for raw_id, idx in self.movieid2idx.items()}

        # assign() returns a new frame, so the caller's DataFrame is not modified.
        self.ratings = ratings.assign(
            movieId=ratings['movieId'].map(self.movieid2idx),
            userId=ratings['userId'].map(self.userid2idx),
        )

        # Convert to tensors (ready for torch models).
        self.x = torch.tensor(self.ratings.drop(['rating', 'timestamp'], axis=1).values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the (indices, rating) pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Number of rating rows in the dataset."""
        return len(self.ratings)

In [29]:
# --- Training setup for the matrix-factorization model ---
num_epochs_mat = 128

cuda = torch.cuda.is_available()
print("Is running on GPU:", cuda)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)

# Dump the freshly initialised trainable parameters.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Use the GPU when one is available.
model = model.cuda() if cuda else model

# Mean-squared-error loss, optimised with Adam.
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training data, served in shuffled mini-batches of 128.
train_set = Loader()
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

Is running on GPU: False
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0158, 0.0297, 0.0390,  ..., 0.0448, 0.0181, 0.0045],
        [0.0279, 0.0323, 0.0402,  ..., 0.0115, 0.0241, 0.0114],
        [0.0496, 0.0434, 0.0053,  ..., 0.0294, 0.0498, 0.0221],
        ...,
        [0.0357, 0.0305, 0.0289,  ..., 0.0282, 0.0269, 0.0007],
        [0.0208, 0.0403, 0.0282,  ..., 0.0072, 0.0148, 0.0099],
        [0.0192, 0.0165, 0.0461,  ..., 0.0445, 0.0208, 0.0163]])
item_factors.weight tensor([[0.0006, 0.0119, 0.0303,  ..., 0.0152, 0.0081, 0.0160],
        [0.0169, 0.0355, 0.0022,  ..., 0.0408, 0.0235, 0.0072],
        [0.0217, 0.0104, 0.0352,  ..., 0.0109, 0.0171, 0.0455],
        ...,
        [0.0490, 0.0422, 0.0076,  ..., 0.0172, 0.0474, 0.0421],
        [0.0082, 0.0431, 0.0165,  ..., 0.0341, 0.0099, 0.0280],
        [0.0371, 0.0002, 0.0059,  ..., 0.0302, 0.0385, 0.0296]])


In [24]:
# --- Training setup for the neural recommendation model ---
num_epochs = 64
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model_nn = RecommendationModel(n_users, n_items, n_factors=128)
print(model_nn)
for name, param in model_nn.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# GPU enable if you have a GPU...
# It is model_nn that must be moved here, not the matrix-factorization `model`.
if cuda:
    model_nn = model_nn.cuda()

# MSE loss
loss_fn = torch.nn.MSELoss()

# ADAM optimizer, bound to model_nn's parameters
optimizer = torch.optim.Adam(model_nn.parameters(), lr=1e-3)

# Train data
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

Is running on GPU: False
RecommendationModel(
  (user_embeddings): Embedding(610, 128)
  (movie_embeddings): Embedding(9724, 128)
  (fc1): Linear(in_features=256, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)
user_embeddings.weight tensor([[-0.6135,  1.7210, -0.1599,  ..., -1.6421,  0.2061, -0.7526],
        [ 1.4370,  0.3737,  0.6136,  ..., -0.2904, -0.0839, -1.4177],
        [ 1.7600,  0.1670,  0.4755,  ..., -1.5579, -0.3314,  1.5254],
        ...,
        [ 1.9546,  1.3759, -0.3824,  ...,  0.1184,  0.6724, -0.2157],
        [-0.2388,  1.0188, -0.3917,  ...,  1.1143,  0.8725, -1.4506],
        [-0.3727, -1.2597, -0.6588,  ...,  0.8843,  0.3818,  0.2300]])
movie_embeddings.weight tensor([[-0.0846, -2.0020,  0.4100,  ..., -1.0920,  0.8406,  0.1410],
        [ 1.7086,  1.1986,  0.4740,  ..., -0.1609, -0.3793, -1.6882],
        [-0.6377,  0.5295, -0.2706,  ...,  0.2033,  0.2353,  1.5232],
        ...,
        [-1.0248, -1.1748,  0.4837,  ...,  0

In [26]:
# Both setup cells assign the same global name `optimizer`; make sure the one
# currently in scope really updates model_nn before spending time on training.
tracked_params = {id(p) for group in optimizer.param_groups for p in group['params']}
if any(id(p) not in tracked_params for p in model_nn.parameters()):
    raise RuntimeError(
        "`optimizer` is not tracking model_nn's parameters; "
        "re-run the RecommendationModel setup cell first."
    )

# Batches must live on the same device as the model (matters when it was moved to the GPU).
device = next(model_nn.parameters()).device

for it in tqdm(range(num_epochs)):
    losses = []
    # batch_x / batch_y: distinct names so the notebook-level x and y arrays are not overwritten.
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device=device, dtype=torch.float32)
        optimizer.zero_grad()
        outputs = model_nn(batch_x)
        # squeeze() drops the trailing size-1 dimension so predictions line up with the targets.
        loss = loss_fn(outputs.squeeze(), batch_y)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    # Mean mini-batch loss for this epoch.
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/64 [00:00<?, ?it/s]

iter #0 Loss: 1.5107601568632318
iter #1 Loss: 0.8198215696714857
iter #2 Loss: 0.7418240353767642
iter #3 Loss: 0.689140333099111
iter #4 Loss: 0.6477761969045939
iter #5 Loss: 0.6121000149074545
iter #6 Loss: 0.580831232787994
iter #7 Loss: 0.550378552132149
iter #8 Loss: 0.5226917406024062
iter #9 Loss: 0.496018806538606
iter #10 Loss: 0.4704350588966142
iter #11 Loss: 0.4467457464806319
iter #12 Loss: 0.423398500960793
iter #13 Loss: 0.4008859511332464
iter #14 Loss: 0.379377096853583
iter #15 Loss: 0.3609100922299218
iter #16 Loss: 0.3432810819201966
iter #17 Loss: 0.32604242057591526
iter #18 Loss: 0.310706269676764
iter #19 Loss: 0.29682040524664266
iter #20 Loss: 0.2830753729579412
iter #21 Loss: 0.27110685748497243
iter #22 Loss: 0.25952147619638044
iter #23 Loss: 0.2486833576018435
iter #24 Loss: 0.23888929094646486
iter #25 Loss: 0.2300562516308678
iter #26 Loss: 0.22070442608127436
iter #27 Loss: 0.21258966704159218
iter #28 Loss: 0.20469606504184643
iter #29 Loss: 0.197670

In [30]:
# Both setup cells assign the same global name `optimizer`; make sure the one
# currently in scope really updates `model` before spending time on training.
tracked_params = {id(p) for group in optimizer.param_groups for p in group['params']}
if any(id(p) not in tracked_params for p in model.parameters()):
    raise RuntimeError(
        "`optimizer` is not tracking model's parameters; "
        "re-run the MatrixFactorization setup cell first."
    )

# Batches must live on the same device as the model (matters when it was moved to the GPU).
device = next(model.parameters()).device

for it in tqdm(range(num_epochs_mat)):
    losses = []
    # batch_x / batch_y: distinct names so the notebook-level x and y arrays are not overwritten.
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device=device, dtype=torch.float32)
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = loss_fn(outputs.squeeze(), batch_y)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    # Mean mini-batch loss for this epoch.
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs_mat)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.059344142221557
iter #1 Loss: 4.740892121331946
iter #2 Loss: 2.4738204520063354
iter #3 Loss: 1.7207432414977077
iter #4 Loss: 1.3452331547053333
iter #5 Loss: 1.128278166736443
iter #6 Loss: 0.9913586689736033
iter #7 Loss: 0.9004022415519366
iter #8 Loss: 0.8369399686603982
iter #9 Loss: 0.7923291433356741
iter #10 Loss: 0.7594801195835704
iter #11 Loss: 0.7348270007423338
iter #12 Loss: 0.7158640117058294
iter #13 Loss: 0.7012862272295856
iter #14 Loss: 0.6902240023470773
iter #15 Loss: 0.6815961546023485
iter #16 Loss: 0.6746116018839899
iter #17 Loss: 0.669692274201945
iter #18 Loss: 0.6657433548781473
iter #19 Loss: 0.6628578049579853
iter #20 Loss: 0.6605175505510441
iter #21 Loss: 0.6588585228862496
iter #22 Loss: 0.6576252376216317
iter #23 Loss: 0.6565248373574412
iter #24 Loss: 0.6557389958407068
iter #25 Loss: 0.6550901041645084
iter #26 Loss: 0.6543130490002294
iter #27 Loss: 0.6532966546752126
iter #28 Loss: 0.6524081878780108
iter #29 Loss: 0.6510153767

In [31]:
# By training the model, we will have tuned latent factors for movies and users.
for name, param in model_nn.named_parameters():
    if param.requires_grad:
        print(name, param.data)

# Read the two embedding tables by attribute. model_nn also has fc1/fc2 weights
# and biases, so picking "the first parameter" and "whatever comes after it"
# would leave iw holding the last layer's bias instead of the movie embeddings.
uw = model_nn.user_embeddings.weight.data
iw = model_nn.movie_embeddings.weight.data

user_embeddings.weight tensor([[-8.3067e-01,  1.6122e+00, -3.9605e-01,  ..., -1.7000e+00,
          7.7312e-02, -7.6347e-01],
        [ 1.4475e+00,  1.1785e-01,  8.6269e-01,  ..., -4.8676e-01,
         -1.8523e-01, -2.1808e+00],
        [ 1.6910e+00,  2.9468e-01, -7.0035e-01,  ..., -7.9876e-01,
          2.5797e-01,  1.8464e+00],
        ...,
        [ 2.1090e+00,  7.7049e-01, -2.3833e-01,  ...,  3.9782e-01,
          1.0109e+00, -1.1517e-01],
        [ 3.8170e-01,  7.6467e-01,  1.6965e-03,  ...,  3.8055e-01,
          5.7299e-02, -1.2576e+00],
        [-3.2200e-01, -1.1270e+00, -1.4185e-01,  ...,  8.8795e-01,
          7.0850e-01,  2.1795e-01]])
movie_embeddings.weight tensor([[ 0.2433, -1.5735,  1.0338,  ..., -0.0927,  0.6372, -0.0042],
        [ 1.9606,  1.6241,  0.7624,  ...,  0.0933,  0.2626, -1.5506],
        [-0.0500,  1.4229, -0.4544,  ...,  0.8390, -0.7589,  0.9859],
        ...,
        [-1.1774, -1.0355,  0.2971,  ...,  0.7542,  0.9402,  0.4258],
        [-1.3996,  0.0591, -

In [32]:
# By training the model, we will have tuned latent factors for movies and users.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

# Read the two embedding tables by attribute rather than by their position in
# named_parameters(), so the result does not depend on parameter order or count.
uw = model.user_factors.weight.data
iw = model.item_factors.weight.data

user_factors.weight tensor([[ 1.2445e+00,  1.0308e+00,  1.1761e+00,  ...,  5.3574e-01,
          1.5689e+00,  2.0006e+00],
        [ 1.1768e+00,  1.2718e+00,  1.6569e+00,  ...,  3.6115e-05,
          2.6779e-02,  1.4169e+00],
        [ 1.2685e+00,  2.4203e+00,  1.0166e+00,  ...,  1.5312e+00,
         -1.5462e+00, -3.9144e-01],
        ...,
        [ 1.0708e+00,  1.5447e+00,  7.0781e-01,  ...,  8.0158e-01,
          1.7697e+00, -8.7455e-01],
        [ 1.2381e+00,  1.2381e+00,  5.7194e-01,  ...,  3.7518e-01,
          1.0772e+00,  3.1328e-01],
        [ 1.0549e+00,  1.3010e+00,  2.0052e+00,  ...,  1.3620e+00,
          3.9774e-01,  1.4048e+00]])
item_factors.weight tensor([[0.5112, 0.2716, 0.5045,  ..., 0.6818, 0.2675, 0.9253],
        [0.2294, 0.2747, 0.3002,  ..., 0.1982, 0.3663, 0.7980],
        [0.4945, 0.3911, 0.8212,  ..., 0.7234, 0.2587, 0.5113],
        ...,
        [0.3760, 0.3700, 0.3357,  ..., 0.3447, 0.3674, 0.3694],
        [0.3744, 0.4078, 0.3813,  ..., 0.3992, 0.3705, 0.39

In [33]:
# Movie embedding table of the neural model as a NumPy array (detached from autograd, on the CPU).
trained_movie_embeddings = model_nn.movie_embeddings.weight.detach().cpu().numpy()

In [34]:
# Item factor table of the matrix-factorization model as a NumPy array (detached from autograd, on the CPU).
trained_movie_embeddings_mf = model.item_factors.weight.detach().cpu().numpy()

In [20]:
trained_movie_embeddings.shape[0]  # number of rows: one embedding vector per movie index

9724

In [36]:
from sklearn.cluster import KMeans
# Group the movies into 10 clusters using the neural model's embedding vectors.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans = kmeans.fit(trained_movie_embeddings)



In [37]:
from sklearn.cluster import KMeans
# Group the movies into 10 clusters using the matrix-factorization item factors.
kmeans_mf = KMeans(n_clusters=10, random_state=0)
kmeans_mf = kmeans_mf.fit(trained_movie_embeddings_mf)



In [38]:
# It can be seen here that the movies that are in the same cluster tend to have
# similar genres. Also note that the algorithm is unfamiliar with the movie name
# and only obtained the relationships by looking at the numbers representing how
# users have responded to the movie selections.

# Number of ratings per raw movieId, computed once instead of re-filtering
# ratings_df for every single movie.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Embedding row index -> raw MovieLens ID -> (title, rating count)
        movid = train_set.idx2movieid[movidx]
        movs.append((movie_names[movid], rating_counts[movid]))
    # Show the ten most-rated movies of the cluster.
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])

Cluster #0
	 Forrest Gump (1994)
	 Star Wars: Episode IV - A New Hope (1977)
	 Lord of the Rings: The Fellowship of the Ring, The (2001)
	 Godfather, The (1972)
	 Gladiator (2000)
	 Mask, The (1994)
	 Dark Knight, The (2008)
	 Mrs. Doubtfire (1993)
	 Princess Bride, The (1987)
	 E.T. the Extra-Terrestrial (1982)
Cluster #1
	 Fight Club (1999)
	 Apollo 13 (1995)
	 Saving Private Ryan (1998)
	 Lord of the Rings: The Return of the King, The (2003)
	 Sixth Sense, The (1999)
	 Lion King, The (1994)
	 Alien (1979)
	 Beauty and the Beast (1991)
	 Die Hard (1988)
	 Good Will Hunting (1997)
Cluster #2
	 Jurassic Park (1993)
	 Toy Story (1995)
	 Usual Suspects, The (1995)
	 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
	 Fugitive, The (1993)
	 Aladdin (1992)
	 True Lies (1994)
	 Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
	 Back to the Future (1985)
	 Mission: Impossible (1996)
Cluster #3
	 Silence of the Lambs, The (1991)
	 Matrix, The (1999)
	 Seven (a.k.a. Se7en

In [41]:
# Number of ratings per raw movieId, computed once instead of re-filtering
# ratings_df for every single movie.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

for cluster in range(kmeans_mf.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    for movidx in np.where(kmeans_mf.labels_ == cluster)[0]:
        # Embedding row index -> raw MovieLens ID -> (title, rating count)
        movid = train_set.idx2movieid[movidx]
        movs.append((movie_names[movid], rating_counts[movid]))
    # Show the ten most-rated movies of the cluster.
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])

Cluster #0
	 Ace Ventura: Pet Detective (1994)
	 Mask, The (1994)
	 Dumb & Dumber (Dumb and Dumber) (1994)
	 Happy Gilmore (1996)
	 Ace Ventura: When Nature Calls (1995)
	 Nutty Professor, The (1996)
	 Face/Off (1997)
	 RoboCop (1987)
	 Robin Hood: Men in Tights (1993)
	 Legends of the Fall (1994)
Cluster #1
	 Free Willy (1993)
	 Godzilla (1998)
	 Super Mario Bros. (1993)
	 Battlefield Earth (2000)
	 Rocky V (1990)
	 Superman IV: The Quest for Peace (1987)
	 Karate Kid, Part III, The (1989)
	 Rambo III (1988)
	 Problem Child (1990)
	 Flintstones in Viva Rock Vegas, The (2000)
Cluster #2
	 Forrest Gump (1994)
	 Star Wars: Episode IV - A New Hope (1977)
	 Terminator 2: Judgment Day (1991)
	 Schindler's List (1993)
	 Toy Story (1995)
	 Apollo 13 (1995)
	 Fugitive, The (1993)
	 Aladdin (1992)
	 Lion King, The (1994)
	 Back to the Future (1985)
Cluster #3
	 Shawshank Redemption, The (1994)
	 Silence of the Lambs, The (1991)
	 Matrix, The (1999)
	 Braveheart (1995)
	 Star Wars: Episode V - T