<a href="https://colab.research.google.com/github/immimimi/task-1-and-task-3/blob/main/Recomendation_algorithm_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Using the MovieLens movie-ratings dataset and PyTorch to develop a machine-learning recommendation algorithm.
# The same code can be used on Rahber data to produce course recommendations for users.

# Data Citation:
# F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on
# Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19.

# downloading the data

! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  1719k      0 --:--:-- --:--:-- --:--:-- 1721k


In [None]:
# Unpack the downloaded zip archive into the 'data' directory

import zipfile

with zipfile.ZipFile('ml-latest-small.zip', 'r') as archive:
    archive.extractall('data')

# Read the extracted CSV files into dataframes

import pandas as pd

csv_paths = ('data/ml-latest-small/movies.csv', 'data/ml-latest-small/ratings.csv')
movies_df, ratings_df = map(pd.read_csv, csv_paths)



In [None]:
# Sanity-check the format of the freshly loaded dataframes (worth doing after every import).
# Display the first rows of movies_df.
movies_df.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Display the first rows of ratings_df.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# The ratings will be modelled with matrix factorisation implemented in PyTorch.
# Print the shape of each dataframe to see how much data there is to work with.

print('The dimensions of movies dataframe are:', movies_df.shape,'\nThe dimensions of ratings dataframe are:', ratings_df.shape)


The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [None]:
# Lookup table: movieId -> movie title
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Count the distinct users and movies that actually appear in the ratings
n_users = ratings_df['userId'].unique().size
n_items = ratings_df['movieId'].unique().size

# Size of the dense user x movie matrix versus the number of ratings we really have
matrix_cells = n_users*n_items
rating_count = len(ratings_df)
fill_percent = rating_count / matrix_cells * 100

print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", matrix_cells, 'elements.')
print('----------')
print("Number of ratings:", rating_count)
print("Therefore: ", fill_percent, '% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, therefore we don't need all the data")


Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
One advantage here is that matrix factorization can realize the rating matrix implicitly, therefore we don't need all the data


In [None]:
#import libraries

import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm


#Defining the matrix-factorisation model that will learn the user/item relationships.
#Each user and each item gets a learned embedding (latent-factor) vector; the predicted
#rating for a (user, item) pair is the dot product of the two vectors.

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorisation rating model built from two embedding tables.

    Every user index and every item index owns a vector of ``n_factors``
    learnable weights; the score of a (user, item) pair is the dot product
    of the two vectors.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: length of each embedding vector (default 20).
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # Lookup table: user index -> latent factor vector
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        # Lookup table: item index -> latent factor vector
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small positive weights; these are what training updates each epoch.
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def _score(self, users, items):
        """Return the row-wise dot product of the user and item embeddings."""
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: integer tensor whose column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one score per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        return self._score(users, items)

    def predict(self, user, item):
        """Score user/item indices given as two separate arguments.

        The previous implementation forwarded both arguments to ``forward``,
        which accepts a single tensor, so every call raised ``TypeError``.

        Args:
            user: a user index, or a sequence/tensor of user indices.
            item: an item index, or a sequence/tensor of item indices
                (paired element-wise with ``user``).

        Returns:
            1-D tensor with one score per (user, item) pair.
        """
        device = self.user_factors.weight.device
        # reshape(-1) lets a bare scalar index be treated as a batch of one.
        users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        return self._score(users, items)


In [None]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

class Loader(Dataset):
    """Torch dataset of (user index, movie index) -> rating samples.

    The raw ``userId`` / ``movieId`` values are re-numbered to contiguous
    indices starting at 0 (in order of first appearance) so they can be used
    directly as row numbers of an embedding table.

    Args:
        ratings: dataframe with ``userId``, ``movieId`` and ``rating``
            columns. When omitted, the notebook-level ``ratings_df`` is used,
            which keeps the original ``Loader()`` call working.
    """

    def __init__(self, ratings=None):
        if ratings is None:
            # Backward-compatible default: fall back to the global dataframe.
            ratings = ratings_df
        self.ratings = ratings.copy()

        # Distinct raw IDs, in order of first appearance
        users = ratings.userId.unique()
        movies = ratings.movieId.unique()

        # Raw ID -> contiguous index
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        # Contiguous index -> raw ID (used to translate model output back)
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Replace the raw IDs in the working copy with their contiguous indices.
        self.ratings['movieId'] = ratings.movieId.apply(lambda x: self.movieid2idx[x])
        self.ratings['userId'] = ratings.userId.apply(lambda x: self.userid2idx[x])

        # Select the two feature columns explicitly (user first, movie second)
        # so that any additional column in the dataframe cannot leak into the
        # model input or change the column order the model relies on.
        self.x = self.ratings[['userId', 'movieId']].values
        self.y = self.ratings['rating'].values
        # Convert to tensors so the samples are ready for torch models.
        self.x, self.y = torch.tensor(self.x), torch.tensor(self.y)

    def __getitem__(self, index):
        """Return the ``(features, rating)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating rows in the dataset."""
        return len(self.ratings)

In [None]:

num_epochs = 128  # number of full passes over the ratings; adjust if the model under- or over-trains
cuda = torch.cuda.is_available()

# Seed PyTorch's random number generator so that the embedding initialisation
# and the DataLoader shuffling are repeatable after "Restart & Run All"
# (some GPU kernels may still be non-deterministic).
torch.manual_seed(0)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
# Show the initial (untrained) value of every trainable parameter
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# Move the model to the GPU when one is available
if cuda:
    model = model.cuda()

# Mean squared error between predicted and actual ratings; this is the
# quantity that training minimises.
loss_fn = torch.nn.MSELoss()

# Adam optimizer: updates the model parameters from the gradients at each training step
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training data, served in shuffled mini-batches of 128 ratings
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0142, 0.0267, 0.0215,  ..., 0.0055, 0.0454, 0.0440],
        [0.0107, 0.0184, 0.0154,  ..., 0.0273, 0.0423, 0.0431],
        [0.0061, 0.0292, 0.0188,  ..., 0.0297, 0.0185, 0.0090],
        ...,
        [0.0025, 0.0368, 0.0489,  ..., 0.0425, 0.0069, 0.0358],
        [0.0402, 0.0071, 0.0487,  ..., 0.0177, 0.0473, 0.0329],
        [0.0280, 0.0073, 0.0142,  ..., 0.0323, 0.0123, 0.0306]])
item_factors.weight tensor([[0.0068, 0.0375, 0.0307,  ..., 0.0042, 0.0479, 0.0277],
        [0.0272, 0.0230, 0.0421,  ..., 0.0139, 0.0101, 0.0426],
        [0.0403, 0.0248, 0.0333,  ..., 0.0352, 0.0471, 0.0448],
        ...,
        [0.0474, 0.0121, 0.0310,  ..., 0.0310, 0.0231, 0.0152],
        [0.0041, 0.0251, 0.0491,  ..., 0.0172, 0.0032, 0.0270],
        [0.0287, 0.0335, 0.0238,  ..., 0.0455, 0.0206, 0.0408]])


In [None]:
# Train the model; this will take a moment to run.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        if cuda:
            x, y = x.cuda(), y.cuda()  # Move data to GPU if CUDA is enabled
        optimizer.zero_grad()  # Clear previously calculated gradients
        outputs = model(x)  # Forward pass: one predicted rating per row of x
        # The model output is already 1-D, so it is compared to y directly.
        # Calling .squeeze() here would collapse a final batch of size 1 into
        # a 0-d tensor, which no longer matches the shape of y.
        loss = loss_fn(outputs, y.type(torch.float32))  # Compute loss
        losses.append(loss.item())  # Record the batch loss
        loss.backward()  # Backpropagation: compute gradients
        optimizer.step()  # Update model parameters

    # Guard against an empty loader before averaging
    if losses:
        print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))  # Average loss for the epoch
    else:
        print("iter #{}".format(it), "No losses recorded in this epoch.")



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.055385371755222
iter #1 Loss: 4.737995774310252
iter #2 Loss: 2.473314007678008
iter #3 Loss: 1.7209328219370188
iter #4 Loss: 1.3453763327017654
iter #5 Loss: 1.1284210289341545
iter #6 Loss: 0.9912913434850383
iter #7 Loss: 0.900222719320791
iter #8 Loss: 0.8369399449093088
iter #9 Loss: 0.7918590947153604
iter #10 Loss: 0.7592092315633285
iter #11 Loss: 0.7345767186831702
iter #12 Loss: 0.7159432862615828
iter #13 Loss: 0.7016813089777976
iter #14 Loss: 0.6901029193310568
iter #15 Loss: 0.6820010738400033
iter #16 Loss: 0.6748707560718362
iter #17 Loss: 0.6695959423928697
iter #18 Loss: 0.6658409313880248
iter #19 Loss: 0.6625275436391685
iter #20 Loss: 0.6605830411544911
iter #21 Loss: 0.6589723232055679
iter #22 Loss: 0.6577203534732615
iter #23 Loss: 0.6567120375442626
iter #24 Loss: 0.6556656974069963
iter #25 Loss: 0.6549801850681982
iter #26 Loss: 0.6543584877086169
iter #27 Loss: 0.6534271276163571
iter #28 Loss: 0.6526853532582371
iter #29 Loss: 0.6513053869

In [None]:

# After training, the model holds tuned latent factors for users and movies.
# Print every trainable parameter; keep the first one in `uw` and the latest
# of the remaining ones in `iw`.
c, uw, iw = 0, 0, 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c != 0:
        iw = param.data
        continue
    uw = param.data
    c += 1

user_factors.weight tensor([[ 0.7871,  1.3896,  1.8377,  ...,  0.8372,  1.5639,  1.3763],
        [ 1.7979,  0.4963,  1.2147,  ...,  1.4682,  1.1638,  1.1264],
        [ 1.7107,  1.5883,  1.1283,  ...,  1.7203, -0.3196, -0.2947],
        ...,
        [ 1.9852,  0.4762,  1.4789,  ...,  2.0947, -0.2450, -0.1023],
        [ 1.4072,  0.9287,  0.8699,  ...,  0.6310,  1.3615,  0.6978],
        [ 1.4331,  1.4944,  0.6663,  ...,  1.1634,  0.6251,  1.9096]])
item_factors.weight tensor([[0.3049, 0.5450, 0.1585,  ..., 0.3556, 0.4413, 0.7579],
        [0.4258, 0.1578, 0.8344,  ..., 0.2054, 0.7114, 0.6593],
        [0.6574, 0.3850, 0.2720,  ..., 0.6546, 0.0927, 0.5907],
        ...,
        [0.3773, 0.3419, 0.3595,  ..., 0.3613, 0.3523, 0.3448],
        [0.3950, 0.4169, 0.4404,  ..., 0.4087, 0.3959, 0.4182],
        [0.4097, 0.4143, 0.4057,  ..., 0.4259, 0.4036, 0.4207]])


In [None]:
# Copy the learned movie embedding table to the CPU as a NumPy array
trained_movie_embeddings = (
    model.item_factors.weight.data
    .cpu()
    .numpy()
)
trained_movie_embeddings.shape[0]  # number of movies that have a factor vector


9724

In [None]:
from sklearn.cluster import KMeans

# Group the movies into 10 clusters according to their learned embedding vectors
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(trained_movie_embeddings)



In [None]:
# Movies that land in the same cluster tend to have similar genres, which
# suggests the model has picked up real patterns in the ratings.
# For every cluster, print its 10 most-rated movies.

# Count the ratings per movie once, instead of filtering the whole ratings
# dataframe again for every single movie.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Translate the embedding row number back to the original movieId
        movid = train_set.idx2movieid[movidx]
        movs.append((movie_names[movid], rating_counts[movid]))
    # Most-rated first; keep only the top 10 titles
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])


Cluster #0
	 Pulp Fiction (1994)
	 Fight Club (1999)
	 Usual Suspects, The (1995)
	 American Beauty (1999)
	 Seven (a.k.a. Se7en) (1995)
	 Godfather, The (1972)
	 Fargo (1996)
	 Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
	 Memento (2000)
	 Alien (1979)
Cluster #1
	 Batman & Robin (1997)
	 Little Nicky (2000)
	 Fantastic Four: Rise of the Silver Surfer (2007)
	 Pete's Dragon (1977)
	 Karate Kid, Part III, The (1989)
	 Last Man Standing (1996)
	 Mimic (1997)
	 Rambo III (1988)
	 Dukes of Hazzard, The (2005)
	 Problem Child (1990)
Cluster #2
	 Waterworld (1995)
	 Interview with the Vampire: The Vampire Chronicles (1994)
	 Donnie Darko (2001)
	 O Brother, Where Art Thou? (2000)
	 Natural Born Killers (1994)
	 Game, The (1997)
	 Unbreakable (2000)
	 Tombstone (1993)
	 Austin Powers in Goldmember (2002)
	 District 9 (2009)
Cluster #3
	 Charlie's Angels (2000)
	 Grease (1978)
	 Dead Man Walking (1995)
	 Sense and Sensibility (1995)
	 Legally Blonde (2001)
	 Mr. & Mrs. Smith (2005)
	 Beverly Hi