<a href="https://colab.research.google.com/github/jayasakthyasri/CODSOFT/blob/main/recommandationsys_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Download the MovieLens "ml-latest-small" archive from GroupLens and save it
# as ml-latest-small.zip in the current working directory.
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  3993k      0 --:--:-- --:--:-- --:--:-- 3996k


In [3]:
import zipfile
# Unpack every member of the downloaded archive under ./data; the context
# manager closes the zip handle once extraction finishes.
with zipfile.ZipFile(file='ml-latest-small.zip', mode='r') as zip_ref:
    zip_ref.extractall(path='data')


In [4]:
import pandas as pd
# Read the two extracted CSV files into DataFrames:
#   movies_df  - one row per movie (movieId, title, genres)
#   ratings_df - one row per rating (userId, movieId, rating, timestamp)
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

In [5]:
# Report (rows, columns) for both frames in a single print call; the second
# label carries its own leading newline so the two shapes land on separate lines.
print(
    'The dimensions of movies dataframe are:', movies_df.shape,
    '\nThe dimensions of ratings dataframe are:', ratings_df.shape,
)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [6]:
# Preview the first rows of the movie metadata (movieId, title, genres).
movies_df.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Preview the first rows of the ratings table (userId, movieId, rating, timestamp).
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
# Lookup from raw MovieLens movieId to its title string.
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Distinct users / movies that actually appear in the ratings table. These are
# counted from `unique()` so they match the number of ids found in ratings_df.
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())
print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_users*n_items, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))
# Density = observed ratings / cells of the dense users x movies matrix, as a percentage.
print("Therefore: ", len(ratings_df) / (n_users*n_items) * 100, '% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here.")
# The dense matrix has n_users * n_items cells, so it grows quadratically
# (the previous message claimed "n*2", i.e. linear growth, which is wrong).
print("And... as you can imagine, as the number of users and products grow, the number of elements will increase as n_users * n_items (roughly n^2)")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")


Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [9]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Dot-product matrix factorization over user and item embeddings.

    Each user and each item gets an ``n_factors``-dimensional embedding; the
    score for a (user, item) pair is the dot product of the two vectors.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        """Create the embedding tables.

        Args:
            n_users: number of rows in the user embedding table.
            n_items: number of rows in the item embedding table.
            n_factors: embedding dimensionality (default 20).
        """
        super().__init__()

        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small non-negative weights drawn from U(0, 0.05).
        torch.nn.init.uniform_(self.user_factors.weight, 0, 0.05)
        torch.nn.init.uniform_(self.item_factors.weight, 0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one dot-product score per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item index pairs given as two separate arguments.

        ``forward`` takes a single two-column tensor, so the indices are
        paired up here before delegating (previously this passed two
        positional arguments to ``forward`` and always raised ``TypeError``).

        Args:
            user: user index or sequence/tensor of user indices.
            item: item index or sequence/tensor of item indices, same length
                as ``user``.

        Returns:
            1-D tensor of scores, one per (user, item) pair.
        """
        # Build the index tensors on the same device as the embedding tables.
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        return self.forward(torch.stack((users, items), dim=1))

In [10]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader


class Loader(Dataset):
    """Torch ``Dataset`` of (user index, movie index) -> rating samples.

    Raw ``userId`` / ``movieId`` values are re-mapped to contiguous indices
    starting at 0 (in order of first appearance) so they can be used directly
    as embedding row indices.

    Attributes set by ``__init__``:
        ratings: copy of the input frame with ``userId`` / ``movieId``
            replaced by their contiguous indices.
        userid2idx / movieid2idx: raw id -> contiguous index.
        idx2userid / idx2movieid: contiguous index -> raw id.
        x: integer tensor with columns (user index, movie index).
        y: tensor of ratings, aligned row-for-row with ``x``.
    """

    def __init__(self, ratings=None):
        """Build the index mappings and tensors.

        Args:
            ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
                columns. When omitted, the notebook-level ``ratings_df`` is
                used, so existing ``Loader()`` calls keep working.
        """
        source = ratings_df if ratings is None else ratings
        # Work on a copy so the caller's frame keeps its raw ids.
        self.ratings = source.copy()

        users = source.userId.unique()
        movies = source.movieId.unique()

        # Raw id -> contiguous index, numbered in order of first appearance.
        self.userid2idx = {raw_id: idx for idx, raw_id in enumerate(users)}
        self.movieid2idx = {raw_id: idx for idx, raw_id in enumerate(movies)}

        # Inverse lookups: contiguous index -> raw id.
        self.idx2userid = {idx: raw_id for raw_id, idx in self.userid2idx.items()}
        self.idx2movieid = {idx: raw_id for raw_id, idx in self.movieid2idx.items()}

        # Every id is a key of its mapping by construction, so map() is total here.
        self.ratings['movieId'] = source['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = source['userId'].map(self.userid2idx)

        # Select the feature columns explicitly so the (user, movie) column
        # order does not depend on what other columns the frame carries.
        features = self.ratings[['userId', 'movieId']].to_numpy(dtype='int64')
        targets = self.ratings['rating'].values
        self.x, self.y = torch.tensor(features), torch.tensor(targets)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.ratings)

In [11]:
# Number of full passes over the training data.
num_epochs = 128
# True when a CUDA device is available; later cells use this flag to decide
# whether tensors must be moved to the GPU.
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Seed torch's global RNG before the model is built so the embedding
# initialisation and the DataLoader shuffle order are repeatable across
# Restart & Run All. (Some GPU kernels may still be non-deterministic.)
torch.manual_seed(0)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
# Show the freshly initialised trainable parameters for later comparison
# with the trained values.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

if cuda:
    model = model.cuda()

# Mean squared error between predicted and observed ratings.
loss_fn = torch.nn.MSELoss()

# Adam over all model parameters with learning rate 1e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Mini-batches of 128 rating rows, reshuffled every epoch.
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[5.9619e-03, 2.4307e-02, 2.5913e-03,  ..., 1.9321e-02, 2.6121e-03,
         1.9555e-03],
        [7.6853e-04, 4.4067e-02, 3.2622e-03,  ..., 4.1562e-02, 8.7005e-03,
         2.7697e-02],
        [1.4773e-02, 1.6656e-02, 8.6524e-03,  ..., 1.8550e-03, 2.0694e-02,
         1.6189e-02],
        ...,
        [3.1160e-02, 1.6392e-02, 3.5141e-02,  ..., 3.1347e-02, 3.6314e-05,
         4.1648e-02],
        [2.7364e-02, 3.4864e-02, 4.7419e-02,  ..., 4.2681e-02, 4.2680e-02,
         1.3725e-02],
        [3.4623e-02, 4.5111e-02, 2.0628e-02,  ..., 4.9898e-02, 4.9511e-02,
         1.5713e-02]])
item_factors.weight tensor([[0.0003, 0.0320, 0.0022,  ..., 0.0164, 0.0119, 0.0198],
        [0.0242, 0.0014, 0.0496,  ..., 0.0390, 0.0121, 0.0002],
        [0.0277, 0.0416, 0.0311,  ..., 0.0136, 0.0117, 0.0124],
        ...,
        [0.0008, 0.0192, 0.0267,  ..., 

In [12]:
# Train for `num_epochs` passes over `train_loader`, printing the mean batch
# loss after each epoch.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA. The optimisation step
        # below must run on CPU as well; previously it was nested under
        # `if cuda:`, so a CPU-only run trained nothing and then divided by
        # zero when averaging the empty `losses` list.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        # Targets are cast to float32 before being compared with the model output.
        loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.062632897783658
iter #1 Loss: 4.737518822178623
iter #2 Loss: 2.4739736106795105
iter #3 Loss: 1.7203692054839304
iter #4 Loss: 1.34570810403013
iter #5 Loss: 1.1284301715151308
iter #6 Loss: 0.9912182977659448
iter #7 Loss: 0.9001993850100464
iter #8 Loss: 0.8372648847360297
iter #9 Loss: 0.7922510234308122
iter #10 Loss: 0.7591829693256901
iter #11 Loss: 0.7347679245971181
iter #12 Loss: 0.7162315335294922
iter #13 Loss: 0.7015601460855019
iter #14 Loss: 0.6903763844957812
iter #15 Loss: 0.681617126171359
iter #16 Loss: 0.6750618439867412
iter #17 Loss: 0.669634731363524
iter #18 Loss: 0.6656045874741476
iter #19 Loss: 0.6629414274759099
iter #20 Loss: 0.6607442573469302
iter #21 Loss: 0.6585850946340465
iter #22 Loss: 0.6575899763685192
iter #23 Loss: 0.6565454744445491
iter #24 Loss: 0.6556722618223447
iter #25 Loss: 0.6548405090337477
iter #26 Loss: 0.6542119691668428
iter #27 Loss: 0.6532892077253555
iter #28 Loss: 0.6521375618383364
iter #29 Loss: 0.650976763650

In [13]:
# Print every trainable parameter and keep references to their tensors:
# `uw` receives the first trainable parameter, `iw` the last of the others.
# `c` flips to 1 once the first one has been captured.
c, uw, iw = 0, 0, 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c != 0:
        iw = param.data
        continue
    uw = param.data
    c += 1

user_factors.weight tensor([[ 1.2653,  0.8060,  0.9261,  ...,  1.6514,  0.9113,  1.1798],
        [ 0.8791,  1.8652,  0.2571,  ...,  1.2364,  0.9748,  0.8785],
        [ 1.8504,  0.3993, -2.3619,  ..., -2.1394,  1.6398,  2.7899],
        ...,
        [ 1.4762, -0.1439,  1.6496,  ..., -0.5759,  1.0871,  1.7793],
        [ 0.8149,  1.0893,  1.0578,  ...,  0.9644,  1.2281,  0.4832],
        [ 2.1073,  1.0026,  0.4079,  ...,  1.2451,  1.5510,  0.8707]],
       device='cuda:0')
item_factors.weight tensor([[0.5904, 0.7759, 0.2783,  ..., 0.7226, 0.4538, 0.3702],
        [0.3130, 0.7415, 0.3736,  ..., 0.3599, 0.1153, 0.2413],
        [0.8142, 0.5379, 0.4930,  ..., 0.3877, 0.4305, 0.5831],
        ...,
        [0.3221, 0.3400, 0.3269,  ..., 0.3227, 0.3227, 0.3278],
        [0.4173, 0.4020, 0.3849,  ..., 0.3895, 0.3767, 0.3859],
        [0.3794, 0.3979, 0.3840,  ..., 0.3698, 0.4010, 0.4047]],
       device='cuda:0')


In [14]:
# Take the learned item (movie) factor matrix without autograd tracking,
# bring it to host memory, and expose it as a NumPy array.
item_weights = model.item_factors.weight.detach()
trained_movie_embeddings = item_weights.cpu().numpy()


In [15]:
# Number of rows in the embedding matrix (one per movie index).
len(trained_movie_embeddings)

9724

In [17]:
from sklearn.cluster import KMeans
# Cluster the movie embeddings into 10 groups; random_state is fixed so the
# centroid initialisation is repeatable. `fit` returns the estimator itself,
# so fitting in a separate statement leaves `kmeans` as the fitted model.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(trained_movie_embeddings)




In [18]:
# It can be seen here that the movies that are in the same cluster tend to have
# similar genres. Also note that the algorithm is unfamiliar with the movie name
# and only obtained the relationships by looking at the numbers representing how
# users have responded to the movie selections.

# Count ratings per movie once, instead of filtering the whole ratings table
# again for every single movie inside the loop.
rating_counts = ratings_df['movieId'].value_counts()

for cluster in range(kmeans.n_clusters):
  print("Cluster #{}".format(cluster))
  movs = []
  # Row positions in kmeans.labels_ are embedding indices; translate each back
  # to its raw movieId to look up the title and rating count.
  for movidx in np.where(kmeans.labels_ == cluster)[0]:
    movid = train_set.idx2movieid[movidx]
    movs.append((movie_names[movid], rating_counts.loc[movid]))
  # Show the 10 most-rated movies of the cluster.
  for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", mov[0])


Cluster #0
	 Forrest Gump (1994)
	 Star Wars: Episode IV - A New Hope (1977)
	 Schindler's List (1993)
	 Fugitive, The (1993)
	 Back to the Future (1985)
	 Groundhog Day (1993)
	 One Flew Over the Cuckoo's Nest (1975)
	 Beautiful Mind, A (2001)
	 E.T. the Extra-Terrestrial (1982)
	 Ghostbusters (a.k.a. Ghost Busters) (1984)
Cluster #1
	 Jurassic Park (1993)
	 Terminator 2: Judgment Day (1991)
	 Toy Story (1995)
	 Apollo 13 (1995)
	 Batman (1989)
	 Aladdin (1992)
	 True Lies (1994)
	 Lion King, The (1994)
	 Shrek (2001)
	 Men in Black (a.k.a. MIB) (1997)
Cluster #2
	 Dances with Wolves (1990)
	 Star Wars: Episode I - The Phantom Menace (1999)
	 Titanic (1997)
	 Batman Forever (1995)
	 Pretty Woman (1990)
	 Ghost (1990)
	 Sleepless in Seattle (1993)
	 Firm, The (1993)
	 V for Vendetta (2006)
	 While You Were Sleeping (1995)
Cluster #3
	 Spider-Man (2002)
	 Austin Powers: The Spy Who Shagged Me (1999)
	 Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone