#**PROBLEM STATEMENT**



>Here we use the **KMeans clustering** ML algorithm to find movie recommendations from the MovieLens ratings dataset, by clustering movie embeddings learned with matrix factorization.

#**LOADING THE DATASET**

In [None]:
#curl is a command-line tool for transferring data to and from a server; the leading "!" runs it as a shell command from the notebook.
#Here it downloads the MovieLens "ml-latest-small" dataset as a zip archive.

!curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  955k  100  955k    0     0  3411k      0 --:--:-- --:--:-- --:--:-- 3411k


#**Extracting the dataset from the zip file**

In [None]:
# The dataset was downloaded as a ZIP archive (several files compressed into one);
# the standard-library zipfile module is used to read it.
import zipfile

# Opening the archive in a with-block guarantees the file handle is closed
# as soon as the block exits.
with zipfile.ZipFile('ml-latest-small.zip', mode='r') as zip_ref:
    # Unpack every member of the archive into the 'data' directory.
    zip_ref.extractall(path='data')

#**Reading the dataset**

In [None]:
# pandas supplies the DataFrame type used for all tabular data in this notebook.
import pandas as pd

# Read the movie catalogue and the ratings table from the extracted CSV files
# into two DataFrames.
movies_df, ratings_df = (
    pd.read_csv('data/ml-latest-small/movies.csv'),
    pd.read_csv('data/ml-latest-small/ratings.csv'),
)

#**Finding the dimensions of movies and ratings dataframe**


In [None]:
# .shape is a (rows, columns) tuple; print it for both DataFrames on separate lines.
print('The dimensions of movies dataframe are:', movies_df.shape, '\nThe dimenisions of rating dataframe are:', ratings_df.shape)


The dimensions of movies dataframe are: (9742, 3) 
The dimenisions of rating dataframe are: (100836, 4)


In [None]:
# Preview the movies table: head() with no argument shows the first five rows.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Preview the ratings table: head() with no argument shows the first five rows.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Build a movieId -> title lookup dictionary: index the movies table by movieId,
# take the title column and convert it to a plain dict.
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Count the distinct users and movies that appear in the ratings table.
# These two numbers are the dimensions of the (users x movies) rating matrix.
n_users = ratings_df['userId'].nunique()
n_items = ratings_df['movieId'].nunique()

print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_users*n_items, 'elements.')
print("----------")
print("Number of ratings:", len(ratings_df))
# Fill ratio = observed ratings / total cells of the dense matrix, as a percentage.
print("Therefore: ", len(ratings_df) / (n_users* n_items) * 100,'% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here")
# The dense matrix has n_users * n_items cells, so it grows with the product of
# both counts (not by "n*2" as the message previously stated).
print("And... as you can imagine, as the number of users and products grow, the number of elements grows as n_users * n_items (roughly quadratically)")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")






Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [None]:
#Torch is an open-source machine learning library, a scientific computing framework, and a script language 
import torch
#NumPy is a Python library used for working with arrays. 
#Array is a collection of common type of data structures having elements with same data type.
import numpy as np
#The from keyword is used to import only a specified section from a module.
#torch.autograd is PyTorch's automatic differentiation package for tensor operations.
from torch.autograd import variable
#tqdm is a library in Python which is used for creating Progress Meters or Progress Bars.
from tqdm import tqdm_notebook as tqdm

#torch.nn.Module. It is a base class used to develop all neural network models.
class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization model: a rating is the dot product of two embeddings.

    Each user and each item gets a learned vector of ``n_factors`` values; the
    predicted rating for a (user, item) pair is the sum of the element-wise
    product of the two vectors.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: length of each embedding vector (default 20).
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # One lookup table per entity type: row i holds the latent factors of
        # user (or item) index i.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small positive values drawn uniformly from [0, 0.05).
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Return predicted ratings for a batch of (user index, item index) rows.

        Args:
            data: 2-D integer tensor; column 0 holds user indices and column 1
                holds item indices.

        Returns:
            1-D tensor with one predicted rating per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        # Element-wise product summed over the factor axis == row-wise dot product.
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Return predicted ratings for the given user and item indices.

        ``forward`` takes a single (N, 2) tensor, so the two arguments are
        paired up into that layout first. (Previously this method passed two
        positional arguments to ``forward`` and always raised ``TypeError``.)

        Args:
            user: user index or 1-D sequence/tensor of user indices.
            item: item index or 1-D sequence/tensor of item indices, matching
                ``user`` in length.

        Returns:
            1-D tensor with one predicted rating per (user, item) pair.
        """
        # Build the index tensors on the same device as the embedding weights.
        device = self.user_factors.weight.device
        user_idx = torch.as_tensor(user, dtype=torch.long, device=device)
        item_idx = torch.as_tensor(item, dtype=torch.long, device=device)
        # reshape(-1, 2) turns a single scalar pair into a one-row batch.
        pairs = torch.stack((user_idx, item_idx), dim=-1).reshape(-1, 2)
        return self.forward(pairs)


In [None]:

#creating the data loader necessary for PyTorch
#torch.utils.data provides the Dataset and DataLoader classes used to batch and shuffle training samples
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

class Loader(Dataset):
    """Dataset of (user index, movie index) -> rating samples.

    Raw ``userId`` / ``movieId`` values are remapped to contiguous indices
    starting at 0 (in order of first appearance) so they can be used to index
    embedding tables.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
            When omitted, the notebook-level ``ratings_df`` is used, which
            matches the previous no-argument behaviour.

    Attributes set by ``__init__``:
        ratings: copy of the input with ``userId``/``movieId`` replaced by indices.
        userid2idx / movieid2idx: raw id -> contiguous index.
        idx2userid / idx2movieid: contiguous index -> raw id.
        x: tensor of [user index, movie index] rows.
        y: tensor of ratings, aligned with ``x``.
    """

    def __init__(self, ratings=None):
        if ratings is None:
            # Fall back to the global ratings table loaded earlier in the notebook.
            ratings = ratings_df
        self.ratings = ratings.copy()

        users = ratings['userId'].unique()
        movies = ratings['movieId'].unique()

        # Forward maps: raw id -> position in order of first appearance.
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        # Reverse maps, used to translate model indices back to raw ids.
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Replace the raw ids in the working copy with their contiguous indices.
        self.ratings['movieId'] = ratings['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = ratings['userId'].map(self.userid2idx)

        # Select the feature columns by name so that column 0 is always the
        # user index and column 1 the movie index, regardless of the frame's
        # column order or any extra columns it carries.
        self.x = torch.tensor(self.ratings[['userId', 'movieId']].values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(features, rating)`` pair at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating samples."""
        return len(self.ratings)






In [None]:
# An epoch is one full pass over the training set; train for this many passes.
num_epochs = 128
# Use the GPU through CUDA when one is available.
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Seed PyTorch's random number generator so that the embedding initialisation
# and the DataLoader's shuffling order are repeatable between runs.
SEED = 0
torch.manual_seed(SEED)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
# Show the initial values of every trainable parameter.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

if cuda:
    model = model.cuda()
# Mean squared error between predicted and actual ratings.
loss_fn = torch.nn.MSELoss()
# Adam optimizer over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Wrap the ratings in a Dataset and serve them in shuffled batches of 128.
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)


Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0497, 0.0034, 0.0020,  ..., 0.0179, 0.0268, 0.0247],
        [0.0018, 0.0382, 0.0072,  ..., 0.0305, 0.0304, 0.0485],
        [0.0078, 0.0196, 0.0479,  ..., 0.0058, 0.0112, 0.0350],
        ...,
        [0.0071, 0.0162, 0.0300,  ..., 0.0037, 0.0490, 0.0383],
        [0.0027, 0.0144, 0.0270,  ..., 0.0445, 0.0477, 0.0500],
        [0.0394, 0.0044, 0.0203,  ..., 0.0328, 0.0220, 0.0064]])
item_factors.weight tensor([[0.0414, 0.0173, 0.0059,  ..., 0.0005, 0.0054, 0.0012],
        [0.0452, 0.0004, 0.0333,  ..., 0.0070, 0.0445, 0.0114],
        [0.0150, 0.0455, 0.0176,  ..., 0.0199, 0.0299, 0.0054],
        ...,
        [0.0261, 0.0282, 0.0040,  ..., 0.0246, 0.0195, 0.0387],
        [0.0262, 0.0456, 0.0088,  ..., 0.0223, 0.0366, 0.0282],
        [0.0100, 0.0454, 0.0033,  ..., 0.0056, 0.0302, 0.0411]])


In [None]:
#tqdm wraps the epoch range to display a progress bar
for it in tqdm(range(num_epochs)):
    # Per-batch losses of the current epoch.
    losses = []
    for x, y in train_loader:
        # Only the device transfer is conditional on CUDA. The optimisation
        # step below must run on CPU as well; it used to sit inside this `if`,
        # so on a CPU-only machine nothing was trained and the final print
        # divided by zero.
        if cuda:
            x, y = x.cuda(), y.cuda()
        # Clear gradients accumulated by the previous step.
        optimizer.zero_grad()
        outputs = model(x)
        # model(x) already returns one value per row, so no squeeze() is
        # needed (squeezing would collapse a batch of one to a scalar).
        # Ratings are cast to float32 to match the model output dtype.
        loss = loss_fn(outputs, y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
# Report the mean batch loss of the last epoch only.
print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


  0%|          | 0/128 [00:00<?, ?it/s]

iter #127 Loss: 0.32500748219072517


In [None]:
# After training, the embedding tables hold the tuned latent factors.
# Print every trainable parameter and keep the first one in `uw` and the
# last of the remaining ones in `iw`.
uw, iw = 0, 0
c = 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c:
        iw = param.data
    else:
        uw = param.data
        c += 1





user_factors.weight tensor([[ 1.3130,  1.4362,  1.3285,  ...,  1.2671,  1.0224,  1.3640],
        [ 0.3749,  1.3657,  1.9535,  ...,  1.1544,  0.5392,  0.7386],
        [-2.4844,  0.7659,  1.7060,  ...,  1.6599, -0.2572, -0.7986],
        ...,
        [ 0.3601, -0.0222,  1.7244,  ..., -0.4462,  1.7645,  2.2586],
        [ 1.0274,  1.1209,  1.1705,  ...,  0.6577,  1.0913,  1.2333],
        [ 0.9980,  1.5016,  0.4468,  ...,  1.2551,  1.6044,  0.2564]],
       device='cuda:0')
item_factors.weight tensor([[0.8455, 0.4501, 0.2948,  ..., 0.6289, 0.2692, 0.2466],
        [0.5812, 0.7746, 0.1109,  ..., 0.4609, 0.7211, 0.2088],
        [0.5419, 0.4999, 0.2473,  ..., 0.3009, 0.5513, 0.8698],
        ...,
        [0.3313, 0.3341, 0.3075,  ..., 0.3292, 0.3257, 0.3349],
        [0.3793, 0.3996, 0.3612,  ..., 0.3745, 0.3903, 0.3713],
        [0.3855, 0.4214, 0.3812,  ..., 0.3793, 0.4064, 0.3867]],
       device='cuda:0')


In [None]:
# Take the trained item-embedding weights, move them to the CPU and convert them to a NumPy array.
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [None]:
# Display the embedding array as the cell output.
trained_movie_embeddings

array([[0.8454738 , 0.45011565, 0.29475915, ..., 0.6289333 , 0.26922446,
        0.24660386],
       [0.58122337, 0.7746151 , 0.11090755, ..., 0.4608917 , 0.7210832 ,
        0.20879894],
       [0.54190516, 0.49993134, 0.24728596, ..., 0.3009057 , 0.5513073 ,
        0.8698494 ],
       ...,
       [0.33128121, 0.33408463, 0.30745497, ..., 0.32920817, 0.32574692,
        0.33493987],
       [0.3792503 , 0.39958686, 0.3612351 , ..., 0.37452516, 0.3902571 ,
        0.37125525],
       [0.3855182 , 0.42139256, 0.38123605, ..., 0.3793232 , 0.40642524,
        0.38674217]], dtype=float32)

In [None]:
# K-means is an unsupervised clustering algorithm: it groups unlabeled points
# (here, the learned movie embeddings) into a fixed number of clusters.
from sklearn.cluster import KMeans

# n_init is set explicitly because its default differs between scikit-learn
# releases; 10 restarts is the long-standing historical default.
# random_state fixes the centroid initialisation so the clustering is repeatable.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0).fit(trained_movie_embeddings)

In [None]:
# Count the ratings of every movie once, instead of filtering the whole
# ratings table again for each movie inside the loop.
rating_counts = ratings_df['movieId'].value_counts()

# Iterate over the number of clusters the model was fitted with.
for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    # np.where(...)[0] gives the embedding row indices assigned to this cluster.
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Translate the embedding row index back to the raw movieId.
        movid = train_set.idx2movieid[movidx]
        movs.append((movie_names[movid], rating_counts.loc[movid]))
    # Show the ten most-rated movies of the cluster.
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])


Cluster #0
	 Batman & Robin (1997)
	 Free Willy (1993)
	 Godzilla (1998)
	 Super Mario Bros. (1993)
	 Fantastic Four: Rise of the Silver Surfer (2007)
	 Honey, I Blew Up the Kid (1992)
	 Battlefield Earth (2000)
	 Superman IV: The Quest for Peace (1987)
	 Next Karate Kid, The (1994)
	 Karate Kid, Part III, The (1989)
Cluster #1
	 Forrest Gump (1994)
	 Shawshank Redemption, The (1994)
	 Silence of the Lambs, The (1991)
	 Matrix, The (1999)
	 Braveheart (1995)
	 Fight Club (1999)
	 Star Wars: Episode V - The Empire Strikes Back (1980)
	 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
	 Star Wars: Episode VI - Return of the Jedi (1983)
	 Saving Private Ryan (1998)
Cluster #2
	 Ace Ventura: Pet Detective (1994)
	 Waterworld (1995)
	 Armageddon (1998)
	 Ace Ventura: When Nature Calls (1995)
	 Mummy, The (1999)
	 Broken Arrow (1996)
	 Santa Clause, The (1994)
	 Demolition Man (1993)
	 Starship Troopers (1997)
	 Face/Off (1997)
Cluster #3
	 Jurassic Park (1993)
