### Import Required Libraries

In [22]:
import os
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import random

from sklearn.feature_extraction.text import TfidfTransformer
import tensorflow
from google.colab import drive

### Dataset

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset
# folder used by the cells below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Folder on the mounted Drive that holds the input CSVs and receives the
# CSVs written by this notebook.
dataset_path = "/content/drive/My Drive/Machine Learning Datasets/Movie_Recommendation"

In [4]:
# Item Content Matrix: one row per movie (tmdbId), one column per feature.
ICM = pd.read_csv(
    dataset_path + '/ItemContentMatrix.csv',
    index_col='tmdbId',
).astype(np.uint8)

# User Rating Matrix: one row per user (userId), one column per movie.
URM = pd.read_csv(
    dataset_path + '/UserRatingMatrix.csv',
    index_col='userId',
).astype(np.uint8)

# Movie metadata, keyed by tmdbId.
movies = pd.read_csv(
    dataset_path + '/movies.csv',
    index_col='tmdbId',
)

In [5]:
# Preview the first rows of the Item Content Matrix.
ICM.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,1990s,Romance,Action,Crime,Thriller,...,Western,Sci-Fi,Musical,2000 st,2000 end,IMAX,2010 st,Documentary,latest,Film-Noir
tmdbId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
11,1,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
12,1,1,1,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [6]:
# Preview the first rows of the User Rating Matrix.
URM.head()

Unnamed: 0_level_0,2,5,6,11,12,13,14,15,16,18,...,490003,490928,494368,497520,500475,500609,502616,502892,503475,525662
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,10,0,8,0,10,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,10,0,0,10,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Column names, dtypes and non-null counts of the Item Content Matrix.
ICM.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9588 entries, 2 to 525662
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Adventure    9588 non-null   uint8
 1   Animation    9588 non-null   uint8
 2   Children     9588 non-null   uint8
 3   Comedy       9588 non-null   uint8
 4   Fantasy      9588 non-null   uint8
 5   1990s        9588 non-null   uint8
 6   Romance      9588 non-null   uint8
 7   Action       9588 non-null   uint8
 8   Crime        9588 non-null   uint8
 9   Thriller     9588 non-null   uint8
 10  Mystery      9588 non-null   uint8
 11  Horror       9588 non-null   uint8
 12  Drama        9588 non-null   uint8
 13  War          9588 non-null   uint8
 14  Western      9588 non-null   uint8
 15  Sci-Fi       9588 non-null   uint8
 16  Musical      9588 non-null   uint8
 17  2000 st      9588 non-null   uint8
 18  2000 end     9588 non-null   uint8
 19  IMAX         9588 non-null   uint8
 20  2010 s

In [8]:
# Size, dtypes and memory footprint of the User Rating Matrix.
URM.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 610 entries, 1 to 610
Columns: 9588 entries, 2 to 525662
dtypes: uint8(9588)
memory usage: 5.6 MB


### Term Frequency - Inverse Document Frequency (TF-IDF) Transformation
    -> The features of the movies in the ICM are purely binary and carry no weighting.
    -> Each feature is either present in a movie or absent from it.
    -> None of them carries a weight indicating how important that feature is.
    -> TF-IDF solves this problem.
    -> It uses the number of occurrences of a feature in an item (which is always one in our case) and the number of occurrences of the feature in the entire dataset.

In [9]:
# Re-weight the 0/1 feature matrix with TF-IDF, using the transformer's default settings.
# NOTE(review): per the scikit-learn docs the defaults presumably include L2 row
# normalisation (each movie vector gets unit length) - confirm if that matters downstream.
Tfidf_Transformer = TfidfTransformer()
tfidf_vectors = Tfidf_Transformer.fit_transform(ICM)

In [10]:
# The shape can be read directly from the transformer's output; converting it
# to a dense array first only allocates a throw-away copy.
tfidf_vectors.shape

(9588, 24)

In [11]:
# Wrap the dense TF-IDF values in a DataFrame that reuses the ICM's row labels
# (movies) and column labels (features).
tfidf_rep = pd.DataFrame(
    data=tfidf_vectors.toarray(),
    index=ICM.index,
    columns=ICM.columns,
)
tfidf_rep.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,1990s,Romance,Action,Crime,Thriller,...,Western,Sci-Fi,Musical,2000 st,2000 end,IMAX,2010 st,Documentary,latest,Film-Noir
tmdbId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,0.683503,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.756064,0.0,0.654498,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.328548,0.0,0.518928,0.600557,0.511958,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.554143,0.0,0.0,0.0,0.0,0.308231,0.0,0.486838,0.0,0.0,...,0.0,0.600756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.433385,0.536829,0.525079,0.27847,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.413205,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Persist the TF-IDF weighted item-content matrix in the dataset folder.
tfidf_rep.to_csv(dataset_path + '/TFIDF_ICM.csv')

### Item-Item Similarity
    -> Using the TF-IDF representation for the movies, Item-Item Similarity Matrix is prepared.
    -> Here, I have used Weighted Cosine Similarity for calculating the similarity between any 2 movies.

In [13]:
def get_ItemItem_SimilarityMatrix(ICM, Shrink = 1):
    """
    Build the item-item shrunk cosine similarity matrix from an Item Content Matrix.

        similarity(i, j) = (ICM[i] . ICM[j]) / (||ICM[i]|| * ||ICM[j]|| + Shrink)

    Parameters
    ----------
    ICM : pd.DataFrame
        Item Content Matrix with items on rows and features on columns.
        A 0/1 ICM denotes whether a feature is present in an item; an ICM with
        continuous values (e.g. TF-IDF) denotes the relevance of a feature in an item.
    Shrink : float, default 1
        Shrink term added to the denominator. It damps the similarity of items
        with short feature vectors, i.e. it gives more weightage to items with
        more features.

    Returns
    -------
    pd.DataFrame
        Square, symmetric matrix whose index and columns are both ``ICM.index``.
        Pairs whose denominator is exactly zero (e.g. an item without features
        combined with ``Shrink = 0``) get similarity 0 instead of NaN.
    """
    # Work in float64: integer inputs (e.g. a uint8 ICM) could overflow in the dot product.
    features = np.asarray(ICM, dtype = np.float64)
    Movie_norms = np.linalg.norm(features, axis = 1)

    ItemSimilarity = np.dot(features, features.T)

    # Cosine similarity needs the norms of BOTH items in the denominator.
    # Dividing by a per-item Series would only apply the norm of the column item
    # and yield an asymmetric matrix for inputs that are not row-normalised.
    denominators = np.outer(Movie_norms, Movie_norms) + Shrink

    # Avoid 0/0 -> NaN: divide by 1 where the denominator vanishes, then force those entries to 0.
    zero_denominator = denominators == 0
    denominators[zero_denominator] = 1.0
    ItemSimilarity /= denominators
    ItemSimilarity[zero_denominator] = 0.0

    return pd.DataFrame(ItemSimilarity, index = ICM.index, columns = ICM.index)

In [14]:
# Similarity between every pair of movies, computed on the TF-IDF features
# with the default shrink term.
ItemItem_Similarity = get_ItemItem_SimilarityMatrix(tfidf_rep)

In [15]:
# Size, dtypes and memory footprint of the item-item similarity matrix.
ItemItem_Similarity.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9588 entries, 2 to 525662
Columns: 9588 entries, 2 to 525662
dtypes: float64(9588)
memory usage: 701.7 MB


In [16]:
# Preview the first rows of the item-item similarity matrix.
ItemItem_Similarity.head()

tmdbId,2,5,6,11,12,13,14,15,16,18,...,490003,490928,494368,497520,500475,500609,502616,502892,503475,525662
tmdbId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.5,0.223676,0.112282,0.105338,0.0,0.206493,0.330394,0.271534,0.118871,0.099236,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105337,0.0
5,0.223676,0.5,0.107517,0.100868,0.10527,0.215644,0.147802,0.121471,0.0,0.221829,...,0.0,0.178137,0.178137,0.178137,0.0,0.0,0.178137,0.0,0.235469,0.0
6,0.112282,0.107517,0.5,0.176951,0.0,0.046371,0.074195,0.060977,0.0,0.166699,...,0.0,0.0,0.0,0.0,0.377406,0.214001,0.0,0.147213,0.050634,0.0
11,0.105338,0.100868,0.176951,0.5,0.120079,0.043503,0.069606,0.057206,0.0,0.471032,...,0.0,0.0,0.0,0.0,0.106883,0.0,0.0,0.0,0.201038,0.0
12,0.0,0.10527,0.0,0.120079,0.5,0.045402,0.0,0.0,0.108004,0.159826,...,0.0,0.065611,0.065611,0.065611,0.0,0.0,0.065611,0.0,0.353894,0.0


### Estimated Ratings For Different Movies By the Users
    -> This depends on the items that the user has already rated, and on the similarity of those items to the other items.
    -> As we have used the TF-IDF representation for items rather than one-hot representations, the estimated ratings turn out to be very low.
    -> However, this does not change the ordering of the most preferable items, and hence the recommendations.

In [17]:
def get_estimated_ratings(URM, ItemItem_Similarity):
    """
    Estimate each user's rating for each item as a similarity-weighted average
    of the ratings the user has given.

        estimate(u, j) = sum_i URM[u, i] * S[i, j] / sum_i S[i, j]

    Parameters
    ----------
    URM : pd.DataFrame
        User Rating Matrix with users on rows and items on columns. The columns
        must be in the same item order as the rows of ``ItemItem_Similarity``:
        the product is positional, labels are not aligned.
    ItemItem_Similarity : pd.DataFrame
        Square item-item similarity matrix.

    Returns
    -------
    pd.DataFrame
        Estimated ratings carrying ``URM.index`` and ``URM.columns``. Items whose
        total similarity is zero get an estimate of 0 instead of NaN/inf.

    Raises
    ------
    ValueError
        If the number of URM columns differs from the number of similarity rows.
    """
    ratings = np.asarray(URM, dtype = np.float64)
    similarity = np.asarray(ItemItem_Similarity, dtype = np.float64)
    if ratings.shape[1] != similarity.shape[0]:
        raise ValueError(
            "URM has %d item columns but the similarity matrix has %d rows"
            % (ratings.shape[1], similarity.shape[0])
        )

    weighted_ratings = np.dot(ratings, similarity)

    # The dot product sums over the ROWS of the similarity matrix, so the weights
    # that produced column j are S[:, j]; normalise by the column sums (axis = 0).
    # For a symmetric similarity matrix this equals the row sums.
    similarity_totals = similarity.sum(axis = 0)

    # Guard against division by zero for items that are similar to nothing.
    no_similarity = similarity_totals == 0
    weighted_ratings /= np.where(no_similarity, 1.0, similarity_totals)
    weighted_ratings[:, no_similarity] = 0.0

    return pd.DataFrame(weighted_ratings, index = URM.index, columns = URM.columns)

In [18]:
# Estimated rating of every user for every movie, derived from the user's own
# ratings and the item-item similarities.
Estimated_Ratings = get_estimated_ratings(URM, ItemItem_Similarity)

In [19]:
# Preview the estimated ratings of the first users.
Estimated_Ratings.head()

Unnamed: 0_level_0,2,5,6,11,12,13,14,15,16,18,...,490003,490928,494368,497520,500475,500609,502616,502892,503475,525662
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.156226,0.180068,0.24347,0.275345,0.177392,0.162465,0.141826,0.163998,0.104071,0.238441,...,0.0,0.088661,0.088661,0.088661,0.202403,0.131109,0.088661,0.110568,0.218393,0.0
2,0.014972,0.010624,0.031477,0.021007,0.011647,0.01436,0.012823,0.016438,0.016939,0.019647,...,0.044774,0.022031,0.022031,0.022031,0.04258,0.070865,0.022031,0.049999,0.010741,0.044774
3,0.016982,0.016802,0.028167,0.057496,0.011275,0.011157,0.013172,0.016792,0.003365,0.043636,...,0.0,0.002921,0.002921,0.002921,0.022292,0.000628,0.002921,0.000529,0.020296,0.0
4,0.204913,0.210614,0.15684,0.152447,0.14908,0.198687,0.209525,0.218372,0.189283,0.162777,...,0.019174,0.127042,0.127042,0.127042,0.09607,0.075603,0.127042,0.075187,0.181078,0.019174
5,0.03549,0.02882,0.043866,0.031714,0.025891,0.03226,0.034963,0.029706,0.02243,0.027605,...,0.0,0.010571,0.010571,0.010571,0.036163,0.035952,0.010571,0.030319,0.031224,0.0


In [21]:
# DataFrame.to_csv does not create missing directories, so make sure the
# output sub-folder exists before writing into it.
os.makedirs(dataset_path + '/Content_Based', exist_ok = True)
Estimated_Ratings.to_csv(dataset_path + '/Content_Based/Estimated_Ratings.csv')

### Recommendations

In [27]:
# Let's pick a random userId from the available range, i.e. 1 to 610.
# random.randint includes BOTH endpoints, so the upper bound must be 610;
# an upper bound of 611 could return a userId that does not exist in the URM.
random.seed(7)
user_id = random.randint(1, 610)
print(user_id)      # Should print 332 as we have set seed

332


In [54]:
# Ratings given by the chosen user, ordered from highest to lowest.
User_ratings = URM.loc[user_id].sort_values(ascending=False)
User_ratings

137       10
824       10
78        10
548       10
77        10
          ..
11215      0
11216      0
11217      0
11219      0
525662     0
Name: 332, Length: 9588, dtype: uint8

In [55]:
# The index of this Series holds the movie ids (tmdbId). They come from the URM
# column names, so they are strings rather than numbers.
# Cast them to int so they can be used for the lookup in `movies` below.
User_ratings.index = User_ratings.index.astype('int')

#### Top 20 rated Movies by the chosen User
    -> This shall help us judge our recommendation System

In [64]:
# Put title and genres next to the user's ratings (already sorted, best first)
# and show the first 20 rows.
pd.concat(
    [movies.loc[User_ratings.index, ['title', 'genres']], User_ratings],
    axis=1,
).head(20)

Unnamed: 0,title,genres,332
137,Groundhog Day (1993),Comedy|Fantasy|Romance,10
824,Moulin Rouge (2001),Drama|Musical|Romance,10
78,Blade Runner (1982),Action|Sci-Fi|Thriller,10
548,Rashomon (Rashômon) (1950),Crime|Drama|Mystery,10
77,Memento (2000),Mystery|Thriller,10
603,"Matrix, The (1999)",Action|Sci-Fi|Thriller,10
599,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),Drama|Film-Noir|Romance,9
146,"Crouching Tiger, Hidden Dragon (Wo hu cang lon...",Action|Drama|Romance,9
79,Hero (Ying xiong) (2002),Action|Adventure|Drama,9
983,"Man Who Would Be King, The (1975)",Adventure|Drama,9


#### Top 20 Recommendations for the User based on the Movies he/she rated

In [70]:
# Estimated ratings of the chosen user, ordered from highest to lowest.
Estimated_user_ratings = (
    Estimated_Ratings
    .loc[user_id]
    .sort_values(ascending=False)
)
# The labels are movie ids taken from column names; cast them to int for the lookup in `movies`.
Estimated_user_ratings.index = Estimated_user_ratings.index.astype('int')

In [71]:
# Put title and genres next to the estimated ratings (already sorted, best first)
# and show the first 20 rows.
pd.concat(
    [movies.loc[Estimated_user_ratings.index, ['title', 'genres']], Estimated_user_ratings],
    axis=1,
).head(20)

Unnamed: 0,title,genres,332
558,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX,0.399496
1894,Star Wars: Episode II - Attack of the Clones (...,Action|Adventure|Sci-Fi|IMAX,0.399496
38356,Transformers: Dark of the Moon (2011),Action|Adventure|Sci-Fi|War|IMAX,0.378082
604,"Matrix Reloaded, The (2003)",Action|Adventure|Sci-Fi|Thriller|IMAX,0.373284
605,"Matrix Revolutions, The (2003)",Action|Adventure|Sci-Fi|Thriller|IMAX,0.373284
2135,"Time Machine, The (2002)",Action|Adventure|Sci-Fi,0.372379
296,Terminator 3: Rise of the Machines (2003),Action|Adventure|Sci-Fi,0.372379
5137,Sky Captain and the World of Tomorrow (2004),Action|Adventure|Sci-Fi,0.372379
36657,X-Men (2000),Action|Adventure|Sci-Fi,0.372379
11817,Bulletproof Monk (2003),Action|Adventure|Sci-Fi,0.372379


<center>END</center>