<a href="https://colab.research.google.com/github/koushik395/MOVIE-RECOMMENDATION-SYSTEM/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from urllib.request import urlretrieve
import zipfile


#Downloading movielens data...
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "movielens.zip")
zip_ref = zipfile.ZipFile('movielens.zip', "r")  #This method reads data from a particular file in the archive.
zip_ref.extractall() #This method extracts all files in the archive to current directory by default

In [2]:
!ls -lrt

total 4820
drwxr-xr-x 1 root root    4096 Jan 31 14:44 sample_data
-rw-r--r-- 1 root root 4924029 Feb  2 16:28 movielens.zip
drwxr-xr-x 2 root root    4096 Feb  2 16:28 ml-100k


In [3]:
# Load each data set (users, movies, and ratings).
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=users_cols, encoding='latin-1')

ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

# The movies file contains a binary feature for each genre.
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
] 
movies_cols = [
    'movie_id', 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols
movies = pd.read_csv(
    'ml-100k/u.item', sep='|', names=movies_cols, encoding='latin-1')

In [4]:
# ratings matrix
print(ratings.shape)

(100000, 4)


In [5]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
# For each movie, get a set of users who rated it highly (4 or 5)
filtered_ratings = ratings[ratings['rating']>=4]
print(filtered_ratings.shape)

filtered_ratings

gp = filtered_ratings.groupby(['movie_id']).agg({'user_id': lambda x: set(x)}).reset_index()# reset_index used to make movie_id as a column instead of a Pandas index
print(gp)

(55375, 4)
      movie_id                                            user_id
0            1  {1, 2, 5, 6, 10, 16, 17, 18, 21, 23, 25, 38, 4...
1            2  {256, 640, 130, 387, 642, 648, 393, 276, 532, ...
2            3  {1, 130, 267, 523, 534, 663, 793, 923, 417, 55...
3            4  {514, 7, 10, 12, 13, 524, 16, 19, 532, 22, 543...
4            5  {256, 130, 388, 648, 776, 907, 270, 405, 406, ...
...        ...                                                ...
1442      1656                                              {883}
1443      1658                                              {894}
1444      1662                                              {782}
1445      1664                                    {880, 870, 782}
1446      1674                                              {840}

[1447 rows x 2 columns]


In [8]:
gp.shape

(1447, 2)

In [9]:
# Use only those movies that are liked by atleast 20 users: NON-Optimized code
for i in range(gp.shape[0]):
  if len((gp.user_id)[i]) < 20:
    gp.drop(i, axis=0, inplace=True)

In [10]:
gp.shape

(621, 2)

In [11]:
print(gp)

      movie_id                                            user_id
0            1  {1, 2, 5, 6, 10, 16, 17, 18, 21, 23, 25, 38, 4...
1            2  {256, 640, 130, 387, 642, 648, 393, 276, 532, ...
2            3  {1, 130, 267, 523, 534, 663, 793, 923, 417, 55...
3            4  {514, 7, 10, 12, 13, 524, 16, 19, 532, 22, 543...
4            5  {256, 130, 388, 648, 776, 907, 270, 405, 406, ...
...        ...                                                ...
1084      1119  {907, 398, 270, 659, 532, 790, 796, 416, 299, ...
1102      1137  {257, 399, 918, 919, 568, 184, 63, 192, 201, 7...
1107      1142  {130, 903, 136, 392, 144, 533, 793, 160, 292, ...
1152      1194  {655, 916, 406, 409, 286, 543, 553, 440, 321, ...
1155      1197  {144, 277, 151, 792, 919, 411, 160, 168, 178, ...

[621 rows x 2 columns]


In [12]:
type(gp.iloc[0]['user_id'])

set

In [None]:
# For each movie, get similar movieS using Intersection over union(IoU)
# Code written for understandability rather than speed.
# This takes a while to run

movie_ids = list(gp.movie_id) 
similar_movies= {} # empty dictionary

cnt =0;

for movie_id in movie_ids:
  print(movie_id)
  # compute the IoU for this movie_id and others
  l = [] # list of tuples of each movie and IoU
  for i in range(gp.shape[0]):
    x = gp[ gp['movie_id'] == movie_id ]['user_id'].to_list()[0]
    y = gp.iloc[i]['user_id']
    
    t = (gp.iloc[i]['movie_id'], len(x & y)/len(x | y)) # (movie_id_i,IoU)
    l.append(t)

  similar_movies[movie_id] = l

In [14]:
# similar movies
idx = 1

l = similar_movies[idx]

# sort by similarities
l.sort(key = lambda x: x[1], reverse=True)

print("Given movie:")
print(movies[movies['movie_id']==idx]["title"])
print("*****************************")
print("Top-5 similar movies:")
print(movies[movies['movie_id']==l[0][0]]["title"])
print(movies[movies['movie_id']==l[1][0]]["title"])
print(movies[movies['movie_id']==l[2][0]]["title"])
print(movies[movies['movie_id']==l[3][0]]["title"])
print(movies[movies['movie_id']==l[4][0]]["title"])

Given movie:
0    Toy Story (1995)
Name: title, dtype: object
*****************************
Top-5 similar movies:
0    Toy Story (1995)
Name: title, dtype: object
49    Star Wars (1977)
Name: title, dtype: object
180    Return of the Jedi (1983)
Name: title, dtype: object
173    Raiders of the Lost Ark (1981)
Name: title, dtype: object
171    Empire Strikes Back, The (1980)
Name: title, dtype: object
