# <font color=dark>Collabaritive Filtering Recommender System</font>

<hr style="border:2px solid gray">

## <font color=blue>Objective</font>
### Reccomending those movies that users with interests similar to ours like

<hr style="border:2px solid gray">

# <font color=red>User Based Similarity</font>

## <font color=blue>Importing Libraries & Data Pre-Processing</font>

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline
from sklearn import metrics
import random
import math
import statsmodels.api as sm


In [3]:
# Taking a data of users & ratings given them on a Movie ID
# Have taken a subset of complete ratings Database of 1 lac users

rating_df = pd.read_csv( "ratings.csv" )
rating_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
rating_df.shape

(100004, 4)

In [5]:
#dropping timestamp column as not needed

rating_df.drop( 'timestamp', axis = 1, inplace = True )
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [6]:
#Getting unique user id

len( rating_df.userId.unique() )

671

In [7]:
#Getting unique Movie Id

len( rating_df.movieId.unique() )

9066

In [8]:
# Making a Pivot table, with userid in rows & MovieId in column
# The values of the matrix will be the ratings the users have given to those movies
# There are 671 users & 9066 movies, thus a matrix of 671 x 9066 will be created


user_movies_df = rating_df.pivot( index='userId',
columns='movieId',
values = "rating" ).reset_index(drop=True)
user_movies_df.index = rating_df.userId.unique()
user_movies_df.shape

(671, 9066)

In [9]:
# Matrix created is supposed to be sparsed, as ratings will be only present for which users have watched.

user_movies_df.iloc[0:5, 0:15]

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
1,,,,,,,,,,,,,,,
2,,,,,,,,,,4.0,,,,,
3,,,,,,,,,,,,,,,
4,,,,,,,,,,4.0,,,,,
5,,,4.0,,,,,,,,,,,,


In [10]:
#Relacing NaN with 0

user_movies_df.fillna( 0, inplace = True )
user_movies_df.iloc[0:5, 0:10]

movieId,1,2,3,4,5,6,7,8,9,10
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<hr style="border:2px solid gray">

## <font color=blue>Calculating Cosine similarity</font>

In [11]:
# As each row in user_movies_df represents a user, we need to calculate the similarity between rows to get similarity b/w users

#Calculating Cosine Similarity between users

from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation
user_sim = 1 - pairwise_distances( user_movies_df.values, metric="cosine" )

#Store the results in a dataframe

user_sim_df = pd.DataFrame( user_sim )

# set the index and column names to user ids (0 to 670)

user_sim_df.index = rating_df.userId.unique()
user_sim_df.columns = rating_df.userId.unique()
user_sim_df.shape

(671, 671)

In [14]:
#Cosine similarity closer to 1 means users are very similar and closer to 0 means users are very dissimilar
# Similarity between first 5 users

user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,1,2,3,4,5
1,1.0,0.0,0.0,0.074482,0.016818
2,0.0,1.0,0.124295,0.118821,0.103646
3,0.0,0.124295,1.0,0.08164,0.151531
4,0.074482,0.118821,0.08164,1.0,0.130649
5,0.016818,0.103646,0.151531,0.130649,1.0


In [15]:
# As diagnal values is 1, we are setting diagonal values to 0, to avoid selecting same user as most similar.

np.fill_diagonal( user_sim, 0 )
user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,1,2,3,4,5
1,0.0,0.0,0.0,0.074482,0.016818
2,0.0,0.0,0.124295,0.118821,0.103646
3,0.0,0.124295,0.0,0.08164,0.151531
4,0.074482,0.118821,0.08164,0.0,0.130649
5,0.016818,0.103646,0.151531,0.130649,0.0


In [16]:
# Filtering Similar User   
# The most similar user to first 5 users

user_sim_df.idxmax(axis=1)[0:5]

1    325
2    338
3    379
4    518
5    313
dtype: int64

In [21]:
user_sim_df.idxmax(axis=1)[5:10]

6     345
7      21
8     520
9     670
10    271
dtype: int64

In [19]:
# The similarity values between user2 and users ranging b/w 393 to 401
# Similarity can be explained intuitively if we can find that two users have watched movies in common and rated very similarly.

user_sim_df.iloc[1:2, 392:401]

Unnamed: 0,393,394,395,396,397,398,399,400,401
2,0.054702,0.160554,0.023183,0.425653,0.0,0.0097,0.291828,0.571252,0.02253


In [24]:
# Importing a movie data set with Movie ID & Movie Title
# TOo get the mapping of movie ID with Movie Title (Movie ID same in this sheet with Movie ID in user sheet)

movies_df = pd.read_csv( "movies_new.csv" )
movies_df.head(7)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance


In [25]:
# Dropping 'genres' as not useful for our analysis

movies_df.drop( 'genres', axis = 1, inplace = True )
movies_df[0:7]

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
5,6,Heat (1995)
6,7,Sabrina (1995)


In [26]:
#Finding common movies b/w similar users

def get_user_similar_movies( user1, user2 ):
    # Inner join between movies watched between two users & give the common movies watched.
    common_movies = rating_df[rating_df.userId == user1].merge(
        rating_df[rating_df.userId == user2],
        on = "movieId",
        how = "inner" )
    # join the above result set with movies details
    return common_movies.merge( movies_df, on = 'movieId' )

In [29]:
# Common movie b/w user 2 & 400
common_movies_2_400 = get_user_similar_movies( 2, 400 )

In [31]:
common_movies_2_400[0:10]

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title
0,2,10,4.0,400,3.0,GoldenEye (1995)
1,2,39,5.0,400,4.0,Clueless (1995)
2,2,47,4.0,400,4.0,Seven (a.k.a. Se7en) (1995)
3,2,50,4.0,400,5.0,"Usual Suspects, The (1995)"
4,2,110,4.0,400,5.0,Braveheart (1995)
5,2,150,5.0,400,4.0,Apollo 13 (1995)
6,2,153,4.0,400,3.0,Batman Forever (1995)
7,2,161,3.0,400,3.0,Crimson Tide (1995)
8,2,165,3.0,400,4.0,Die Hard: With a Vengeance (1995)
9,2,185,3.0,400,1.0,"Net, The (1995)"


In [None]:
# User 2 & 400 have watched above 10 movies which are in common

In [32]:
# User 2 & 400 have rated at least 4

common_movies[(common_movies.rating_x >= 4.0) &
((common_movies.rating_y >= 4.0))]

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title
1,2,39,5.0,400,4.0,Clueless (1995)
2,2,47,4.0,400,4.0,Seven (a.k.a. Se7en) (1995)
3,2,50,4.0,400,5.0,"Usual Suspects, The (1995)"
4,2,110,4.0,400,5.0,Braveheart (1995)
5,2,150,5.0,400,4.0,Apollo 13 (1995)
15,2,296,4.0,400,5.0,Pulp Fiction (1994)
26,2,480,4.0,400,5.0,Jurassic Park (1993)
28,2,515,4.0,400,4.0,"Remains of the Day, The (1993)"
29,2,527,4.0,400,4.0,Schindler's List (1993)
35,2,590,5.0,400,4.0,Dances with Wolves (1990)


## <font color=green>User Based Similarity may not work for new users, we need to wait till the user watch some movies & then recommender system can provide recommendations.</font>
## <font color=green>This is a cold-start problem & item based similarity can resolve this.</font>



<hr style="border:2px solid gray">



# <font color=red>Item based similarity</font>

### If 2 movies are watched by many users & rated very similarly then there is some inherent relationship b/w these 2 movies, i.e. movies are of similar taste. Or we can say if user A watches one of these movie then user A is also likely to watch other movie.

In [33]:
# create a pivot table with the rows represent movies, columns represent users & values represents ratings.

rating_mat = rating_df.pivot( index='movieId',
columns='userId',
values = "rating" ).reset_index(drop=True)
rating_mat.index = rating_df.movieId.unique()
rating_mat.shape

(9066, 671)

In [34]:
# fill all NaNs with 0

rating_mat.fillna( 0, inplace = True )

# Find the correlation between movies (item based correlation)
# pairwise_distances method takes the rating matrix, and returns a correlation distance matrix

movie_sim = 1 - pairwise_distances( rating_mat.values,
metric="correlation" )
movie_sim[0:5]

array([[ 1.        ,  0.22374218,  0.18326579, ..., -0.0281574 ,
        -0.0281574 ,  0.04097762],
       [ 0.22374218,  1.        ,  0.12379014, ..., -0.01619963,
        -0.01619963, -0.01619963],
       [ 0.18326579,  0.12379014,  1.        , ..., -0.01122147,
        -0.01122147, -0.01122147],
       [ 0.07105519,  0.12501429,  0.14777073, ..., -0.00507279,
        -0.00507279, -0.00507279],
       [ 0.10507583,  0.19314427,  0.31791124, ..., -0.01116479,
        -0.01116479, -0.01116479]])

In [35]:
movie_sim_df = pd.DataFrame( movie_sim )
movie_sim_df.shape

(9066, 9066)

In [36]:
movie_sim_df.iloc[0:5, 0:5]

Unnamed: 0,0,1,2,3,4
0,1.0,0.223742,0.183266,0.071055,0.105076
1,0.223742,1.0,0.12379,0.125014,0.193144
2,0.183266,0.12379,1.0,0.147771,0.317911
3,0.071055,0.125014,0.147771,1.0,0.150562
4,0.105076,0.193144,0.317911,0.150562,1.0


In [37]:
# Finding most similar movies.
# movieid and index of the movie record in the movies_df are not same.
# movieid is taken as a parameter & returns similar movies based on cosine similarity.
# We find the index of the movie records from the movieid and use that to find similarity.


def get_similar_movies( movieid, topN = 5 ):
    movieidx = movies_df[movies_df.movieId == movieid].index[0]
    movies_df['similarity'] = movie_sim_df.iloc[movieidx]
    top_n = movies_df.sort_values( ["similarity"], ascending = False )[0:topN]
    return top_n

In [38]:
movies_df[movies_df.movieId == 858]

Unnamed: 0,movieId,title
659,858,"Godfather, The (1972)"


In [39]:
get_similar_movies(858)
# if someone watches Godfather, the other movies also can be recommended to that user.

Unnamed: 0,movieId,title,similarity
659,858,"Godfather, The (1972)",1.0
4922,7379,The Alamo (2004),0.992134
7880,94661,Rocket Singh: Salesman of the Year (2009),0.988825
4363,6380,Capturing the Friedmans (2003),0.953111
8607,118166,Courier (1987),0.928278


In [40]:
movies_df[movies_df.movieId == 231]

Unnamed: 0,movieId,title,similarity
197,231,Dumb & Dumber (Dumb and Dumber) (1994),-0.012476


In [42]:
get_similar_movies(231)

Unnamed: 0,movieId,title,similarity
197,231,Dumb & Dumber (Dumb and Dumber) (1994),1.0
233,271,Losing Isaiah (1995),0.379775
1659,2226,"Ring, The (1927)",0.378813
316,358,Higher Learning (1995),0.366623
242,280,Murder in the First (1995),0.363545


<hr style="border:2px solid gray">