In [1]:
import pandas as pd
import numpy as np

# Load the raw ratings and display them.
# NOTE(review): absolute, machine-specific path; this cell only runs where that folder exists.
rating_df = pd.read_csv(
    filepath_or_buffer="C:\\Users\\HP\\Downloads\\recommender_dataset\\ratings.csv"
)
rating_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [2]:
# Drop the timestamp column; the rest of the notebook only uses userId, movieId and rating.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it once the column is gone no longer raises KeyError.
rating_df = rating_df.drop(columns="timestamp", errors="ignore")

In [3]:
# Number of distinct users that appear in the ratings
rating_df["userId"].nunique()

610

In [4]:
# Preview the first five rating rows
rating_df.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
# Reshape to one row per user and one column per movie; each cell holds the
# rating, or NaN where that user has no rating for that movie.
# reset_index(drop=True) discards the userId labels, leaving a 0-based row index.
user_movies_df = (
    rating_df
    .pivot(index="userId", columns="movieId", values="rating")
    .reset_index(drop=True)
)
user_movies_df

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
0,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,2.5,,,,,,2.5,,,,...,,,,,,,,,,
606,4.0,,,,,,,,,,...,,,,,,,,,,
607,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
608,3.0,,,,,,,,,4.0,...,,,,,,,,,,


### NaN marks a movie the user has not rated (there is no row for that user–movie pair in the ratings file)
### Filling the NaNs with 0

In [6]:
# Replace the missing ratings with 0 (in place), then preview the top-left
# 5 x 15 corner of the user-by-movie matrix.
user_movies_df.fillna(value=0, inplace=True)
user_movies_df.iloc[:5, :15]

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Formula:
Cosine Distance = 1 - Cosine_Similarity

In [7]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine,correlation

# Cosine similarity = 1 - cosine distance. pairwise_distances returns a square
# array with one row and one column per row of user_movies_df.
user_sim = 1 - pairwise_distances(user_movies_df.values, metric="cosine")

# The rows of user_movies_df come from DataFrame.pivot, which orders them by
# sorted userId. unique() alone returns the ids in order of first appearance in
# the CSV, which matches only when the file happens to be sorted by userId, so
# sort explicitly to keep every label attached to the right row.
user_ids = np.sort(rating_df.userId.unique())

# Store the results in a dataframe labelled by user id on both axes
user_sim_df = pd.DataFrame(user_sim, index=user_ids, columns=user_ids)

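#### This matrix shows the cosine *similarity* (1 − cosine distance) between all possible pairs of users
#### (The diagonal prints as 0 rather than 1 because this cell was last executed after the diagonal was zeroed in the cell further down.)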

In [22]:
# Top-left 5 x 5 corner of the user-to-user similarity matrix
user_sim_df.iloc[:5, :5]

Unnamed: 0,1,2,3,4,5
1,0.0,0.027283,0.05972,0.194395,0.12908
2,0.027283,0.0,0.0,0.003726,0.016614
3,0.05972,0.0,0.0,0.002251,0.00502
4,0.194395,0.003726,0.002251,0.0,0.128659
5,0.12908,0.016614,0.00502,0.128659,0.0


In [23]:
# Sanity check: the matrix should be square, with one row and one column per user
user_sim_df.shape

(610, 610)

#### Setting the diagonal values of the matrix from 1 to 0.
#### Every user is perfectly similar to themselves, so the diagonal would always win; zeroing it lets the search return the most similar *other* user.

In [11]:
# Zero the self-similarity entries so a user is never picked as their own
# nearest neighbour.
np.fill_diagonal(user_sim, 0)

# Rebuild the frame from the updated array instead of relying on user_sim_df
# sharing memory with user_sim: with pandas Copy-on-Write the DataFrame
# constructor copies the array, so the in-place edit above would not show up.
user_sim_df = pd.DataFrame(
    user_sim,
    index=user_sim_df.index,
    columns=user_sim_df.columns,
)
user_sim_df.iloc[0:5,0:5]

Unnamed: 0,1,2,3,4,5
1,0.0,0.027283,0.05972,0.194395,0.12908
2,0.027283,0.0,0.0,0.003726,0.016614
3,0.05972,0.0,0.0,0.002251,0.00502
4,0.194395,0.003726,0.002251,0.0,0.128659
5,0.12908,0.016614,0.00502,0.128659,0.0


In [12]:
# For each user (row), the column label (i.e. the user id, not a position)
# of the cell holding the highest similarity; shown for the first five users.
user_sim_df.idxmax(axis="columns").iloc[:5]

1    266
2    366
3    313
4    391
5    470
dtype: int64

In [35]:
# Zero-based column position of the largest similarity in row position 5,
# searched over the first 600 columns only.
# NOTE(review): iloc is positional, so row 5 is the sixth user (the same row
# prints as "Name: 6" in the slice shown further down), and with the labels
# used in this notebook position p corresponds to user id p + 1.
# Confirm whether user 5 or user 6 was intended, and whether the 600 cut-off is deliberate.
np.argmax(user_sim_df.iloc[5, :600])

116

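##### 116 is a zero-based position, so it corresponds to user 117. Note that `iloc[5]` selects the sixth row, i.e. user 6 (see `Name: 6` in the output below), so this result says user 6, not user 5, is most similar to user 117 in terms of movie choice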

In [36]:
# Similarities of the row at position 5 to the users at column positions 110-119,
# to inspect the values around the maximum found above.
user_sim_df.iloc[5,110:120]

111    0.050802
112    0.196370
113    0.087633
114    0.009547
115    0.134555
116    0.160604
117    0.570296
118    0.054803
119    0.037971
120    0.141976
Name: 6, dtype: float64

Next, we join the ratings with the table of movie titles so that we can see the titles of the movies that two similar users have both rated.

In [37]:
# Load the movie metadata (movieId, title, genres) and preview the first five rows.
# NOTE(review): absolute, machine-specific path; this cell only runs where that folder exists.
movies_df = pd.read_csv(
    filepath_or_buffer="C:\\Users\\HP\\Downloads\\recommender_dataset\\movies.csv"
)
movies_df.iloc[:5]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [38]:
# Keep only movieId and title; genres is not used in the joins below.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it once the column is gone no longer raises KeyError.
movies_df = movies_df.drop(columns="genres", errors="ignore")

In [39]:
def get_user_similar_movies(user1,user2):
    """Return the movies rated by both users, together with their titles.

    Reads the notebook-level frames ``rating_df`` and ``movies_df``.

    Parameters
    ----------
    user1, user2 : values compared against ``rating_df.userId``.

    Returns
    -------
    DataFrame with one row per movie both users rated. Columns coming from
    ``user1`` carry the ``_x`` suffix, those from ``user2`` the ``_y`` suffix,
    followed by the columns of ``movies_df`` joined on ``movieId``.
    """
    first_user_ratings = rating_df.loc[rating_df["userId"] == user1]
    second_user_ratings = rating_df.loc[rating_df["userId"] == user2]

    # An inner join on movieId keeps only the movies present for both users
    shared_ratings = first_user_ratings.merge(
        second_user_ratings,
        how="inner",
        on="movieId",
    )

    # Attach the movie details (e.g. the title) to every shared movie
    return shared_ratings.merge(movies_df, on="movieId")

In [40]:
# Inner join on movieId: the movies rated by both user 4 and user 2
rating_df.loc[rating_df["userId"] == 4].merge(
    rating_df.loc[rating_df["userId"] == 2],
    how="inner",
    on="movieId",
)

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y
0,4,1704,1.0,2,4.5


In [41]:
# Inner join on movieId: the movies rated by both user 1 and user 4
rating_df.loc[rating_df["userId"] == 1].merge(
    rating_df.loc[rating_df["userId"] == 4],
    how="inner",
    on="movieId",
)

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y
0,1,47,5.0,4,2.0
1,1,235,4.0,4,2.0
2,1,260,5.0,4,5.0
3,1,296,3.0,4,1.0
4,1,441,4.0,4,1.0
5,1,457,5.0,4,5.0
6,1,553,5.0,4,2.0
7,1,593,4.0,4,5.0
8,1,608,5.0,4,5.0
9,1,648,3.0,4,3.0


In [42]:
# Movies rated by both user 5 and user 117, with titles.
# NOTE(review): the similarity lookup earlier in the notebook used iloc[5],
# whose output is labelled "Name: 6"; confirm whether user 5 or user 6
# should be compared with user 117 here.
common_movies = get_user_similar_movies(user1=5, user2=117)

In [43]:
# Keep only the movies that both users rated 4.0 or higher
common_movies.loc[
    common_movies["rating_x"].ge(4.0) & common_movies["rating_y"].ge(4.0)
]

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title
0,5,21,4.0,117,4.0,Get Shorty (1995)
4,5,50,4.0,117,4.0,"Usual Suspects, The (1995)"
5,5,110,4.0,117,5.0,Braveheart (1995)
11,5,296,5.0,117,4.0,Pulp Fiction (1994)
22,5,457,4.0,117,4.0,"Fugitive, The (1993)"
26,5,527,5.0,117,5.0,Schindler's List (1993)
29,5,588,4.0,117,4.0,Aladdin (1992)
31,5,590,5.0,117,4.0,Dances with Wolves (1990)
33,5,594,5.0,117,4.0,Snow White and the Seven Dwarfs (1937)
34,5,595,5.0,117,4.0,Beauty and the Beast (1991)


In [44]:
# Same comparison for user 5 and user 224; this rebinds common_movies and displays it
common_movies = get_user_similar_movies(user1=5, user2=224)
common_movies

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title
0,5,39,3.0,224,5.0,Clueless (1995)
1,5,527,5.0,224,5.0,Schindler's List (1993)
