In [1]:
import pandas as pd
import numpy as np

In [2]:
movies_df = pd.read_csv('Movie.csv')
movies_df.head()

Unnamed: 0,userId,movie,rating
0,3,Toy Story (1995),4.0
1,6,Toy Story (1995),5.0
2,8,Toy Story (1995),4.0
3,10,Toy Story (1995),4.0
4,11,Toy Story (1995),4.5


In [3]:
movies_df.sort_values('userId')

Unnamed: 0,userId,movie,rating
2569,1,Jumanji (1995),3.5
3724,2,Grumpier Old Men (1995),4.0
0,3,Toy Story (1995),4.0
5204,4,Heat (1995),3.0
7444,4,GoldenEye (1995),4.0
...,...,...,...
6463,7117,Heat (1995),5.0
2567,7119,Toy Story (1995),5.0
2568,7120,Toy Story (1995),4.5
3723,7120,Jumanji (1995),4.0


In [4]:
#number of unique users in the dataset
len(movies_df.userId.unique())

4081

In [5]:
movies_df['rating'].value_counts()

3.0    2736
4.0    2660
5.0    1394
3.5     679
2.0     542
4.5     374
2.5     277
1.0     212
1.5      61
0.5      57
Name: rating, dtype: int64

In [6]:
len(movies_df.movie.unique())

10

In [7]:
movies_df.movie.value_counts()

Toy Story (1995)                      2569
GoldenEye (1995)                      1548
Heat (1995)                           1260
Jumanji (1995)                        1155
Sabrina (1995)                         700
Grumpier Old Men (1995)                685
Father of the Bride Part II (1995)     657
Sudden Death (1995)                    202
Waiting to Exhale (1995)               138
Tom and Huck (1995)                     78
Name: movie, dtype: int64

### Step 1: Customer as a p-dimensional vector of items
* p: number of distinct catalog items (movies)
* components: watched(1)/ not watched (0); ratings (1 to 5)

In [8]:
user_movies_df = movies_df.pivot(index='userId',
                                 columns='movie',
                                 values='rating')

In [9]:
user_movies_df

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,,3.5,,,,,
2,,,4.0,,,,,,,
3,,,,,,,,,4.0,
4,,4.0,,3.0,,,,,,
5,,,,,3.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
7115,4.0,,,,,,,,,
7116,3.5,,,,,,,,4.0,
7117,,3.0,4.0,5.0,,3.0,1.0,,4.0,
7119,,,,,,,,,5.0,


In [11]:
#user_movies_df.index = movies_df.userId.unique()

In [12]:
user_movies_df

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,,3.5,,,,,
2,,,4.0,,,,,,,
3,,,,,,,,,4.0,
4,,4.0,,3.0,,,,,,
5,,,,,3.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
7115,4.0,,,,,,,,,
7116,3.5,,,,,,,,4.0,
7117,,3.0,4.0,5.0,,3.0,1.0,,4.0,
7119,,,,,,,,,5.0,


In [13]:
#Impute those NaNs with 0 values
#same as user_movies_df = user_movies_df.fillna(0)

user_movies_df.fillna(0, inplace=True)

In [14]:
user_movies_df

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
4,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7115,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7116,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
7117,0.0,3.0,4.0,5.0,0.0,3.0,1.0,0.0,4.0,0.0
7119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0


### Step 2: Calculating Cosine Similarity between Users

In [15]:

from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [16]:
user_sim = 1 - pairwise_distances(user_movies_df.values,metric='cosine')

In [17]:
user_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.55337157],
       [0.        , 1.        , 0.        , ..., 0.45883147, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       ...,
       [0.        , 0.45883147, 0.45883147, ..., 1.        , 0.45883147,
        0.47607054],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       [0.55337157, 0.        , 0.62254302, ..., 0.47607054, 0.62254302,
        1.        ]])

In [18]:
#Store the results in a dataframe
user_sim_df = pd.DataFrame(user_sim)
user_sim_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4071,4072,4073,4074,4075,4076,4077,4078,4079,4080
0,1.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,1.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.553372
1,0.000000,1.000000,0.000000,0.000000,0.000000,0.390567,0.707107,0.615457,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.458831,0.000000,0.000000
2,0.000000,0.000000,1.000000,0.000000,0.000000,0.650945,0.000000,0.492366,1.000000,0.874157,...,0.000000,1.000000,0.000000,0.707107,0.000000,0.000000,0.752577,0.458831,1.000000,0.622543
3,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.615457,0.000000,0.388514,...,0.800000,0.000000,0.000000,0.000000,0.989949,0.000000,0.000000,0.619422,0.000000,0.000000
4,1.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,1.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.553372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4076,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.658505,0.000000,0.000000,0.000000
4077,0.000000,0.000000,0.752577,0.000000,0.000000,0.489886,0.000000,0.370543,0.752577,0.657870,...,0.000000,0.752577,0.000000,0.532152,0.000000,0.658505,1.000000,0.345306,0.752577,0.468511
4078,0.000000,0.458831,0.458831,0.619422,0.000000,0.701884,0.567775,0.889532,0.458831,0.568212,...,0.344124,0.458831,0.000000,0.324443,0.648886,0.000000,0.345306,1.000000,0.458831,0.476071
4079,0.000000,0.000000,1.000000,0.000000,0.000000,0.650945,0.000000,0.492366,1.000000,0.874157,...,0.000000,1.000000,0.000000,0.707107,0.000000,0.000000,0.752577,0.458831,1.000000,0.622543


In [19]:
#Set the index and column names to user ids 
user_sim_df.index = movies_df.userId.unique()
user_sim_df.columns = movies_df.userId.unique()

In [20]:
user_sim_df.iloc[0:30, 0:30]

Unnamed: 0,3,6,8,10,11,12,13,14,16,19,...,59,66,69,80,82,84,90,91,93,96
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.6,0.0,0.0,0.0,0.363803,0.0,0.0,0.0
6,0.0,1.0,0.0,0.0,0.0,0.390567,0.707107,0.615457,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.457496,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0,0.650945,0.0,0.492366,1.0,0.874157,...,0.544331,0.0,0.0,0.0,1.0,0.0,0.606339,0.0,0.707107,0.0
10,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.615457,0.0,0.388514,...,0.653197,0.8,0.0,0.0,0.0,0.989949,0.485071,0.0,0.424264,0.0
11,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.6,0.0,0.0,0.0,0.363803,0.0,0.0,0.0
12,0.0,0.390567,0.650945,0.0,0.0,1.0,0.73646,0.56088,0.650945,0.569028,...,0.620076,0.0,0.0,0.0,0.650945,0.0,0.631509,0.675023,0.460287,0.0
13,0.0,0.707107,0.0,0.0,0.0,0.73646,1.0,0.435194,0.0,0.0,...,0.288675,0.0,0.0,0.0,0.0,0.0,0.257248,0.862662,0.0,0.0
14,0.0,0.615457,0.492366,0.615457,0.0,0.56088,0.435194,1.0,0.492366,0.669519,...,0.670025,0.492366,0.0,0.0,0.492366,0.609272,0.597081,0.281569,0.609272,0.0
16,0.0,0.0,1.0,0.0,0.0,0.650945,0.0,0.492366,1.0,0.874157,...,0.544331,0.0,0.0,0.0,1.0,0.0,0.606339,0.0,0.707107,0.0
19,0.0,0.0,0.874157,0.388514,0.0,0.569028,0.0,0.669519,0.874157,1.0,...,0.674094,0.485643,0.0,0.0,0.874157,0.343401,0.8245,0.0,0.618123,0.0


In [21]:
np.fill_diagonal(user_sim, 0)
user_sim_df.iloc[0:10, 0:10]

Unnamed: 0,3,6,8,10,11,12,13,14,16,19
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.390567,0.707107,0.615457,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.650945,0.0,0.492366,1.0,0.874157
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.615457,0.0,0.388514
11,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.390567,0.650945,0.0,0.0,0.0,0.73646,0.56088,0.650945,0.569028
13,0.0,0.707107,0.0,0.0,0.0,0.73646,0.0,0.435194,0.0,0.0
14,0.0,0.615457,0.492366,0.615457,0.0,0.56088,0.435194,0.0,0.492366,0.669519
16,0.0,0.0,1.0,0.0,0.0,0.650945,0.0,0.492366,0.0,0.874157
19,0.0,0.0,0.874157,0.388514,0.0,0.569028,0.0,0.669519,0.874157,0.0


In [22]:
#Most Similar Users
user_sim_df.idxmax(axis=1)[0:20]

3       11
6      168
8       16
10    4047
11       3
12    6676
13    5953
14    4138
16       8
19    3603
22    3003
23    2096
24     539
31      66
34      93
39      53
47     258
53      39
54    2912
58       8
dtype: int64

In [23]:
movies_df[(movies_df['userId']==8) | (movies_df['userId']==16)]

Unnamed: 0,userId,movie,rating
2,8,Toy Story (1995),4.0
8,16,Toy Story (1995),3.0
3727,8,Grumpier Old Men (1995),5.0
5205,8,Heat (1995),3.0
5207,16,Heat (1995),3.0
7445,8,GoldenEye (1995),4.0


In [24]:
user_1=movies_df[movies_df['userId']==8]

In [25]:
user_2=movies_df[movies_df['userId']==16]

In [26]:
user_2.movie

8       Toy Story (1995)
5207         Heat (1995)
Name: movie, dtype: object

In [27]:
user_1.movie

2              Toy Story (1995)
3727    Grumpier Old Men (1995)
5205                Heat (1995)
7445           GoldenEye (1995)
Name: movie, dtype: object

In [28]:
user_merged = pd.merge(user_1,user_2,on='movie',how='outer')
user_merged

Unnamed: 0,userId_x,movie,rating_x,userId_y,rating_y
0,8,Toy Story (1995),4.0,16.0,3.0
1,8,Grumpier Old Men (1995),5.0,,
2,8,Heat (1995),3.0,16.0,3.0
3,8,GoldenEye (1995),4.0,,


In [29]:
user_merged =  user_merged[['movie', 'userId_x', 'rating_x', 'rating_y', 'userId_y']]
user_merged

Unnamed: 0,movie,userId_x,rating_x,rating_y,userId_y
0,Toy Story (1995),8,4.0,3.0,16.0
1,Grumpier Old Men (1995),8,5.0,,
2,Heat (1995),8,3.0,3.0,16.0
3,GoldenEye (1995),8,4.0,,


In [30]:
import seaborn as sns
sns.heatmap(user_sim_df.iloc[:10,:10], annot=True)   # Very easy!

<AxesSubplot:>

In [31]:
user_sim_df[12].sort_values(ascending=False)[:30]

6676    0.996273
574     0.994221
7049    0.994221
4285    0.991194
6874    0.991194
207     0.982397
172     0.982397
1732    0.982397
133     0.977140
4797    0.972828
2080    0.971494
4906    0.964754
256     0.964252
2015    0.959532
5786    0.957398
1429    0.957398
2985    0.956689
4478    0.956689
5559    0.948519
6371    0.937360
6870    0.931476
3778    0.929182
5060    0.921256
4081    0.920575
1458    0.920575
3651    0.920575
1939    0.920575
1769    0.920575
964     0.920575
6124    0.920575
Name: 12, dtype: float64

### Step 3: Once similar, recommend movies that are:
* Not watched/ rated by the target user yet
* Rank each movie by how many similar users watched/ rated it


In [32]:
## For user inputs
#create variable to take userid
print("Please enter the user id?")
user_id = int(input())

print("Please enter the number of similar users to be viewed?")
top_x = int(input())

print(user_sim_df[user_id].sort_values(ascending=False)[:top_x])

Please enter the user id?
256
Please enter the number of similar users to be viewed?
5
4624    0.988024
3583    0.988024
326     0.988024
4004    0.988024
6290    0.988024
Name: 256, dtype: float64
