In [1]:
import pandas as pd
import numpy as np

In [2]:
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('./dataset/movielens/users.dat', sep='::', header=None, names=unames, engine='python')

In [3]:
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [4]:
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('./dataset/movielens/ratings.dat', sep='::', header=None, names=rnames, engine='python')

In [5]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('./dataset/movielens/movies.dat', sep='::', header=None, names=mnames, engine='python')

In [7]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
data = pd.merge(pd.merge(ratings, users), movies)

In [9]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


In [10]:
data1 = pd.pivot_table(data, values='rating', index='user_id', columns='movie_id')

In [11]:
data1.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,


In [12]:
data1.loc[5,6]

2.0

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
data1 = data1.apply(lambda x:x-x.mean(), axis=1) # decentralization

In [15]:
data1.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.811321,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,-1.146465,,,,,...,,,,,,,,,,


In [16]:
cos_s = []
for i in range(len(data1)):
    cos_s.append(cosine_similarity(np.nan_to_num(data1.iloc[0,:]).reshape(1, -1), np.nan_to_num(data1.iloc[i,:]).reshape(1, -1)))

In [17]:
sim = pd.Series(cos_s, index=data1.index)

In [18]:
data2 = data1.assign(sim = cos_s)

In [19]:
data2.sort_values(by='sim', ascending=False, inplace=True)

In [20]:
data3 = data2.iloc[1:6, :].copy()

In [21]:
data3 # Top 5 users with high sim

movie_id,1,2,3,4,5,6,7,8,9,10,...,3944,3945,3946,3947,3948,3949,3950,3951,3952,sim
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1337,,,,,,,,,,,...,,,,,,,,,,[[0.18924150954139235]]
379,,,,,,,,,,,...,,,,,,,,,,[[0.1598926939188515]]
5404,1.152381,,,,,,,,,,...,,,,,,,,,,[[0.15515416058858936]]
49,1.287037,,,,,,,,,,...,,,,,,,,,,[[0.14845529673917254]]
2607,,,,,,,,-1.177419,,,...,,,,,,,,,,[[0.14810498948254988]]


In [22]:
data3.dropna(how='all', axis=1, inplace=True)

In [23]:
data3 # Drop those who never rated any movie

movie_id,1,8,11,13,17,21,25,32,34,36,...,3776,3785,3789,3793,3916,3926,3927,3928,3930,sim
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1337,,,,,,,,,1.184615,,...,-0.815385,,,,1.184615,0.184615,0.184615,0.184615,-0.815385,[[0.18924150954139235]]
379,,,-0.234375,,-0.234375,,,,,,...,,,,,,,,,,[[0.1598926939188515]]
5404,1.152381,,0.152381,,,,1.152381,-1.847619,-0.847619,0.152381,...,,,,,,,,,,[[0.15515416058858936]]
49,1.287037,,,0.287037,,-0.712963,,,,,...,,-0.712963,,-0.712963,,,,,,[[0.14845529673917254]]
2607,,-1.177419,,,,,,,-1.177419,-1.177419,...,,,-1.177419,,,,,,,[[0.14810498948254988]]


In [24]:
data3.sim.map(lambda x:x[0][0])

user_id
1337    0.189242
379     0.159893
5404    0.155154
49      0.148455
2607    0.148105
Name: sim, dtype: float64

In [25]:
data4 = (data3[data3.columns[:-1]].fillna(0).apply(lambda x:x*data3.sim.map(lambda x:x[0][0]))).sum()

In [26]:
data4_ = data4/data3[data3.columns[:-1]].sum()

In [27]:
data4_.sort_values(ascending=False, inplace=True)
data4_[:10]             # Top 10 movie that user "1"likes

movie_id
1282    0.640816
3755    0.314737
594     0.250895
1307    0.226490
783     0.196902
2294    0.196902
1907    0.196902
2846    0.189242
2717    0.189242
2430    0.189242
dtype: float64

In [28]:
recommend_ = data4_[:10].index

In [29]:
len(recommend_)

10

In [30]:
recommend_

Index([1282, 3755, 594, 1307, 783, 2294, 1907, 2846, 2717, 2430], dtype='object', name='movie_id')

In [31]:
have_seen = data2.columns[~data2.iloc[0,:].isnull()][:-1] # Movies that User 1 has already watched

In [32]:
have_seen

Index([   1,   48,  150,  260,  527,  531,  588,  594,  595,  608,  661,  720,
        745,  783,  914,  919,  938, 1022, 1028, 1029, 1035, 1097, 1193, 1197,
       1207, 1246, 1270, 1287, 1545, 1566, 1721, 1836, 1907, 1961, 1962, 2018,
       2028, 2294, 2321, 2340, 2355, 2398, 2687, 2692, 2762, 2791, 2797, 2804,
       2918, 3105, 3114, 3186, 3408],
      dtype='object', name='movie_id')

In [33]:
len(set(recommend_) - set(have_seen))  # Get movies that user 1 has never watched before

6

In [34]:
recommend = set(recommend_) - set(have_seen)

In [35]:
recommend

{1282, 1307, 2430, 2717, 2846, 3755}

In [36]:
movies[movies.movie_id.isin(recommend)]   # Get the names of recommended movies

Unnamed: 0,movie_id,title,genres
1262,1282,Fantasia (1940),Animation|Children's|Musical
1287,1307,When Harry Met Sally... (1989),Comedy|Romance
2361,2430,Mighty Joe Young (1949),Adventure|Children's|Drama
2648,2717,Ghostbusters II (1989),Comedy|Horror
2777,2846,"Adventures of Milo and Otis, The (1986)",Children's
3686,3755,"Perfect Storm, The (2000)",Action|Adventure|Thriller


In [37]:
most_love = (data2.iloc[0].sort_values(ascending=False)[1:19]).index

In [38]:
data2.iloc[0].sort_values(ascending=False)[1:19] # Get movies that User 1 likes the most

movie_id
2028    0.811321
48      0.811321
150     0.811321
527     0.811321
595     0.811321
1022    0.811321
1028    0.811321
1029    0.811321
1035    0.811321
1193    0.811321
1287    0.811321
1836    0.811321
1961    0.811321
1270    0.811321
1       0.811321
2804    0.811321
2355    0.811321
3105    0.811321
Name: 1, dtype: object

In [39]:
most_love

Index([2028,   48,  150,  527,  595, 1022, 1028, 1029, 1035, 1193, 1287, 1836,
       1961, 1270,    1, 2804, 2355, 3105],
      dtype='object', name='movie_id')

In [40]:
movies[movies.movie_id.isin(most_love)]  # Get names and genres

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
47,48,Pocahontas (1995),Animation|Children's|Musical|Romance
148,150,Apollo 13 (1995),Drama
523,527,Schindler's List (1993),Drama|War
591,595,Beauty and the Beast (1991),Animation|Children's|Musical
1009,1022,Cinderella (1950),Animation|Children's|Musical
1015,1028,Mary Poppins (1964),Children's|Comedy|Musical
1016,1029,Dumbo (1941),Animation|Children's|Musical
1022,1035,"Sound of Music, The (1965)",Musical
1176,1193,One Flew Over the Cuckoo's Nest (1975),Drama


# Collaborative Filtering

In [41]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


In [65]:
data5 = pd.pivot_table(data, values='rating', columns='user_id', index='movie_id')

In [66]:
data5 = data5.apply(lambda x:x-x.mean(), axis=1)

In [67]:
data5.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.853154,,,,,-0.146846,,-0.146846,0.853154,0.853154,...,,-0.146846,,,-0.146846,,,,,-1.146846
2,,,,,,,,,,1.798859,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,-2.016736,,,,,
4,,,,,,,,0.270588,,,...,,,,,-0.729412,-0.729412,,,,
5,,,,,,,,,,,...,,,,,-2.006757,,,,,


In [84]:
cos_s = []
for i in range(len(data5)):
    cos_s.append(cosine_similarity(np.nan_to_num(data5.loc[2,:]).reshape(1, -1), np.nan_to_num(data5.iloc[i,:]).reshape(1, -1)))
new_ratings = pd.DataFrame({'similarity': cos_s, 'rating': data5.loc[:, target_user_id]})
top = new_ratings.dropna().sort_values('similarity', ascending=False)[:3].copy()
top['multiple'] = top['rating']*top['similarity']
result = top['multiple'].sum()/top['similarity'].sum()

In [85]:
result

array([[-0.02897561]])

In [86]:
cos_s = []
for i in range(len(data5)):
    cos_s.append(cosine_similarity(np.nan_to_num(data5.loc[3,:]).reshape(1, -1), np.nan_to_num(data5.iloc[i,:]).reshape(1, -1)))
new_ratings = pd.DataFrame({'similarity': cos_s, 'rating': data5.loc[:, target_user_id]})
top = new_ratings.dropna().sort_values('similarity', ascending=False)[:3].copy()
top['multiple'] = top['rating']*top['similarity']
result = top['multiple'].sum()/top['similarity'].sum()

In [87]:
result

array([[0.71984767]])