In [4]:
import pandas as pd
import numpy as np
import math
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt

In [5]:
#Reading users file:
u_cols = ['users_id','age','sex','occupation','zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', \
                    names=u_cols,encoding='latin-1')

#Reading ratings file:
r_cols = ['user_id','movie_id','rating','unix_timestamp']
df = pd.read_csv('ml-100k/u.data',sep='\t',names=r_cols,\
                     encoding='latin-1')

#Reading items file:
i_cols = ['movie_id','movie_title','release_date','video_release_date',\
         'IMDB_URL','unknown','Action','Adventure','Animation',\
         'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama',\
         'Fantasy','Film-Noir', 'Horror', 'Musical', 'Mystery',\
         'Romance','Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item',sep='|',names=i_cols,\
                    encoding='latin-1')

In [6]:
print("\nUsers:\n\n",users.head())
print("\n\nData:\n\n",df.head())
print("\n\nItems:\n\n",items.head())


Users:

    users_id  age sex  occupation zip_code
0         1   24   M  technician    85711
1         2   53   F       other    94043
2         3   23   M      writer    32067
3         4   24   M  technician    43537
4         5   33   F       other    15213


Data:

    user_id  movie_id  rating  unix_timestamp
0      196       242       3       881250949
1      186       302       3       891717742
2       22       377       1       878887116
3      244        51       2       880606923
4      166       346       1       886397596


Items:

    movie_id        movie_title release_date  video_release_date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                                            

In [7]:
print("Users Shape:",users.shape)

Users Shape: (943, 5)


In [8]:
train_data, test_data = cv.train_test_split(df, test_size=0.25)

In [9]:
n_users = df.user_id.unique().shape[0]
n_items = df.movie_id.unique().shape[0]
print ('Number of users = ' + str(n_users) +\
       ' | Number of movies = ' + str(n_items))

Number of users = 943 | Number of movies = 1682


In [10]:
train_data_matrix = np.zeros((n_users,n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [11]:
user_similarity = pairwise_distances(train_data_matrix,\
                                     metric = 'cosine')
print(user_similarity)
print(user_similarity.shape)

[[0.         0.84258758 0.95301126 ... 0.8308281  0.85961819 0.68009021]
 [0.84258758 0.         0.91657638 ... 0.97309552 0.87421957 0.89097726]
 [0.95301126 0.91657638 0.         ... 0.87598005 0.87170751 0.98006391]
 ...
 [0.8308281  0.97309552 0.87598005 ... 0.         0.86897954 0.93232308]
 [0.85961819 0.87421957 0.87170751 ... 0.86897954 0.         0.89864194]
 [0.68009021 0.89097726 0.98006391 ... 0.93232308 0.89864194 0.        ]]
(943, 943)


In [12]:
def predict(ratings, similarity, type='user'):
    if(type == 'user'):
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + \
               similarity.dot(ratings_diff) /\
               np.array([np.abs(similarity).sum(axis=1)]).T
    return pred
user_prediction = predict(train_data_matrix,user_similarity,\
                          type='user')

In [13]:
print(user_prediction[0])
print(user_prediction.shape)

[1.60775522 0.57583549 0.4695317  ... 0.28944176 0.28908111 0.28891225]
(943, 1682)


In [14]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction,ground_truth))

print('User-based CF RMSE: ' + str(rmse(user_prediction,\
                                        test_data_matrix)))

User-based CF RMSE: 3.1175065101707937


In [15]:
print(user_prediction.shape)
print((user_prediction[0].shape))
print(user_prediction[0])

(943, 1682)
(1682,)
[1.60775522 0.57583549 0.4695317  ... 0.28944176 0.28908111 0.28891225]


In [16]:
df_temp1 = pd.DataFrame(user_prediction[0],columns=['simi'])
#print(df)

df_temp2 = df_temp1.copy()

df_temp2.sort_values("simi", inplace=True,ascending=False)
df_temp2 = df_temp2.reset_index()
#print(df_temp2)
df_temp2.columns=['movie_id','simi']
for i in range(len(df_temp2)):
    df_temp2.iloc[i]+=1
merged = pd.merge(df_temp2,items)

merged.sort_values("simi",inplace=True,ascending=False)
print("user887")
merged.head(20)

user887


Unnamed: 0,movie_id,simi,movie_title,release_date,video_release_date,IMDB_URL,unknown,Action,Adventure,Animation,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,50,3.057526,Star Wars (1977),01-Jan-1977,,http://us.imdb.com/M/title-exact?Star%20Wars%2...,0,1,1,0,...,0,0,0,0,0,1,1,0,1,0
1,100,2.952313,Fargo (1996),14-Feb-1997,,http://us.imdb.com/M/title-exact?Fargo%20(1996),0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,258,2.823251,Contact (1997),11-Jul-1997,,http://us.imdb.com/Title?Contact+(1997/I),0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,181,2.77663,Return of the Jedi (1983),14-Mar-1997,,http://us.imdb.com/M/title-exact?Return%20of%2...,0,1,1,0,...,0,0,0,0,0,1,1,0,1,0
4,286,2.659089,"English Patient, The (1996)",15-Nov-1996,,http://us.imdb.com/M/title-exact?English%20Pat...,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
5,174,2.63536,Raiders of the Lost Ark (1981),01-Jan-1981,,http://us.imdb.com/M/title-exact?Raiders%20of%...,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
6,1,2.607755,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,300,2.598174,Air Force One (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?Air+Force+One...,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
8,127,2.579842,"Godfather, The (1972)",01-Jan-1972,,"http://us.imdb.com/M/title-exact?Godfather,%20...",0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,288,2.555809,Scream (1996),20-Dec-1996,,http://us.imdb.com/M/title-exact?Scream%20(1996),0,0,0,0,...,0,0,1,0,0,0,0,1,0,0


In [19]:
print("user565")
merged.head(20)

user565


Unnamed: 0,movie_id,simi,movie_title,release_date,video_release_date,IMDB_URL,unknown,Action,Adventure,Animation,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,50,3.135607,Star Wars (1977),01-Jan-1977,,http://us.imdb.com/M/title-exact?Star%20Wars%2...,0,1,1,0,...,0,0,0,0,0,1,1,0,1,0
1,100,2.79734,Fargo (1996),14-Feb-1997,,http://us.imdb.com/M/title-exact?Fargo%20(1996),0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,181,2.743943,Return of the Jedi (1983),14-Mar-1997,,http://us.imdb.com/M/title-exact?Return%20of%2...,0,1,1,0,...,0,0,0,0,0,1,1,0,1,0
3,258,2.731809,Contact (1997),11-Jul-1997,,http://us.imdb.com/Title?Contact+(1997/I),0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,286,2.689254,"English Patient, The (1996)",15-Nov-1996,,http://us.imdb.com/M/title-exact?English%20Pat...,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
5,127,2.565457,"Godfather, The (1972)",01-Jan-1972,,"http://us.imdb.com/M/title-exact?Godfather,%20...",0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,2.511823,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,288,2.500493,Scream (1996),20-Dec-1996,,http://us.imdb.com/M/title-exact?Scream%20(1996),0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
8,174,2.487637,Raiders of the Lost Ark (1981),01-Jan-1981,,http://us.imdb.com/M/title-exact?Raiders%20of%...,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
9,98,2.480076,"Silence of the Lambs, The (1991)",01-Jan-1991,,http://us.imdb.com/M/title-exact?Silence%20of%...,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
