In [1]:
import pandas as pd
import numpy as np
import math
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt



In [2]:
#Reading users file:
u_cols = ['users_id','age','sex','occupation','zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', \
                    names=u_cols,encoding='latin-1')

#Reading ratings file:
r_cols = ['user_id','movie_id','rating','unix_timestamp']
df = pd.read_csv('ml-100k/u.data',sep='\t',names=r_cols,\
                     encoding='latin-1')

#Reading items file:
i_cols = ['movie_id','movie_title','release_date','video_release_date',\
         'IMDB_URL','unknown','Action','Adventure','Animation',\
         'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama',\
         'Fantasy','Film-Noir', 'Horror', 'Musical', 'Mystery',\
         'Romance','Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item',sep='|',names=i_cols,\
                    encoding='latin-1')

In [3]:
print("\nUsers:\n\n",users.head())
print("\n\nData:\n\n",df.head())
print("\n\nItems:\n\n",items.head())


Users:

    users_id  age sex  occupation zip_code
0         1   24   M  technician    85711
1         2   53   F       other    94043
2         3   23   M      writer    32067
3         4   24   M  technician    43537
4         5   33   F       other    15213


Data:

    user_id  movie_id  rating  unix_timestamp
0      196       242       3       881250949
1      186       302       3       891717742
2       22       377       1       878887116
3      244        51       2       880606923
4      166       346       1       886397596


Items:

    movie_id        movie_title release_date  video_release_date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                                            

In [4]:
print("Users Shape:",users.shape)

Users Shape: (943, 5)


In [5]:
train_data, test_data = cv.train_test_split(df, test_size=0.25)

In [6]:
n_users = df.user_id.unique().shape[0]
n_items = df.movie_id.unique().shape[0]
print ('Number of users = ' + str(n_users) +\
       ' | Number of movies = ' + str(n_items))

Number of users = 943 | Number of movies = 1682


In [7]:
train_data_matrix = np.zeros((n_users,n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [8]:
user_similarity = pairwise_distances(train_data_matrix,\
                                     metric = 'cosine')

In [9]:
def predict(ratings, similarity, type='user'):
    if(type == 'user'):
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + \
               similarity.dot(ratings_diff) /\
               np.array([np.abs(similarity).sum(axis=1)]).T
    return pred
user_prediction = predict(train_data_matrix,user_similarity,\
                          type='user')

In [11]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction,ground_truth))

print('User-based CF RMSE: ' + str(rmse(user_prediction,\
                                        test_data_matrix)))

User-based CF RMSE: 3.1106665732662644
