In [127]:
import numpy as np  
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt

In [128]:
#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')

# split the ratings into training and test
ratings_training = ratings.sample(frac=0.7)
ratings_test = ratings.drop(ratings_training.index)

#Reading items file:
i_cols = ['movie_id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

#Merging ratings and items:
movie_data = pd.merge(ratings_training, items, on='movie_id')  
movie_data = movie_data.drop(['unix_timestamp', 'release date', 'video release date', 'IMDb URL'], axis=1)
mean = pd.DataFrame(movie_data.groupby('user_id')['rating'].mean()).reset_index()

In [129]:
#create user profile
def user_vector(user):
    #filter movie data to only show user ratings
    #set movie title to index
    user_ratings = movie_data[movie_data['user_id']==user].set_index('movie title')
    user_ratings = user_ratings.drop(['user_id', 'movie_id'], axis=1)

    #calculate user mean
    #subtract each item rating by mean to normalize ratings
    #multiply content attributes by rating 
    userMean = mean[mean['user_id']==user].rating[user-1]
    ratings = user_ratings['rating']
    ratings -= userMean
    normalized_ratings = user_ratings.mul(user_ratings['rating'], axis=0).drop(['rating'], axis=1)

    #sum all movie vectors to compute user profile
    user_profile = normalized_ratings.sum(axis = 0, skipna = True) 
    #normalize user profile
    user_profile = user_profile / user_profile.sum()
    return user_profile.to_frame()

In [219]:
#make recommendations
def recommend(user):
    user_profile = user_vector(user)

    #item profiles 
    item_profiles = items.drop(['release date','movie title', 'movie_id','video release date', 'IMDb URL'],axis=1).reset_index(drop=True)

    #estimate item ratings
    #sort and recommend top 10 ratings
    ratings = item_profiles.dot(user_profile).set_index(items.set_index('movie title').index)
    ratings.columns = [user]
    ratings = ratings.sort_values(by=[user],ascending=False)
    ratings = ratings[ratings[user]>0]
    highest_rating = ratings.max()
    ratings = ratings.mul(5/highest_rating, axis=1)
    return ratings

In [220]:
def error(user):
    # filter by user test ratings
    user_test_data = ratings_test[ratings_test['user_id']==user]
    test_data = pd.merge(user_test_data, items, on='movie_id')  
    test_data = test_data.drop(['unix_timestamp', 'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
     'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
     'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western','movie_id'], axis=1)
    test_data = test_data.pivot(index='movie title', columns='user_id', values='rating')

    # find estimated ratings from training data
    training_data = recommend(user)

    #calculate root mean squared error 
    return sqrt(test_data.subtract(training_data, axis=1).pow(2).dropna().mean())

In [223]:
error(313)

2.131540137507776

In [221]:
userMean = mean[mean['user_id']==197]
user_ratings = movie_data[movie_data['user_id']==197].set_index('movie title')
user_ratings = user_ratings.drop(['user_id', 'movie_id'], axis=1)

def count(user):
    number = movie_data[movie_data['user_id']==user].set_index('movie title')
    count = number['user_id'].count()
    return [user,count]
review_count = []
for i in range(1,944):
     review_count.append(count(i))

review_count_df =pd.DataFrame(review_count,columns=['user','review count']).set_index('user').sort_values(by=['review count'],ascending=False)
review_count_df.head(10)

Unnamed: 0_level_0,review count
user,Unnamed: 1_level_1
405,516
655,491
13,472
450,373
276,354
234,350
303,346
416,337
537,337
279,324
