In [467]:
import numpy as np  
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt

In [469]:
# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')

# split the ratings into training and test
ratings_training = ratings.sample(frac=0.7)
ratings_test = ratings.drop(ratings_training.index)

# reading items file:
i_cols = ['movie_id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

# merging ratings and items:
movie_data = pd.merge(ratings_training, items, on='movie_id')  
movie_data = movie_data.drop(['unix_timestamp', 'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'], axis=1)
rating_count = pd.DataFrame(movie_data.groupby('movie_id')['rating'].count()).sort_values(by=['rating'],ascending=False)

In [470]:
# user ratings
# find correlation between users
users_data = movie_data.drop(['movie_id'],axis=1).pivot_table(index='user_id', columns='movie title', values='rating').T
correlation = users_data.corr()

# find nearest neighbors 
def neighbors(user):
    return correlation.filter(items=[user]).sort_values(by=[user], ascending=False, inplace=False).drop([user])

In [471]:
def recommend(user, k):
    # filter matrix to only include neighbors 
    user_corr = neighbors(user).head(k)
    index = user_corr.index.tolist()
    neighbors_data = users_data.filter(items=index)
    
    # normalize neighbors data
    neighbors_average = neighbors_data.mean()
    normalized_neighbors_data = neighbors_data.subtract(neighbors_average, axis=1)

    # sum weigted ratings
    weighted_data = normalized_neighbors_data.mul(user_corr[user], axis=1)
    weighted_sum = weighted_data.sum(axis=1)

    # create correlation matrix 
    # sum correlations 
    correlation_data = neighbors_data.notnull().astype('int').mul(user_corr[user], axis=1)
    correlation_sum = correlation_data.sum(axis=1)
    
    # user ratings mean
    user_mean = users_data[user].mean()
    
    estimated_ratings = user_mean + (weighted_sum/correlation_sum)

    # calculate ratings by dividing weighted_sum by correlation_sum
    return pd.DataFrame(data= estimated_ratings, columns=[user]).sort_values(by=[user], ascending=False, inplace=False)

In [478]:
def error(user, k):
    # filter by user test ratings
    user_test_data = ratings_test[ratings_test['user_id']==user]
    test_data = pd.merge(user_test_data, items, on='movie_id')  
    test_data = test_data.drop(['unix_timestamp', 'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
     'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
     'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western','movie_id'], axis=1)
    test_data = test_data.pivot(index='movie title', columns='user_id', values='rating')

    # find estimated ratings from training data
    training_data = recommend(user,k)

    #calculate root mean squared error 
    return sqrt(test_data.subtract(training_data, axis=1).pow(2).dropna().mean())

In [473]:
def min_error(user):
    error_analysis = []
    for i in range(1,100):
        error_analysis.append(error(user,i))
    return error_analysis.index(min(error_analysis))

In [492]:
error(313,37)

0.86583338417502

In [476]:
recommend(365,37).head(25)

Unnamed: 0_level_0,365
movie title,Unnamed: 1_level_1
Jean de Florette (1986),5.215618
Cold Comfort Farm (1995),5.215618
"Secret of Roan Inish, The (1994)",5.063636
Dances with Wolves (1990),5.063636
Antonia's Line (1995),5.063636
Raise the Red Lantern (1991),5.063636
Home for the Holidays (1995),5.063636
"Birds, The (1963)",5.013097
"Big Sleep, The (1946)",5.013097
Forbidden Planet (1956),4.979132
