In [180]:
import numpy as np  
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt

In [181]:
#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')

# Split the ratings into training (70%) and test (remaining 30%).
# The seed is fixed so that the split -- and therefore every correlation,
# recommendation and error figure derived from it -- is the same on every run.
SPLIT_SEED = 42
ratings_training = ratings.sample(frac=0.7, random_state=SPLIT_SEED)
ratings_test = ratings.drop(ratings_training.index)

#Reading items file:
i_cols = ['movie_id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

#Merging ratings and items:
# Only the title is needed from the items table, so merge just that column
# instead of merging everything and dropping the other item columns again.
# Resulting columns: user_id, movie_id, rating, movie title.
movie_data = pd.merge(ratings_training, items[['movie_id', 'movie title']], on='movie_id')
movie_data = movie_data.drop(columns=['unix_timestamp'])

# Number of training ratings per title, most-rated titles first.
rating_count = pd.DataFrame(movie_data.groupby('movie title')['rating'].count()).sort_values(by=['rating'],ascending=False)

In [182]:
# User x title rating matrix: one row per user_id, one column per movie title.
# Cells hold the rating; titles a user has not rated in the training data are NaN.
items_data = (
    movie_data
    .drop(columns=['movie_id'])
    .pivot_table(values='rating', index='user_id', columns='movie title')
)

# DataFrame.corr() correlates the columns pairwise, so this is a
# title-by-title (item-item) correlation matrix, not a user-user one.
correlation = items_data.corr()

def neighbors(item):
    """Return every other title ranked by its correlation with ``item``.

    The result is a one-column DataFrame (column label ``item``) taken from
    the global ``correlation`` matrix, sorted from most to least correlated,
    with the row for ``item`` itself removed.
    """
    item_column = correlation.loc[:, [item]]
    ranked = item_column.sort_values(item, ascending=False)
    # A title is trivially its own best match, so leave it out.
    return ranked.drop(index=item)

In [183]:
def estimate_rating(user,item, k):
    """Estimate the rating ``user`` would give ``item`` from similar titles.

    The ``k`` titles most correlated with ``item`` are taken from
    ``neighbors(item)`` and those with a non-positive correlation are
    discarded. The user's ratings of the remaining titles are mean-centred
    per title, weighted by correlation, and normalised by the summed
    correlation of the titles the user actually rated. The mean rating of
    ``item`` is then added back.

    Returns a DataFrame with a single row labelled ``item`` and one column
    per row of ``items_data`` matching ``user``. The value is NaN when the
    user rated none of the selected neighbours. The estimate is not clipped
    to the rating scale.
    """
    # k most correlated titles, restricted to strictly positive correlations
    top_k = neighbors(item).head(k)
    positive = top_k[top_k[item] > 0]
    weights = positive[item]
    neighbor_titles = positive.index.tolist()

    # all users' ratings of the neighbour titles, and this user's row of it
    neighbor_ratings = items_data.filter(items=neighbor_titles)
    user_ratings = neighbor_ratings.filter(items=[user], axis=0)

    # numerator: correlation-weighted sum of the user's mean-centred ratings
    centred = user_ratings.subtract(neighbor_ratings.mean(), axis=1)
    numerator = centred.mul(weights, axis=1).sum(axis=1)

    # denominator: sum of correlations over the neighbours the user has rated
    rated_mask = user_ratings.notnull().astype('int')
    denominator = rated_mask.mul(weights, axis=1).sum(axis=1)

    # shift the weighted deviation back onto the target title's mean rating
    baseline = items_data[item].mean()
    prediction = baseline + numerator / denominator

    return pd.DataFrame(data=prediction, columns=[item]).T

In [184]:
def recommend(user, k):
    """Estimate ``user``'s rating for every training title and rank them.

    Parameters
    ----------
    user : label passed to ``estimate_rating``; also the label of the
        result column that the ranking is sorted by.
    k : neighbourhood size forwarded to ``estimate_rating``.

    Returns
    -------
    DataFrame indexed by movie title with one column labelled ``user``,
    sorted from highest to lowest estimate. It is empty when
    ``rating_count`` lists no titles.
    """
    titles = rating_count.index.tolist()
    if not titles:
        # pd.concat refuses an empty list; hand back an empty ranking instead.
        return pd.DataFrame(columns=[user])

    # DataFrame.append no longer exists in pandas 2.x, and appending inside
    # the loop re-copied the accumulated frame on every iteration. Collect the
    # one-row estimates first and concatenate them in a single call.
    per_title = [estimate_rating(user, title, k) for title in titles]
    ratings_df = pd.concat(per_title)
    return ratings_df.sort_values(by=[user], ascending=False)

In [185]:
def error(user, k):
    """Root mean squared error of the estimates for ``user`` on the test split.

    The user's held-out ratings in ``ratings_test`` are compared, title by
    title, with the estimates produced by ``recommend(user, k)`` from the
    training data. Titles with no estimate (NaN) or no held-out rating are
    left out of the average.

    Returns a float, or NaN when there is no title to compare.
    """
    # held-out ratings of this user
    user_test_data = ratings_test[ratings_test['user_id'] == user]
    if user_test_data.empty:
        return float('nan')

    # Attach titles; only the title column of the items table is needed.
    test_data = pd.merge(user_test_data, items[['movie_id', 'movie title']], on='movie_id')

    # Different movie_ids can share a title, which made DataFrame.pivot raise
    # on duplicate index entries. Average such ratings per title instead,
    # matching the pivot_table aggregation used for the training matrix.
    actual = test_data.groupby('movie title')['rating'].mean()

    # estimated ratings from the training data, indexed by title
    predicted = recommend(user, k)[user]

    # Subtraction aligns on title; unmatched titles become NaN and are dropped.
    squared_errors = (actual - predicted).pow(2).dropna()
    if squared_errors.empty:
        return float('nan')

    # Pass a scalar to math.sqrt rather than relying on implicit conversion
    # of a one-element Series to float.
    return sqrt(squared_errors.mean())

In [194]:
# Root mean squared error of the k=37 estimates for user 313, measured on that user's held-out test ratings.
error(313,37)

1.1341543348393632

In [187]:
# Top 40 titles for user 19, ranked by estimated rating using k=37 neighbours per title.
recommend(19,37).head(40)

user_id,19
To Live (Huozhe) (1994),5.71875
"Whole Wide World, The (1996)",5.66875
"39 Steps, The (1935)",5.576858
"Umbrellas of Cherbourg, The (Parapluies de Cherbourg, Les) (1964)",5.527574
Notorious (1946),5.501008
"Treasure of the Sierra Madre, The (1948)",5.433036
Pather Panchali (1955),5.150926
"Ghost and Mrs. Muir, The (1947)",5.122596
SubUrbia (1997),5.040179
Orlando (1993),5.007212
