# Data Preprocessing and Set up

In [10]:
import pandas as pd
import numpy as np

ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user id', 'movie id', 'rating', 'timestamp'], encoding='latin-1')
movies = pd.read_csv('ml-100k/u.item', sep='|', names=['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', ' Sci-Fi', 'Thriller', 'War', 'Western'], encoding='latin-1')
users = pd.read_csv('ml-100k/u.user', sep='|', names=['user id', 'age', 'gender', 'occupation', 'zip code'], encoding='latin-1')

# Sort by older to newer rating scores as assigned
ratings = ratings.sort_values(by = 'timestamp')

In [11]:
from sklearn.model_selection import train_test_split

def get_data(kind='user'):
    train = pd.DataFrame()
    test = pd.DataFrame()
    
    if kind == 'user':
        for user_id in users['user id']:
            subset_X = ratings.loc[ratings['user id'] == user_id].copy()
            subset_y = ratings.loc[ratings['user id'] == user_id]['user id']
            X_train, X_test, y_train, y_test = train_test_split(subset_X, subset_y, test_size = 0.30, shuffle=False)
            train = train.append(X_train, ignore_index = True)
            test = test.append(X_test, ignore_index = True)
            
    elif kind == 'item':        
        for movie_id in movies['movie id']:
            subset_X = ratings.loc[ratings['movie id'] == movie_id].copy()
            subset_y = ratings.loc[ratings['movie id'] == movie_id]['movie id']
            X_train, X_test, y_train, y_test = train_test_split(subset_X, subset_y, test_size = 0.30, shuffle=False)
            if len(X_train) == 0:
                X_train = X_train.append(X_test, ignore_index = True)
                X_test = pd.DataFrame()
                train = train.append(X_train, ignore_index = True)
                test = test.append(X_test, ignore_index = True)
            else:
                train = train.append(X_train, ignore_index = True)
                test = test.append(X_test, ignore_index = True)
        
    return train, test

In [12]:
from sklearn.metrics import mean_squared_error

def RMSE(y_actual, y_predict):
    return np.sqrt(mean_squared_error(y_actual, y_predict))


def evaluate(model, test):
    pairs = zip(test['user id'], test['movie id'])
    predict = np.array([model(user, movie) for (user, movie) in pairs])
    actual = np.array(test['rating'])
    return RMSE(actual, predict)

# Q1
Build a simple version of the collaborative filter in such a way that outputs the mean rating for the movie by all the users who have rated it. Note that here the ratings of each user is assigned an equal weight. If some movies are available only in the test set and not in the training set, assign default to a rating of 3.0. What is the RMSE score obtained by this model? Provide a snapshot of the console output from you code.

In [13]:
# Define dataset
train_set, test_set = get_data('user')

# R Matrix
r_matrix = train_set.pivot_table(values = 'rating', index = 'user id', columns = 'movie id')

def mean_rating(user_id, movie_id):
    if movie_id in r_matrix:
        rating = r_matrix[movie_id].mean()
    else:
        rating = 3.0
    return rating

print('RMSE score of Q1: '+ str(evaluate(mean_rating, test_set)))

RMSE score of Q1: 1.0682641856279251


# Q2
Build a collaborative recommender system that utilizes the Pearson correlation coefficient to give differential weights to the users. This is to give more preference to those users whose ratings are similar to the user in question than the other users whose ratings are not. If some movies are available only in the test set and not in the training set, assign default to a rating of 3.0, as you did before. What is the RMSE score obtained by this model? Provide a snapshot of the console output from your code.

In [14]:
# Define dataset
train_set, test_set = get_data('user')

# R Matrix
r_matrix = train_set.pivot_table(values = 'rating', index = 'user id', columns = 'movie id')
# Replace missing values by average rating of each user
r_matrix = r_matrix.copy().fillna(r_matrix.mean())

# Pearson Correlation Coefficient Matrix
corr_matrix = r_matrix.transpose().corr(method='pearson')

def weighted_mean_rating(user_id, movie_id):
    corr_mean = corr_matrix[user_id].mean()
    if movie_id in r_matrix:
        user_mean = r_matrix.loc[user_id].mean() # r_a_bar
        # select top 7 users with value threshold >= mean of current user's similarity
        nearest_neighbors = corr_matrix[user_id].loc[corr_matrix[user_id] >= corr_mean].sort_values(ascending = False)[:8]
        weighted_similarity = []
        for n in nearest_neighbors.index:
            if n != user_id:
                result = nearest_neighbors[n] * (r_matrix.loc[n][movie_id] - r_matrix.loc[n].mean()) # sim(a,b) * (r_b - r_b_bar)
                weighted_similarity.append(result)
        # pred = r_a_bar + sum of sim(a,b) * (r_b - r_b_bar) / sum of sim(a,b)
        rating = user_mean + (np.sum(weighted_similarity) / nearest_neighbors.sum())
    else:
        rating = 3.0
    return rating

print('RMSE score of Q2: '+ str(evaluate(weighted_mean_rating, test_set)))

RMSE score of Q2: 1.0503315028071907


# Q3
Build an interactive prediction model in such a way that it returns top 3 movies (movie ID, movie title, predicted rating score, actual rating score) from the testing dataset when a user ID is entered. Do this for 10 random users and report the RMSE score obtained by the interactive sessions? Provide a summary of the results and the RMSE score computed.

In [58]:
# Input section
train_set, test_set = get_data('user')

valid_input = ['fixed', 'random']

select_user = ''

while select_user not in valid_input:
    select_user = input('Please select user testing group "fixed" or "random": ').lower()

# User ids for testing
if select_user == 'fixed':
    user_ids = [1, 7, 99, 171, 177, 200, 340, 555, 857, 937] # fixed for test
else:
    user_ids = np.random.choice(np.array(test_set['user id'].unique()),size = 10, replace = False) # random from test_set

Please select user testing group "fixed" or "random": random


In [60]:
# Recommender System section
from collections import defaultdict

def predict_user_rating(users, test_set):
    recommendation = defaultdict(list)

    for user_id in users:
        # using movie data from test set as required in the question
        for movie_id in test_set['movie id'].unique():
            recommendation[user_id].append(weighted_mean_rating(user_id, movie_id))

    return pd.DataFrame(recommendation, index=test_set['movie id'].unique(), columns=users).transpose()

recommendation = predict_user_rating(user_ids, test_set)

def recommend_top_movies(predicted_table, test_set):
    top = 3
    user_movies = []
    for user in predicted_table.index:
        rated_movies = predicted_table[np.array(test_set.loc[test_set['user id'] == user]['movie id'])]
        top_rating = rated_movies.loc[user].sort_values(ascending = False)[:top]
        top_movies = movies.loc[movies['movie id'].isin(top_rating.index)][['movie id', 'movie title']]
        for t in top_rating.index:
            top_movies.loc[top_movies['movie id'] == t, 'predicted rating score'] = top_rating[t]
            top_movies.loc[top_movies['movie id'] == t, 'actual rating score'] = np.float(test_set.loc[(test_set['user id'] == user) & (test_set['movie id'] == t)]['rating'])

        top_movies = top_movies.sort_values(by = 'predicted rating score', ascending = False)
        user_movies.append(top_movies)
        
    return user_movies

recommended_movies = recommend_top_movies(recommendation, test_set)

In [61]:
# Evaluation section
actual = []
predict = []

for user_movies in recommended_movies:
    display(user_movies)
    user_actual = list(user_movies['actual rating score'])
    user_predict = list(user_movies['predicted rating score'])
    print('RMSE Score: '+ str(RMSE(user_actual, user_predict)))
    print('')
    actual = actual + user_actual
    predict = predict + user_predict
    
print('RMSE score of Q3: '+ str(RMSE(actual, predict)))

Unnamed: 0,movie id,movie title,predicted rating score,actual rating score
367,368,Bio-Dome (1996),4.790471,1.0
927,928,"Craft, The (1996)",4.556793,3.0
546,547,"Young Poisoner's Handbook, The (1995)",4.354293,4.0


RMSE Score: 2.3746436157009985



Unnamed: 0,movie id,movie title,predicted rating score,actual rating score
271,272,Good Will Hunting (1997),4.489538,4.0
347,348,Desperate Measures (1998),4.37317,3.0
314,315,Apt Pupil (1998),4.208649,5.0


RMSE Score: 0.9576844961239827



Unnamed: 0,movie id,movie title,predicted rating score,actual rating score
627,628,Sleepers (1996),4.552925,5.0
282,283,Emma (1996),4.053329,4.0
236,237,Jerry Maguire (1996),4.047153,4.0


RMSE Score: 0.26137060045556443



Unnamed: 0,movie id,movie title,predicted rating score,actual rating score
546,547,"Young Poisoner's Handbook, The (1995)",4.362415,3.0
9,10,Richard III (1995),4.080126,3.0
255,256,When the Cats Away (Chacun cherche son chat) (...,4.050346,4.0


RMSE Score: 1.0042212668093684



Unnamed: 0,movie id,movie title,predicted rating score,actual rating score
178,179,"Clockwork Orange, A (1971)",4.273094,5.0
521,522,Down by Law (1986),4.273094,5.0
156,157,Platoon (1986),4.238842,2.0


RMSE Score: 1.4223454871270722



Unnamed: 0,movie id,movie title,predicted rating score,actual rating score
314,315,Apt Pupil (1998),3.971679,4.0
242,243,Jungle2Jungle (1997),3.863039,3.0
900,901,Mr. Magoo (1997),3.728117,4.0


RMSE Score: 0.5226724823514199



Unnamed: 0,movie id,movie title,predicted rating score,actual rating score
431,432,Fantasia (1940),4.412701,5.0
371,372,Jeffrey (1995),4.240548,5.0
186,187,"Godfather: Part II, The (1974)",4.029208,3.0


RMSE Score: 0.812600132686855



Unnamed: 0,movie id,movie title,predicted rating score,actual rating score
468,469,Short Cuts (1993),4.494133,5.0
78,79,"Fugitive, The (1993)",4.028278,3.0
461,462,Like Water For Chocolate (Como agua para choco...,4.028278,5.0


RMSE Score: 0.8674677187047659



Unnamed: 0,movie id,movie title,predicted rating score,actual rating score
268,269,"Full Monty, The (1997)",4.818669,3.0
331,332,Kiss the Girls (1997),4.617661,3.0
627,628,Sleepers (1996),4.560585,2.0


RMSE Score: 2.0396880216933226



Unnamed: 0,movie id,movie title,predicted rating score,actual rating score
506,507,"Streetcar Named Desire, A (1951)",4.581325,4.0
426,427,To Kill a Mockingbird (1962),4.505228,4.0
87,88,Sleepless in Seattle (1993),4.144461,4.0


RMSE Score: 0.45242416137084157

RMSE score of Q3: 1.2532738772521947


# Q4
For this question, build an item-based collaborative recommender system that utilizes the Cosine similarity measure instead of Pearson. What is the RMSE score obtained by your proposed model? Provide a snapshot of the console output from your code.

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# Define dataset
train_set, test_set = get_data('item')

# R Matrix
r_matrix = train_set.pivot_table(values='rating', index='movie id', columns='user id')
# Replace missing value by mean
r_matrix = r_matrix.copy().fillna(r_matrix.mean())

# Cosine similarity matrix
cosine_sim = cosine_similarity(r_matrix, r_matrix)
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)

def item_based_rating(user_id, movie_id):
    if user_id in r_matrix:
        similarity = cosine_sim[movie_id] # similary of this movie to others
        movie_rating = r_matrix[user_id] # r_u at movie ith
        rating = np.dot(similarity, movie_rating) / similarity.sum() # sum of sim(i,p) * r_u,i / sum of sim(i,p)
    else:
        rating = 3.0
    return rating

print('RMSE score of Q4: '+ str(evaluate(item_based_rating, test_set)))

RMSE score of Q4: 1.2212350952371644
