# Movie Recommender code for STAT542 final project
#### 05/09/2017

## Load necessary packages

In [1]:
# Record the wall-clock start time of the whole script
all.start.time <- Sys.time()

# Silence warning messages
options(warn = -1)

# Install any required package that cannot be loaded yet
invisible(lapply(
  c("recommenderlab", "reshape2", "ggplot2"),
  function(pkg) {
    if (!require(pkg, character.only = TRUE)) {
      install.packages(pkg)
    }
  }
))

# Attach the packages (errors out if an installation above failed)
library(recommenderlab)
library(reshape2)
library(ggplot2)

Loading required package: recommenderlab
Loading required package: Matrix
Loading required package: arules

Attaching package: ‘arules’

The following objects are masked from ‘package:base’:

    abbreviate, write

Loading required package: proxy

Attaching package: ‘proxy’

The following object is masked from ‘package:Matrix’:

    as.matrix

The following objects are masked from ‘package:stats’:

    as.dist, dist

The following object is masked from ‘package:base’:

    as.matrix

Loading required package: registry
Loading required package: reshape2
Loading required package: ggplot2


## Read training and testing data into memory

In [2]:
# Load the training ratings. colClasses is recycled across the columns, so
# every second ':'-separated field is skipped ("NULL") and the others are read
# as integers -- presumably because the file delimits fields with "::"
# (TODO confirm against the data file).
train <- read.csv(
  "../data/train.dat",
  header = FALSE,
  sep = ":",
  colClasses = c("integer", "NULL")
)
# Give the four retained columns meaningful names
names(train) <- c("UserID", "MovieID", "Rating", "Timestamp")
# The timestamp is not used; drop it
train$Timestamp <- NULL
# Preview the first rows and report the number of training ratings
train[1:5, ]
nrow(train)

UserID,MovieID,Rating
1,661,3
1,3408,4
1,2355,5
1,1197,3
1,1287,5


In [3]:
# read in testing data
test_raw = read.csv("../data/test.csv",header=TRUE,sep = ',')
test_raw[1:5,]
test = test_raw[,-c(1)]
colnames(test) = c('UserID','MovieID')
test$Rating = NA
test[1:5,]
nrow(test)

ID,user,movie
1,1,1193
2,1,914
3,1,938
4,1,2918
5,1,720


UserID,MovieID,Rating
1,1193,
1,914,
1,938,
1,2918,
1,720,


In [4]:
# combine the train data with the test data
all_data = rbind(train,test)
all_data[600122:600130,]
nrow(all_data)

Unnamed: 0,UserID,MovieID,Rating
600122,6040,2020,3.0
600123,6040,2028,5.0
600124,6040,1091,1.0
600125,6040,1094,5.0
600126,6040,562,5.0
600127,1,1193,
600128,1,914,
600129,1,938,
600130,1,2918,


## Data transformation and model building

In [5]:
# using acast to transform data into user-movie matrix
data_acast = acast(all_data, UserID ~ MovieID)
temp_R = as.matrix(data_acast)  # covert it to a matrix
# convert the matrix R into a realRatingMatrix data structure
real_rating_mat = as(temp_R, "realRatingMatrix")
# we can also normalize the matrix
norm_real_rating_mat = normalize(real_rating_mat)  # not necessarily useful later

Using Rating as value column: use value.var to override.


In [7]:
# Building a recommender object/model to train and fit
#recom_model = Recommender(real_rating_mat[1:nrow(real_rating_mat)],method="UBCF", 
#                        param=list(normalize = "Z-score",method="Cosine",nn=5, minRating=1))
recom_model = Recommender(real_rating_mat[1:nrow(real_rating_mat)],method="UBCF", 
                          param=list(normalize = "Z-score",method="Jaccard",nn=5, minRating=1))
# examine what we got for the mdoel
# Depending upon your selection, examine what you got
print(recom_model)
names(getModel(recom_model))
getModel(recom_model)$nn

recom = predict(recom_model, real_rating_mat[1:nrow(real_rating_mat)], type="ratings")

Available parameter (with default values):
method	 =  cosine
nn	 =  25
sample	 =  FALSE
normalize	 =  center
verbose	 =  FALSE
Recommender of type ‘UBCF’ for ‘realRatingMatrix’ 
learned using 6040 users.


## Create submission files and calculate performance (RMSE)

In [8]:
# create the submission files
recom_list=as(recom,"list")
ratings = NULL
for ( u_id in 1:length(test[,1]))
{    # read the userID and movieID from column 1 and column 2 of test data
    userid = test[u_id,1]
    movieid = test[u_id,2]
    this_user = as.data.frame(recom_list[[userid]])  
    # find all the movie ratings for this user to a dataframe for rating retrieval
    this_user$id=row.names(this_user)
    X_rating = this_user[this_user$id==movieid,1]
    ratings[u_id] = ifelse(length(X_rating) == 0, 0, X_rating)
}
length(ratings)
test_raw$rating = ratings
write.table(test_raw,file="../data/mysubmission2.csv",row.names=FALSE,col.names=TRUE,sep=',')

In [9]:
# calculate the RMSE
true_ratings = read.csv("../data/trueLabel.csv",header=TRUE,sep = ',')[,2]
my_rmse = RMSE(true_ratings,ratings,na.rm=TRUE)
cat('This code gave RMSE as:\t', as.character(my_rmse), '\n')

This code gave RMSE as:	 1.01235395878731 


In [10]:
# Repeat the RMSE calculation with predictions rounded to whole numbers
ratings_round <- round(ratings)
my_rmse_round <- RMSE(true_ratings, ratings_round, na.rm = TRUE)
cat("Round ratings gave RMSE as:\t", as.character(my_rmse_round), "\n")

Round ratings gave RMSE as:	 1.05195206466597 


In [11]:
# Pring on screen the running time info of this code
all.end.time = Sys.time()
cat("===== Total running time info =====\n")
cat('Start   at:\t', as.character(all.start.time), '\n')
cat('End     at:\t', as.character(all.end.time), '\n')
cat('Total time:\t', all.end.time - all.start.time, '\n')

===== Total running time info =====
Start   at:	 2017-05-09 17:41:16 
End     at:	 2017-05-09 18:06:22 
Total time:	 25.095 


## Acknowledgements
#### This code borrowed some ideas and code components from the TA's code and from the website at
https://ashokharnal.wordpress.com/2014/12/18/using-recommenderlab-for-predicting-ratings-for-movielens-data/