# Downloading Rating Data

## Navigating through anonymous web data

In [1]:
if (!require("RCurl")) install.packages("RCurl", repos='http://cran.us.r-project.org')

Loading required package: RCurl



In [2]:
library(RCurl)
URL <- "https://github.com/amirkrifa/ms-web-dataset/raw/master/anonymous-msweb.data"
file <- read.delim(URL, header=FALSE, sep="\n", stringsAsFactors=FALSE)[,1]

In [3]:
file[1:10]

In [4]:
log_attributes <- list()
page_visits <- list()
user_visits <- list()

log_attribute <- setClass("log_attribute", slots = c(page = "numeric", 
                                             id = "numeric", 
                                             description = "character", 
                                             url = "character"))

current_user_id <- -1
current_user_ids <- NULL

for (filerow in 1:length(file)) {
    chunks <- unlist(strsplit(file[filerow], ","))
    entry_type <- chunks[1]
    
    if (entry_type=="A") {
        type <- chunks[1]
        id <- chunks[2] 
        ignored <- chunks[3] 
        description <- chunks[4] 
        url <- chunks[5]
        log_attributes[[as.integer(id)]] <- log_attribute(id=as.integer(id), 
                                                description=description, 
                                                url=url)
        
    } else if (entry_type=="C") {
        if (current_user_id!=-1) {
            user_visits[[current_user_id]] <- unique(current_user_ids)
            current_user_ids <- NULL
        }
        current_user_id <- as.integer(chunks[3])
        
    } else if (entry_type=="V") {
        page_id <- as.integer(chunks[2])
        if (length(current_user_ids) > 0) {
            current_user_ids <- c(current_user_ids, page_id)
        } else { current_user_ids <- page_id}
        
        if (is.null(page_visits[[as.character(page_id)]])) {
            page_visits[[as.character(page_id)]] <- current_user_id
        } else {
            page_visits[[as.character(page_id)]] <- c(page_visits[as.character(page_id)], current_user_id)
        }
    }
    
}

In [5]:
print(paste('Total Number of Attributes:', length(Filter(Negate(is.null), log_attributes))))
print(paste('Total Number of Users:', length(Filter(Negate(is.null), user_visits))))
print(paste('Total Number of VRoots:', length(Filter(Negate(is.null), page_visits))))

[1] "Total Number of Attributes: 294"
[1] "Total Number of Users: 32710"
[1] "Total Number of VRoots: 285"


In [6]:
for (log in log_attributes) {
    if (!(is.null(log))) {
        cat(paste(log@id, log@description, log@url, sep="\t\t"), "\n")
    }
}

1000		regwiz		/regwiz 
1001		Support Desktop		/support 
1002		End User Produced View		/athome 
1003		Knowledge Base		/kb 
1004		Microsoft.com Search		/search 
1005		Norway		/norge 
1006		misc		/misc 
1007		International IE content		/ie_intl 
1008		Free Downloads		/msdownload 
1009		Windows Family of OSs		/windows 
1010		Visual Basic		/vbasic 
1011		MS Office Development		/officedev 
1012		Outlook Development		/outlookdev 
1013		Visual Basic Support		/vbasicsupport 
1014		Office Free Stuff		/officefreestuff 
1015		Excel		/msexcel 
1016		MS Excel		/excel 
1017		Products 		/products 
1018		isapi		/isapi 
1019		MS PowerPoint		/mspowerpoint 
1020		Developer Network		/msdn 
1021		Visual C		/visualc 
1022		Typography Site		/truetype 
1023		Spain		/spain 
1024		Internet Information Server		/iis 
1025		Web Site Builder's Gallery		/gallery 
1026		Internet Site Construction for Developers		/sitebuilder 
1027		Internet Development		/intdev 
1028		OLE Development		/oledev 
1029		Clip Gallery Live		

### Obtaining statistics

In [7]:
nbr_visits <- sum(sapply(Filter(Negate(is.null), user_visits), length))
average_visits <- nbr_visits / length(Filter(Negate(is.null), user_visits))
one_visit <- sum(sapply(Filter(Negate(is.null), user_visits), length)==1)

print(paste("Number of user visits:", nbr_visits))
print(paste("Average number of visits:", average_visits))
print(paste("Users with just one visit:", one_visit))

[1] "Number of user visits: 98653"
[1] "Average number of visits: 3.01598899419138"
[1] "Users with just one visit: 9994"


## Encountering the limits of rating data

### Obtaining the data

In [8]:
url <- "http://files.grouplens.org/datasets/movielens/ml-20m.zip"
destfile <- "ml-20m.zip"

In [9]:
download.file(url, destfile, mode =  "wb")

In [10]:
unzip(destfile, list = T)
unzip(destfile)

Name,Length,Date
<chr>,<dbl>,<dttm>
ml-20m/,0,2016-10-17 10:12:00
ml-20m/genome-scores.csv,323544381,2016-10-17 09:41:00
ml-20m/genome-tags.csv,18103,2016-10-17 09:41:00
ml-20m/links.csv,570090,2016-10-17 09:50:00
ml-20m/movies.csv,1397542,2015-03-31 16:17:00
ml-20m/ratings.csv,533444411,2015-03-31 16:14:00
ml-20m/README.txt,10261,2016-10-17 10:21:00
ml-20m/tags.csv,16603996,2015-03-31 16:01:00


Reading these files into memory can take a long time. Please be patient.

In [11]:
ratings <- read.csv("ml-20m/ratings.csv")
movies  <- read.csv("ml-20m/movies.csv")

With respect to the Python example, we have to sub-sample the ratings data because of R memory limitations.

In [None]:
# sampling for memory efficiency

set.seed(0)
sample_size <- floor(0.10 * dim(ratings)[1])
train_idx <- sample(seq_len(dim(ratings)[1]), size = sample_size)

In [None]:
ratings <- ratings[train_idx,]

In [None]:
movie_data <- merge(ratings, movies, by="movieId")

In [None]:
head(movie_data)

In [None]:
mean_ratings <- aggregate(movie_data$rating, list(movie_data$title), mean)

In [17]:
head(mean_ratings)

Group.1,x
<fct>,<dbl>
'71 (2014),3.666667
"'burbs, The (1989)",3.078864
'Hellboy': The Seeds of Creation (2004),3.038462
'Neath the Arizona Skies (1934),3.833333
'night Mother (1986),3.604651
'R Xmas (2001),3.5


### Cleaning up

In [18]:
remove(ratings)
remove(movies)

In [19]:
file.remove("./ml-20m.zip")

In [20]:
unlink("./ml-20m", recursive=TRUE)

### Massaging the data


In [21]:
reduced_movie <- movie_data[movie_data['rating']>=3.0,]

In [22]:
reduced_movie <- reduced_movie[,c("userId", "rating", "title")]

In [23]:
head(reduced_movie)

userId,rating,title
<int>,<dbl>,<fct>
86152,4.5,Toy Story (1995)
71326,3.5,Toy Story (1995)
123729,4.0,Toy Story (1995)
11333,3.5,Toy Story (1995)
85797,4.0,Toy Story (1995)
87568,5.0,Toy Story (1995)


In [24]:
print(paste("original shape:", paste(dim(movie_data), collapse=',')))
print(paste("reduced shape:", paste(dim(reduced_movie), collapse=',')))

[1] "original shape: 2000026,6"
[1] "reduced shape: 1649135,3"


In [25]:
remove(movie_data)

In [26]:
counts <- table(reduced_movie$title)

In [27]:
cut_off <- 1000
head(counts[counts > cut_off])


                  12 Angry Men (1957)          2001: A Space Odyssey (1968) 
                                 1270                                  2240 
                 28 Days Later (2002)                     Abyss, The (1989) 
                                 1064                                  1572 
    Ace Ventura: Pet Detective (1994) Ace Ventura: When Nature Calls (1995) 
                                 2599                                  1139 

In [28]:
selected_films <- names(counts[counts > cut_off])
reduced_movie <- reduced_movie[reduced_movie$title %in% selected_films,]

In [29]:
print(paste("reduced shape:", paste(dim(reduced_movie), collapse=',')))

[1] "reduced shape: 701676,3"


### Performing collaborative filtering

In R we appraoach the collaborative filtering in a different way. First we find all the people that has seen the film, retaining only their top votes (>=4). Then we collect all their other movies and obtain a mean average rating for them. Finally we report the highest other seen movies, thus providing a recommendation on the movies that people will most likely favor based on the people who have seen our target movie.

In [30]:
target_movie <- 'Pulp Fiction (1994)'

In [31]:
seen_movie <- reduced_movie$title==target_movie
movie_preference <- reduced_movie$rating >=4
target_users <- reduced_movie$userId %in% unique(reduced_movie$userId[seen_movie&movie_preference])

In [32]:
affine_items <- reduced_movie[target_users&(reduced_movie$title!=target_movie),]

In [33]:
scoring <- aggregate(affine_items$rating, list(affine_items$title), mean)

In [34]:
head(scoring[order(-scoring$x),])

Unnamed: 0_level_0,Group.1,x
Unnamed: 0_level_1,<fct>,<dbl>
140,"Godfather, The (1972)",4.537402
344,"Usual Suspects, The (1995)",4.513514
21,Annie Hall (1977),4.475904
282,"Shawshank Redemption, The (1994)",4.464706
16,"Amelie (Fabuleux destin d'AmÃ©lie Poulain, Le) (2001)",4.459854
274,Schindler's List (1993),4.457192
