This notebook contains exploratory code for computing known-correct values to use in test cases and similar checks.

In [1]:
library(tidyverse)

-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 2.2.1     v purrr   0.2.4
v tibble  1.4.1     v dplyr   0.7.4
v tidyr   0.7.2     v stringr 1.2.0
v readr   1.1.1     v forcats 0.2.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()


In [4]:
# Read the ratings CSV and shorten the ID columns to `user` and `item`,
# the names every later cell groups and joins on.
ratings = rename(
    read_csv('ml-latest-small/ratings.csv'),
    user = userId,
    item = movieId
)
# Peek at the first rows to confirm the renamed columns.
head(ratings)

Parsed with column specification:
cols(
  userId = col_integer(),
  movieId = col_integer(),
  rating = col_double(),
  timestamp = col_integer()
)


user,item,rating,timestamp
1,31,2.5,1260759144
1,1029,3.0,1260759179
1,1061,3.0,1260759182
1,1129,2.0,1260759185
1,1172,4.0,1260759205
1,1263,2.0,1260759151


In [5]:
# Per-item summary: how many ratings each item received and their mean.
item_stats = summarize(
    group_by(ratings, item),
    RatingCount = n(),
    MeanRating = mean(rating)
)
# Show the ten most frequently rated items.
head(arrange(item_stats, desc(RatingCount)), 10)

item,RatingCount,MeanRating
356,341,4.054252
296,324,4.256173
318,311,4.487138
593,304,4.138158
260,291,4.221649
480,274,3.706204
2571,259,4.183398
1,247,3.87247
527,244,4.303279
589,237,4.006329


In [15]:
# Per-user summary: how many ratings each user gave and their mean rating.
# MeanRating is reused below to mean-center ratings and to rebuild predictions.
user_stats = summarize(
    group_by(ratings, user),
    RatingCount = n(),
    MeanRating = mean(rating)
)
head(user_stats)

user,RatingCount,MeanRating
1,20,2.55
2,76,3.486842
3,51,3.568627
4,204,4.348039
5,100,3.91
6,44,3.261364


Let's compute user-normed ratings: each rating minus that user's mean rating.

In [16]:
# Mean-center every rating by subtracting the rating user's mean.
# Only `user` and `MeanRating` are taken from user_stats, so the natural join
# keys on `user` alone and RatingCount does not leak into the result.
user_normed = ratings %>%
    inner_join(select(user_stats, user, MeanRating)) %>%
    transmute(user, item, rating = rating - MeanRating)
head(user_normed)

Joining, by = "user"


user,item,rating
1,31,-0.05
1,1029,0.45
1,1061,0.45
1,1129,-0.55
1,1172,1.45
1,1263,-0.55


## User-User CF test cases

First test case: user 4, item 1016.

Gather the normalized ratings of every user who rated item 1016, so we can compute the correct CF results.

In [27]:
# All mean-centered ratings from the users who rated item 1016.
# `nr` scales each rating by the L2 norm of that user's mean-centered ratings,
# so a dot product of two users' `nr` values is their cosine similarity.
# The result stays grouped by `user`.
lim_rates = filter(ratings, item == 1016) %>%
    select(user) %>%
    inner_join(user_normed) %>%
    group_by(user) %>%
    mutate(nr = rating / sqrt(sum(rating * rating)))
print(lim_rates)

Joining, by = "user"


# A tibble: 5,262 x 4
# Groups: user [10]
    user  item rating      nr
   <int> <int>  <dbl>   <dbl>
 1     4    10 -0.348 -0.0258
 2     4    34  0.652  0.0483
 3     4   112  0.652  0.0483
 4     4   141  0.652  0.0483
 5     4   153 -0.348 -0.0258
 6     4   173 -1.35  -0.0998
 7     4   185 -1.35  -0.0998
 8     4   260  0.652  0.0483
 9     4   289 -0.348 -0.0258
10     4   296  0.652  0.0483
# ... with 5,252 more rows


In [30]:
# Pairwise user similarities: self-join lim_rates on `item` so each row pairs
# two users' unit-normalized ratings of the same item, then sum the products
# per (u1, u2). Self-pairs are dropped; each pair appears in both orders.
sims = inner_join(
        select(lim_rates, u1 = user, item, u1nr = nr),
        select(lim_rates, u2 = user, item, u2nr = nr)
    ) %>%
    filter(u1 != u2) %>%
    group_by(u1, u2) %>%
    summarize(sim = sum(u1nr * u2nr))
print(sims)

Joining, by = "item"


# A tibble: 90 x 3
# Groups: u1 [?]
      u1    u2    sim
   <int> <int>  <dbl>
 1     4    99 0.0311
 2     4   239 0.0644
 3     4   311 0.0225
 4     4   358 0.0982
 5     4   518 0.114 
 6     4   551 0.0100
 7     4   564 0.0437
 8     4   603 0.0614
 9     4   646 0.0250
10    99     4 0.0311
# ... with 80 more rows


In [31]:
# Prediction inputs for user 4 on item 1016: one row per neighbor, holding the
# neighbor's similarity to user 4 and its mean-centered rating of the item.
# `u2` is renamed to `user` so the natural join with user_normed keys on it.
basis = filter(sims, u1 == 4) %>%
    select(u1, user = u2, sim) %>%
    inner_join(user_normed) %>%
    filter(item == 1016)
basis

Joining, by = "user"


u1,user,sim,item,rating
4,99,0.03108641,1016,-1.4308511
4,239,0.0643867,1016,-0.7079038
4,311,0.02249353,1016,-1.0063788
4,358,0.09821867,1016,-0.1831442
4,518,0.11448729,1016,-1.572843
4,551,0.01003771,1016,0.6
4,564,0.04373835,1016,0.4475375
4,603,0.06135812,1016,-0.8684211
4,646,0.02495728,1016,-0.1301775


In [34]:
# Predicted rating for user 4 on item 1016: the similarity-weighted average of
# the neighbors' mean-centered ratings, shifted back by user 4's mean rating.
with(basis, sum(sim * rating) / sum(sim)) +
    with(user_stats, MeanRating[user == 4])

### Find items rated by user 4 that few other users have rated

In [35]:
# Mean-centered ratings belonging to user 4 only.
user4 = filter(user_normed, user == 4)

In [37]:
# For each item user 4 rated, count how many other users also rated it,
# listed from fewest to most. Items rated by nobody but user 4 have no rows
# left after the filter, so they do not appear in the result.
inner_join(select(user4, item), user_normed) %>%
    filter(user != 4) %>%
    group_by(item) %>%
    summarize(UserCount = n()) %>%
    arrange(UserCount)

Joining, by = "item"


item,UserCount
2091,2
2659,2
2903,2
2902,3
1858,5
3040,5
3208,5
1332,6
2102,6
2263,7
