# Recommender system

In [1]:
library(tidyverse)
library(keras)
library(reshape)
library(tensorflow)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘reshape’


The following object is masked from ‘package:dplyr’:

    rename


The following objects are masked from ‘package:tidyr’:

    expand, smiths




In [2]:
# data loading
url <- "https://raw.githubusercontent.com/simecek/dspracticum2020/master/lecture_09/assignment/fake_v1_100x252.csv"

data_entry <- read.csv(url)

In [3]:
# number of ratings
sum(!is.na(data_entry[,-1]))

# percentage of ratings
sum(!is.na(data_entry[,-1])) / (dim(data_entry[,-1])[1] * dim(data_entry[,-1])[2]) 

In [4]:
n_users <- dim(data_entry)[1]  # num. of users
n_tweets <- dim(data_entry)[2] - 1  # num. of tweets

data <- melt(as.data.frame(data_entry)) %>%
  select(user_id = X, tweet_id = variable, rating = value) %>%
  mutate(user_id = rep(1:n_users, n_tweets), tweet_id = rep(1:n_tweets, rep(n_users, n_tweets))) %>%
  na.omit()

Using X as id variables



In [5]:
head(data)

Unnamed: 0_level_0,user_id,tweet_id,rating
Unnamed: 0_level_1,<int>,<int>,<int>
1,1,1,4
4,4,1,4
18,18,1,3
22,22,1,4
30,30,1,5
43,43,1,5


In [6]:
set.seed(1234)
data <- data %>% sample_frac() # nahodne prohazi radky

x_train <- data %>% select(c(user_id, tweet_id)) %>% as.matrix()
y_train <- data %>% pull(rating)

In [7]:
head(x_train)

user_id,tweet_id
16,53
38,33
70,136
86,49
91,226
79,149


# Basic model

Collaborative filtering

In [8]:
embedding_dim <- 5

In [9]:
input_users <- layer_input(shape = 1, name = "users")
input_tweets <- layer_input(shape = 1, name = "tweets")

user_embeddings <- input_users %>% 
  layer_embedding(
    input_dim = n_users + 1,
    output_dim = embedding_dim,
    name = "user_embeddings"
  ) 

tweets_embeddings <- input_tweets %>% 
  layer_embedding(
    input_dim = n_tweets + 1,
    output_dim = embedding_dim,
    name = "tweets_embeddings"
  ) 

In [10]:
dot <- layer_dot(
  inputs = list(user_embeddings, tweets_embeddings),
  axes = 2,
  name = "dot_product"
)

pred <- dot %>% layer_dense(
  units = 1, 
  activation = "relu",
  name = "rating_prediction"
)

In [11]:
model <- keras_model(inputs = c(input_users, input_tweets), outputs = pred)

model %>% compile(
  optimizer = "rmsprop",
  loss = "mse",
  metric = "mae"
)

summary(model)

Model: "functional_1"
________________________________________________________________________________
Layer (type)              Output Shape      Param #  Connected to               
users (InputLayer)        [(None, 1)]       0                                   
________________________________________________________________________________
tweets (InputLayer)       [(None, 1)]       0                                   
________________________________________________________________________________
user_embeddings (Embeddin (None, 1, 5)      505      users[0][0]                
________________________________________________________________________________
tweets_embeddings (Embedd (None, 1, 5)      1265     tweets[0][0]               
________________________________________________________________________________
dot_product (Dot)         (None, 1, 1)      0        user_embeddings[0][0]      
                                                     tweets_embeddings[0][0]    
______

In [12]:
history <- model %>% fit(
  x = list(
    x_train[, "user_id", drop = FALSE],
    x_train[, "tweet_id", drop = FALSE]
  ),
  y = y_train,
  epochs = 20,
  batch_size = 32, 
  validation_split = 0.2,
  callbacks = list(callback_early_stopping(patience = 2))
)

In [13]:
history
# plot(history)


Final epoch (plot to see history):
    loss: 0.3563
     mae: 0.4095
val_loss: 0.4161
 val_mae: 0.445 

In [14]:
best_epoch <- which(history$metrics$val_loss == min(history$metrics$val_loss))
loss <- history$metrics$val_loss[best_epoch] %>% round(4)
mae <- history$metrics$val_mae[best_epoch] %>% round(4)

paste("The best epoch had a loss of ", loss, " and mean absolute error of ", mae)

# Biased model

In [15]:
user_bias <- input_users %>%
  layer_embedding(
    input_dim = n_users + 1,
    output_dim = 1,
    name = "user_bias"
  ) 

tweets_bias <- input_tweets %>%
  layer_embedding(
    input_dim = n_tweets + 1,
    output_dim = 1,
    name = "tweets_bias"
  )

In [16]:
dot_bias <- layer_add(list(dot, user_bias, tweets_bias), name = "add_bias")

pred_bias <- dot_bias %>% layer_dense(units = 1, activation = "relu", 
                                 name = "rating_prediction")

Model

In [17]:
model_bias <- keras_model(inputs = c(input_users, input_tweets), outputs = pred_bias)

model_bias %>% compile(
  optimizer = "rmsprop",
  loss = "mse",
  metric = "mae"
)

summary(model_bias)

Model: "functional_3"
________________________________________________________________________________
Layer (type)              Output Shape      Param #  Connected to               
users (InputLayer)        [(None, 1)]       0                                   
________________________________________________________________________________
tweets (InputLayer)       [(None, 1)]       0                                   
________________________________________________________________________________
user_embeddings (Embeddin (None, 1, 5)      505      users[0][0]                
________________________________________________________________________________
tweets_embeddings (Embedd (None, 1, 5)      1265     tweets[0][0]               
________________________________________________________________________________
dot_product (Dot)         (None, 1, 1)      0        user_embeddings[0][0]      
                                                     tweets_embeddings[0][0]    
______

In [18]:
history_bias <- model_bias %>% fit(
  x = list(
    x_train[, "user_id", drop = FALSE],
    x_train[, "tweet_id", drop = FALSE]
  ),
  y = y_train,
  epochs = 20,
  batch_size = 32, 
  validation_split = 0.2,
  # callbacks = list(callback_early_stopping(patience = 2))
)

history_bias


Final epoch (plot to see history):
    loss: 17.56
     mae: 4.144
val_loss: 17.55
 val_mae: 4.142 

In [19]:
best_epoch <- which(history_bias$metrics$val_loss == min(history_bias$metrics$val_loss))
loss <- history_bias$metrics$val_loss[best_epoch] %>% round(4)
mae <- history_bias$metrics$val_mae[best_epoch] %>% round(4)

paste("The best epoch had a loss of ", loss, " and mean absolute error of ", mae)

# Prediction

In [20]:
# data where the rating is unknown
data_pred <- melt(as.data.frame(data_entry)) %>%
  select(user_id = X, tweet_id = variable, rating = value) %>%
  mutate(user_id = rep(1:n_users, n_tweets), tweet_id = rep(1:n_tweets, rep(n_users, n_tweets))) %>%
  filter(is.na(rating))
data_pred_bias <- data_pred

# predict the ratings
inputs <- list(data_pred$user_id, data_pred$tweet_id)
prediction <- model %>% predict(inputs)  # simple model
prediction_bias <- model_bias %>% predict(inputs)

# mozno zaokruhlit?
head(prediction)
head(prediction_bias)

Using X as id variables



In [21]:
# add predicted rating values to the original data
data_pred$rating <- prediction
data_pred_bias$rating <- prediction_bias

rbind(data_pred, data) %>% spread(., tweet_id, rating, sep = "")

user_id,tweet_id1,tweet_id2,tweet_id3,tweet_id4,tweet_id5,tweet_id6,tweet_id7,tweet_id8,tweet_id9,⋯,tweet_id243,tweet_id244,tweet_id245,tweet_id246,tweet_id247,tweet_id248,tweet_id249,tweet_id250,tweet_id251,tweet_id252
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,4.000000,4.275599,4.486264,4.195258,4.377076,4.292368,4.197019,4.211802,4.575570,⋯,3.000000,3.000000,4.285399,4.265972,4.125198,4.378028,4.210076,3.964186,4.213996,5.000000
2,4.099943,4.051291,4.241676,5.000000,4.139287,5.000000,4.010329,4.012177,4.332399,⋯,4.072271,4.000000,4.035197,4.037530,3.910022,4.116840,3.000000,3.780839,3.991964,4.434196
3,4.218906,4.000000,4.285257,4.063469,4.245586,4.089353,4.000000,4.093790,4.405373,⋯,4.335568,4.566427,4.133366,4.226872,4.042293,4.332026,4.187989,4.000569,4.104434,4.450416
4,4.000000,4.007133,4.125069,3.904364,4.066030,4.000000,3.928132,4.000000,4.233926,⋯,4.092414,4.000000,3.963133,4.014555,3.860950,4.107394,3.981695,3.796278,3.931175,4.287836
5,4.296748,4.263908,5.000000,4.159848,4.356704,4.254424,4.187782,4.197226,4.533062,⋯,4.341252,4.744712,4.000000,4.000000,4.129161,4.000000,4.209538,3.986037,4.199452,4.663526
6,4.090419,4.084649,4.069908,3.929991,4.108621,3.905428,3.923140,4.000000,4.204069,⋯,5.000000,4.333812,4.010338,4.174112,3.958018,4.300354,4.149464,3.995601,3.987016,4.000000
7,4.200027,4.178940,4.138782,4.031380,4.225726,4.000443,3.990084,3.991858,4.291015,⋯,4.438071,4.435516,4.142715,4.304623,4.081903,4.000000,4.294304,4.120719,4.096274,4.365315
8,4.212715,4.203578,4.196944,4.037774,4.000000,4.037262,4.000000,4.036892,4.314301,⋯,5.000000,4.480993,4.157468,4.337691,4.000000,4.468745,4.287477,4.124598,4.119915,4.000000
9,4.189225,4.168506,5.000000,4.042051,4.228483,4.055007,4.015633,4.024074,4.334850,⋯,4.355938,4.497169,4.000000,4.248432,4.053973,4.385424,4.218429,4.026618,5.000000,4.424945
10,4.146551,4.104795,4.307242,4.000000,4.198207,4.000000,4.045363,4.055967,4.393816,⋯,4.119823,4.584803,4.105453,4.086381,3.958770,4.182173,4.036679,3.810692,4.000000,5.000000


In [22]:
rbind(data_pred_bias, data) %>% spread(., tweet_id, rating, sep = "")

user_id,tweet_id1,tweet_id2,tweet_id3,tweet_id4,tweet_id5,tweet_id6,tweet_id7,tweet_id8,tweet_id9,⋯,tweet_id243,tweet_id244,tweet_id245,tweet_id246,tweet_id247,tweet_id248,tweet_id249,tweet_id250,tweet_id251,tweet_id252
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,4,0,0,0,0,0,0,0,0,⋯,3,3,0,0,0,0,0,0,0,5
2,0,0,0,5,0,5,0,0,0,⋯,0,4,0,0,0,0,3,0,0,0
3,0,4,0,0,0,0,4,0,0,⋯,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,4,0,4,0,⋯,0,4,0,0,0,0,0,0,0,0
5,0,0,5,0,0,0,0,0,0,⋯,0,0,4,4,0,4,0,0,0,0
6,0,0,0,0,0,0,0,4,0,⋯,5,0,0,0,0,0,0,0,0,4
7,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,4,0,0,0,0
8,0,0,0,0,4,0,4,0,0,⋯,5,0,0,0,4,0,0,0,0,4
9,0,0,5,0,0,0,0,0,0,⋯,0,0,4,0,0,0,0,0,5,0
10,0,0,0,4,0,4,0,0,0,⋯,0,0,0,0,0,0,0,0,4,5
