# [ML on GCP C10] Collaborative Filtering on Google Analytics data

## Overview
This lab shows you how to do collaborative filtering using the Weighted Alternating Least Squares (WALS) matrix factorization approach.

### Objectives
In this lab, you learn to perform the following tasks:

* Prepare the user-item matrix for use with WALS

* Train a WALSMatrixFactorization within TensorFlow locally and on Cloud ML Engine

* Visualize the embedding vectors with principal components analysis


## Introduction
In this lab, you'll be providing article recommendations for users based on the Kurier.at data. Recall that collaborative filtering doesn't need to know anything about the content. We are only interested in the user-item matrix which defines their relationships.

## Open a Datalab notebook
In Cloud Datalab, click on the Home icon, and then navigate to datalab > notebook > training-data-analyst > courses > machine_learning > deepdive > 10_recommend > labs > wals.ipynb.

Read the commentary, click Clear | Clear all Cells, then run the Python snippets (Use Shift+Enter to run each piece of code) in the cell, step by step.


```
data/items.csv  item_mapping
contentId -> itemId [0,1,2...]

data/users.csv  user_mapping
visitorId -> userId [0,1,2...]
```

In [4]:
import tensorflow as tf

`def remap_keys(sparse_tensor):`

In [43]:
# Alternative toy input, kept commented out for experimentation:
#sparse_tensor = tf.SparseTensor(indices=[[0,2],[0,0],[1,0],[1,1],[1,1]], values=[1.5, 0., 3.5, 2.5, 0.], dense_shape=[ 2, 5668])
# Toy batch with two rows (batch indices 0 and 1) and dense_shape [2, 5668].
# The last entry of each row has value 0.; the cells below treat it as the appended key entry,
# whose column index (2 for row 0, 3 for row 1) is the key to recover.
sparse_tensor = tf.SparseTensor(indices=[[0,0],[0,1],[0,2],[0,2],[1,2],[1,3]], values=[2.5,1.5,4.5,0.,3.5,0.], dense_shape=[ 2, 5668])

In [44]:
# Evaluate the toy SparseTensor and show its indices, values and dense_shape
with tf.Session() as sess:
    print(sess.run(sparse_tensor))

SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [0, 2],
       [0, 2],
       [1, 2],
       [1, 3]]), values=array([2.5, 1.5, 4.5, 0. , 3.5, 0. ], dtype=float32), dense_shape=array([   2, 5668]))


In [45]:
# Current indices of our SparseTensor that we need to fix
bad_indices = sparse_tensor.indices
# Current values of our SparseTensor that we need to fix
bad_values = sparse_tensor.values 

# Group by the batch index (first column of the indices) and count the entries of each example,
# then subtract 1 -- presumably so that the one key entry appended to each example is not counted
size = tf.segment_sum(data = tf.ones_like(bad_indices[:,0], dtype = tf.int64), segment_ids = bad_indices[:,0]) - 1
# The number of batch_indices (this should be batch_size unless it is a partially full batch)
length = tf.shape(size, out_type = tf.int64)[0]
# Finds the cumulative sum which we can use for indexing later
cum = tf.cumsum(size)

# Evaluate and print the three intermediate tensors
with tf.Session() as sess:
    print(size.eval())
    print(length.eval())
    print(cum.eval())

[3 1]
2
[3 4]


In [46]:
# Position of each example within the batch (0, 1, ..., length - 1). Every earlier example
# contributed one extra key entry (from the concatenation of the keys in the decode_example
# method), so this is also the offset to add to the cumulative sizes.
length_range = tf.range(length, dtype = tf.int64)
# Row positions, within the SparseTensor's indices member, of the key entries that were added
# by the concatenation of our keys in the decode_example method
cum_range = tf.add(cum, length_range)

# Evaluate and print both tensors
with tf.Session() as sess:
    print(sess.run(length_range))
    print(sess.run(cum_range))

[0 1]
[3 5]


In [47]:
# The keys that we have extracted back out of our concatenated SparseTensor:
# the column index (second column) of every row of bad_indices selected by cum_range
gathered_indices = tf.squeeze(tf.gather(bad_indices, cum_range)[:,1])

# The enumerated row indices of the SparseTensor's indices member (0 .. number of entries - 1)
sparse_indices_range = tf.range(tf.shape(bad_indices, out_type = tf.int64)[0], dtype = tf.int64)

# We want to find here the row indices of the SparseTensor's indices member that are of our actual data and not the concatenated rows
# So we want the elements of x (all row positions) that are NOT in s (the key row positions), i.e. the set difference x \ s
x = sparse_indices_range
s = cum_range

# Evaluate and print the extracted keys and the enumerated row positions
with tf.Session() as sess:
    print(gathered_indices.eval())
    print(sparse_indices_range.eval())

[2 3]
[0 1 2 3 4 5]


In [48]:
# Number of multiples we are going to tile x, which is our sparse_indices_range:
# a 1 for every dimension of x followed by the number of elements in s (here [1, 2])
tile_multiples = tf.concat([tf.ones(tf.shape(tf.shape(x)), dtype=tf.int64), tf.shape(s, out_type = tf.int64)], axis = 0)
# Expands x, our sparse_indices_range, into a rank 2 tensor and then multiplies the rows by 1 (no copying) and the columns by the number of examples in the batch
x_tile = tf.tile(tf.expand_dims(x, -1), tile_multiples)
# Compare every column of x_tile against the matching element of s; reduce_any is a vectorized
# logical or over those comparisons, which we then negate: True means "this row position is not in s"
x_not_in_s = ~tf.reduce_any(tf.equal(x_tile, s), -1)

# Evaluate and print the tiling multiples, the tiled positions and the resulting mask
with tf.Session() as sess:
    print(tile_multiples.eval())
    print(x_tile.eval())
    print(x_not_in_s.eval())

[1 2]
[[0 0]
 [1 1]
 [2 2]
 [3 3]
 [4 4]
 [5 5]]
[ True  True  True False  True False]


In [49]:
# The SparseTensor's indices that are our actual data by using the boolean_mask we just made above applied to the entire indices member of our SparseTensor
selected_indices = tf.boolean_mask(tensor = bad_indices, mask = x_not_in_s, axis = 0)
# Apply the same boolean_mask to the entire values member of our SparseTensor to get the actual values data
selected_values = tf.boolean_mask(tensor = bad_values, mask = x_not_in_s, axis = 0)

# Need to replace the first column of our selected_indices with keys, so we first need to tile our gathered_indices
# Here only the first example is handled: its key gathered_indices[0] is repeated size[0] times
tiling = tf.tile(input = tf.expand_dims(gathered_indices[0], -1), multiples = tf.expand_dims(size[0] , -1))

# Evaluate and print the kept indices, the kept values and the tiled key of the first example
with tf.Session() as sess:
    print(selected_indices.eval())
    print(selected_values.eval())
    print(tiling.eval())

[[0 0]
 [0 1]
 [0 2]
 [1 2]]
[2.5 1.5 4.5 3.5]
[2 2 2]


In [50]:
# We have to repeatedly apply the tiling to each example in the batch
# Since it is jagged we cannot use tf.map_fn due to the stacking of the TensorArray, so we have to create our own custom version
def loop_body(i, tensor_grow):
  """Append the key of example i, repeated size[i] times, to tensor_grow.

  Args:
    i: Index of the example to process in this iteration.
    tensor_grow: 1-D tensor of the tiled keys accumulated so far.

  Returns:
    A tuple (i + 1, tensor_grow extended with the tiled key of example i).
  """
  return i + 1, tf.concat(values = [tensor_grow, tf.tile(input = tf.expand_dims(gathered_indices[i], -1), multiples = tf.expand_dims(size[i] , -1))], axis = 0)

# The loop starts at i = 1 because `tiling` already holds the tiled key of example 0
_, result = tf.while_loop(lambda i, tensor_grow: i < length, loop_body, [tf.constant(1, dtype = tf.int64), tiling])

# Concatenate tiled keys with the 2nd column of selected_indices
selected_indices_fixed = tf.concat([tf.expand_dims(result, -1), tf.expand_dims(selected_indices[:, 1], -1)], axis = 1)

# Evaluate and print the tiled keys and the corrected indices
with tf.Session() as sess:
    print(result.eval())
    print(selected_indices_fixed.eval())

[2 2 2 3]
[[2 0]
 [2 1]
 [2 2]
 [3 2]]


In [51]:
# Reassemble a SparseTensor from the corrected indices and the filtered values; the dense_shape
# is copied unchanged from the input SparseTensor.
# NOTE(review): in the printed result the row indices are the keys (2 and 3) while
# dense_shape[0] is still 2 -- confirm that downstream consumers accept row indices that are
# not smaller than dense_shape[0].
remapped_sparse_tensor = tf.SparseTensor(
    indices = selected_indices_fixed,
    values = selected_values,
    dense_shape = sparse_tensor.dense_shape)

# Evaluate and print the remapped SparseTensor
with tf.Session() as sess:
    print(sess.run(remapped_sparse_tensor))

SparseTensorValue(indices=array([[2, 0],
       [2, 1],
       [2, 2],
       [3, 2]]), values=array([2.5, 1.5, 4.5, 3.5], dtype=float32), dense_shape=array([   2, 5668]))
