# Adding Context to Model

`Context` is very important. For example, on weekdays people tend to watch short clips, and at the weekend people can watch a full-length movie because they have free time. On Amazon, there are probably products that benefit from time.

In this example, we will predict based on the time when a user rated a product and see how it impacts on overall prediction accuracy.

## Importing Packages

In [1]:
from pathlib import Path
comp_dir = Path('../input/amazon-product-reviews')

In [2]:
import numpy as np
import pandas as pd 
import tensorflow as tf
from datetime import datetime,timedelta

import tensorflow_recommenders as tfrs
import warnings
warnings.filterwarnings("ignore") 
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

## Importing & Reducing Data

In [3]:
electronics_data=pd.read_csv(comp_dir / "ratings_Electronics (1).csv", dtype={'rating': 'int8'}
                             ,names=['userId', 'productId','rating','timestamp'], index_col=None, header=0)
electronics_data["datetime"] = pd.to_datetime(electronics_data.timestamp, unit="s")
############ Only count Rating after 2012 ##################
cutoff_year        = 2012  
electronics_data   = electronics_data.loc[electronics_data["datetime"].dt.year > cutoff_year]  #Reducing data
############ products which received >= 50 ##################
cutoff_no_rat = 50    ## Only count products which received more than or equal 50
electronics_data   = electronics_data.loc[electronics_data.groupby("productId")["rating"].transform('count').ge(
                                            cutoff_no_rat)].reset_index(drop=True)
############ users who rated >= 5 ##################
cutoff_no_user    = 5    ## Only count users who rated more than or equal 5
electronics_data  = electronics_data.loc[electronics_data.groupby("userId")["rating"].transform('count').ge(
                                            cutoff_no_user)].reset_index(drop=True)
electronics_data.head()

Unnamed: 0,userId,productId,rating,timestamp,datetime
0,A20XXTXWF2TCPY,972683275,5,1405123200,2014-07-12
1,A2IDCSC6NVONIZ,972683275,5,1367280000,2013-04-30
2,A3BMUBUC1N77U8,972683275,4,1385164800,2013-11-23
3,A3UOSOCRKS3WIH,972683275,5,1368316800,2013-05-12
4,A2HLNXOYLMERTC,972683275,5,1397606400,2014-04-16


In [4]:
userIds           = electronics_data.userId.unique()
productIds        = electronics_data.productId.unique()
unique_productIds = productIds
unique_userIds    = userIds
total_ratings     = len(electronics_data.index)

print("Number of Rating: {:,}".  format(total_ratings) )
print("Number of Users: {:,}".   format(len(userIds) ) )
print("Number of Products: {:,}".format(len(productIds)  ) )

Number of Rating: 437,330
Number of Users: 58,013
Number of Products: 13,824


In [5]:
# Convert Pandas to TF Dataset
ratings    = tf.data.Dataset.from_tensor_slices({"userId":tf.cast( electronics_data.userId.values  ,tf.string),
                                                "productId":tf.cast( electronics_data.productId.values,tf.string),
                                                "rating":tf.cast( electronics_data.rating.values  ,tf.int8),
                                                "timestamp":tf.cast( electronics_data.rating.values  ,tf.int64,) } )
productIds = tf.data.Dataset.from_tensor_slices(productIds)

In [6]:
# Pre-process timestamp
timestamps    = electronics_data.timestamp.values
max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace( min_timestamp, max_timestamp, num=1000,)

In [7]:
# Prepare data for model fitting and testing
# For perfect shuffling, a `buffer size` greater than or equal to the full size of the dataset is required.
tf.random.set_seed(123) # set seed so we re-produce the same results very time running
shuffled = ratings.shuffle(10_000_000, seed=123, reshuffle_each_iteration=False)

train = shuffled.take( int(total_ratings*0.8) )
test  = shuffled.skip(int(total_ratings*0.8)).take(int(total_ratings*0.2))

## Building Context Model

To learn more, read from [Tensorflow Recommenders website](https://www.tensorflow.org/recommenders/examples/featurization)

In [8]:
class UserModel(tf.keras.Model):
    def __init__(self, use_timestamps):
        super().__init__()
        self._use_timestamps = use_timestamps
        
        self.user_embeddings = tf.keras.Sequential([
                                    tf.keras.layers.StringLookup(
                                        vocabulary=unique_userIds, mask_token=None),
                                        # add addional embedding to account for unknow tokens
                                    tf.keras.layers.Embedding(len(unique_userIds)+1, 32)
                                    ])
        if use_timestamps:
            self.timestamp_embedding = tf.keras.Sequential([
                                    tf.keras.layers.Discretization( timestamp_buckets.tolist() ),
                                        # add addional embedding to account for unknow tokens
                                    tf.keras.layers.Embedding(len(timestamp_buckets)+1, 32)
                                    ])
            self.normalized_timestamp = tf.keras.layers.experimental.preprocessing.Normalization(axis=None)
            self.normalized_timestamp.adapt(timestamps)
    
    def call(self,inputs):
        if not self._use_timestamps:
            return self.user_embeddings( inputs["userId"])
        return tf.concat([
            self.user_embeddings     ( inputs["userId"]),
            self.timestamp_embedding( inputs["timestamp"]),
            tf.reshape(self.normalized_timestamp(inputs["timestamp"]), (-1, 1)),
            ], axis=1)
    

class ProductModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        max_tokens = 100_000

        self.title_embedding = tf.keras.Sequential([
                                    tf.keras.layers.StringLookup(
                                        vocabulary=unique_productIds, mask_token=None),
                                        # add addional embedding to account for unknow tokens
                                    tf.keras.layers.Embedding(len(unique_productIds)+1, 32)
                                    ])
        self.title_vectorizer = tf.keras.layers.TextVectorization(max_tokens=max_tokens)
        self.title_text_embedding = tf.keras.Sequential([
                                        self.title_vectorizer,
                                        tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
                                        tf.keras.layers.GlobalAveragePooling1D(),
                                        ])
        self.title_vectorizer.adapt(productIds)
    
    def call(self,titles):
        return tf.concat([
            self.title_embedding     ( titles),
            self.title_text_embedding( titles),
            ], axis=1)
    
# Build a model.
class amazonModel(tfrs.models.Model):

    def __init__(self, use_timestamps):
        super().__init__()
        self.query_model     = tf.keras.Sequential([ UserModel(use_timestamps),
                                                 tf.keras.layers.Dense(32) ])
        self.candidate_model = tf.keras.Sequential([ ProductModel(),
                                                 tf.keras.layers.Dense(32) ])
        self.task            = tfrs.tasks.Retrieval( metrics = tfrs.metrics.FactorizedTopK(
                                                        candidates=productIds.batch(1024).map(self.candidate_model) )
                                                    )
            
    def compute_loss(self, features, training=False):
        query_embeddings = self.query_model({ "userId":features["userId"], "timestamp":features["timestamp"] }  )
        product_embeddings = self.candidate_model(features["productId"])
        return self.task( query_embeddings, product_embeddings)    

In [9]:
model = amazonModel(use_timestamps=False)
model.compile(optimizer=tf.keras.optimizers.Adagrad( learning_rate=0.1 ))

In [10]:
cached_train = train.batch(4096).cache()
cached_test  = test.batch(2048).cache()

model.fit(cached_train, epochs=50, verbose=False);

Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.


In [11]:
train_accuracy = model.evaluate( cached_train, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"];
test_accuracy  = model.evaluate( cached_test,  return_dict=True)["factorized_top_k/top_100_categorical_accuracy"];

print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

Consider rewriting this model with the Functional API.
Top-100 accuracy (train): 0.86.
Top-100 accuracy (test): 0.06.


In [12]:
time_model = amazonModel(use_timestamps=True)
time_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [13]:
time_model.fit(cached_train, epochs=50, verbose=False);

train_accuracy = time_model.evaluate(cached_train, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"];
test_accuracy  = time_model.evaluate(cached_test, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"];

print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Top-100 accuracy (train): 0.81.
Top-100 accuracy (test): 0.06.


## Conclusion

In this example, `timestamp` has a negligible impact on the predictions for a special type of product. We would love to have more information on the products or users to improve the accuracy of the predictive model.

# Higher Density of Data

In the previous dataset, the number of Ratings is 437,330 but the number of Users is 58,013, which means each user only rated 8 products on average. The number is too small in comparison to the number of Products: 13,824, which is around 0.06%. After training the model, the accuracy of the model is about 0.06 or 6% (it could be improved a bit higher with more iterations), which is about hundreds of times of rated product ratio.

Let's reduce the size of the rating data to filter users who rated much more products. The more user rated, the more information we have about users, hence, the higher precision of the modeling.

In [28]:
############ users who rated >= 5 ##################
cutoff_no_user    = 20    ## Only count users who rated more than or equal 5
electronics_data  = electronics_data.loc[electronics_data.groupby("userId")["rating"].transform('count').ge(
                                            cutoff_no_user)].reset_index(drop=True)
userIds           = electronics_data.userId.unique()
productIds        = electronics_data.productId.unique()
unique_productIds = productIds
unique_userIds    = userIds
total_ratings     = len(electronics_data.index)

print("Number of Rating: {:,}".  format(total_ratings) )
print("Number of Users: {:,}".   format(len(userIds) ) )
print("Number of Products: {:,}".format(len(productIds)  ) )

Number of Rating: 37,356
Number of Users: 1,320
Number of Products: 8,634


In [29]:
# Convert Pandas to TF Dataset
ratings    = tf.data.Dataset.from_tensor_slices({"userId":tf.cast( electronics_data.userId.values  ,tf.string),
                                                "productId":tf.cast( electronics_data.productId.values,tf.string),
                                                "rating":tf.cast( electronics_data.rating.values  ,tf.int8),
                                                "timestamp":tf.cast( electronics_data.rating.values  ,tf.int64,) } )
productIds = tf.data.Dataset.from_tensor_slices(productIds)
# Pre-process timestamp
timestamps    = electronics_data.timestamp.values
max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace( min_timestamp, max_timestamp, num=1000,)

In [30]:
tf.random.set_seed(123) # set seed so we re-produce the same results very time running
shuffled = ratings.shuffle(10_000_000, seed=123, reshuffle_each_iteration=False)

train = shuffled.take( int(total_ratings*0.8) )
test  = shuffled.skip(int(total_ratings*0.8)).take(int(total_ratings*0.2))

In [33]:
model = amazonModel(use_timestamps=False)
model.compile(optimizer=tf.keras.optimizers.Adagrad( learning_rate=0.1 ))
cached_train = train.batch(2096).cache()
cached_test  = test.batch(1048).cache()

In [34]:
model.fit(cached_train, epochs=50, verbose=False);
train_accuracy = model.evaluate( cached_train, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"];
test_accuracy  = model.evaluate( cached_test,  return_dict=True)["factorized_top_k/top_100_categorical_accuracy"];
print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Top-100 accuracy (train): 0.95.
Top-100 accuracy (test): 0.07.


## Conclusion

Although adding timestamp does not improve the predictions based on Amazon review, we have a demo of how to implement it to a model. It will be more interesting if there is more available information on different aspects of products or users.