# Tensorflow Expedia

In [1]:
import os
import pprint
import tempfile

from typing import Dict, Text

import datetime as dt

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd

import tensorflow_recommenders as tfrs

## Load data

In [2]:
data_dir = '../../data/expedia/'

def convert_date(date_col):
    return (pd.to_datetime(date_col) - dt.datetime(1970,1,1)).dt.total_seconds()

def create_tf_datasets(filename: str):
        
    df = pd.read_csv(data_dir + filename)
    
    # Convert date to unix timestamp
    df['date_time'] = convert_date(df['date_time'])
    
    # Convert to bytes
    df['user_id'] = [bytes(str(uid), 'utf-8') for uid in df['user_id']]
    df['hotel_cluster'] = [bytes(str(uid), 'utf-8') for uid in df['hotel_cluster']]
    
    features = df.drop(['cnt', 'srch_ci', 'srch_co'], axis=1)
    
    # Create tf datasets
    dataset = tf.data.Dataset.from_tensor_slices((dict(features))) 
    hotels = tf.data.Dataset.from_tensor_slices([bytes(str(uid), 'utf-8') for uid in range(0, 99)])
    
    # Column selection
    dataset = dataset.map(lambda x: {
        "hotel_cluster": x["hotel_cluster"],
        "user_id": x["user_id"],
    })
    
    return dataset, hotels, len(df)
    
    

In [3]:
filename = 'train' # 'train_sample'

dataset, hotels, n_records = create_tf_datasets(f'{filename}.csv')

In [4]:
for x in dataset.take(1).as_numpy_iterator():
    pprint.pprint(x)

{'hotel_cluster': b'1', 'user_id': b'12'}


In [6]:
user_ids = dataset.batch(1_000_000).map(lambda x: x["user_id"])
hotel_clusters = hotels.batch(1_000_000)

unique_user_ids = np.unique(np.concatenate(list(user_ids)))
unique_hotel_clusters = np.unique(np.concatenate(list(hotel_clusters)))

unique_hotel_clusters[:10]

array([b'0', b'1', b'10', b'11', b'12', b'13', b'14', b'15', b'16', b'17'],
      dtype=object)

## Model

In [7]:
embedding_dimension = 32

# Query tower
user_model = tf.keras.Sequential([
  tf.keras.layers.experimental.preprocessing.StringLookup(
      vocabulary=unique_user_ids, mask_token=None),
  
  # We add an additional embedding to account for unknown tokens.
  tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

# Candidate tower
hotel_cluster_model = tf.keras.Sequential([
  tf.keras.layers.experimental.preprocessing.StringLookup(
      vocabulary=unique_hotel_clusters, mask_token=None),
  
    # We add an additional embedding to account for unknown tokens.
    tf.keras.layers.Embedding(len(unique_hotel_clusters) + 1, embedding_dimension)
])

# top K categorical accuracy: how often the true candidate is in the top K candidates for a given query.
metrics = tfrs.metrics.FactorizedTopK(
  candidates=hotels.batch(128).map(hotel_cluster_model)
)

task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [8]:
class ExpediaModel(tfrs.Model):

    def __init__(self, user_model, movie_model):
        super().__init__()
        self.hotel_cluster_model: tf.keras.Model = hotel_cluster_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["user_id"])
        # And pick out the movie features and pass them into the movie model,
        # getting embeddings back.
        positive_hotel_clusters = self.hotel_cluster_model(features["hotel_cluster"])

        # The task computes the loss and the metrics.
        return self.task(user_embeddings, positive_hotel_clusters)

In [9]:
model = ExpediaModel(user_model, hotel_cluster_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [10]:

# TODO timesplit
tf.random.set_seed(42)
shuffled = dataset.shuffle(n_records, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(int(0.9 * n_records))
test = shuffled.skip(int(0.9* n_records)).take(int(0.1 * n_records))

cached_train = train.batch(8192).cache()
cached_test = test.batch(4096).cache()

In [11]:
fitted_model = model.fit(cached_train, 
          validation_data=cached_test,
          validation_freq=1,
          epochs=5)

Epoch 1/5
Epoch 2/5

KeyboardInterrupt: 

In [12]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k': array([0.09070331, 0.3623115 , 0.5172612 , 0.8999405 , 1.        ],
       dtype=float32),
 'factorized_top_k/top_1_categorical_accuracy': 0.09070330858230591,
 'factorized_top_k/top_5_categorical_accuracy': 0.3623115122318268,
 'factorized_top_k/top_10_categorical_accuracy': 0.5172612071037292,
 'factorized_top_k/top_50_categorical_accuracy': 0.8999404907226562,
 'factorized_top_k/top_100_categorical_accuracy': 1.0,
 'loss': 19739.451171875,
 'regularization_loss': 0,
 'total_loss': 19739.451171875}

In [None]:
accuracy = fitted_model.history["val_factorized_top_k/top_5_categorical_accuracy"][-1]
print(f"Top-5 accuracy: {accuracy:.2f}.")

num_validation_runs = len(fitted_model.history["val_factorized_top_k/top_5_categorical_accuracy"])
epochs = [(x + 1)* 5 for x in range(num_validation_runs)]

plt.plot(epochs, fitted_model.history["val_factorized_top_k/top_5_categorical_accuracy"], label="simple model")
plt.title("Accuracy vs epoch")
plt.xlabel("epoch")
plt.ylabel("Top-5 accuracy");
plt.legend();

In [None]:
# Save model
query_path = os.path.join('./models/', f"{filename}_query_model")
model.user_model.save(query_path)

candidate_path = os.path.join('./models/', f"{filename}_candidate_model")
model.hotel_cluster_model.save(candidate_path)

In [None]:
# Load model
user_model = tf.keras.models.load_model(query_path)

candidate_model = tf.keras.models.load_model(candidate_path)
query_embedding = user_model(tf.constant(["10"]))

## Predictions

### Brute Force

In [32]:
# Create a model that takes in raw query features, and
index = tfrs.layers.ann.BruteForce(query_model=model.user_model)

# recommends hotels from all hotel clusters.
index.index(candidates=hotels.batch(100).map(model.hotel_cluster_model), 
            identifiers=hotels)

# users_to_predict = pd.read_csv(data_dir + 'test.csv')['user_id']

prediction_file = 'test.csv' # train_sample.csv
users_to_predict = pd.read_csv(data_dir + prediction_file)['user_id']

In [None]:
_, hotel_clusters = index(queries=tf.constant(["472333"]))
print(f"Recommendations for user 472333: {hotel_cluster[0, :3]}")

In [4]:
results = []


for i, user_id in enumerate(users_to_predict):
    _, hotel_clusters = index(queries=tf.constant([str(user_id)]))
    results.append({'user_id': user_id, 'hotel cluster' : hotel_clusters[0, :5]})
    # print(f"Recommendations for user {user_id}: {hotel_clusters[0, :5]}")
    if i % 10_000 == 0:
        print(f"Parsed {i} users.")

NameError: name 'users_to_predict' is not defined