# Tensorflow Expedia plus features

In [None]:
!pip install tensorflow_recommenders

In [1]:
import os
import pprint
import tempfile

from typing import Dict, Text

import datetime as dt

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd

import tensorflow_recommenders as tfrs

In [2]:
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

## Load data

In [25]:
data_dir = '../../data/expedia/'

def convert_date(date_col):
    return (pd.to_datetime(date_col) - dt.datetime(1970,1,1)).dt.total_seconds()

def convert_features(df: pd.DataFrame) -> pd.DataFrame:
    
    # Convert date to unix timestamp
    df['date_time'] = convert_date(df['date_time'])
    
    # Convert to bytes
    df['user_id'] = [bytes(str(uid), 'utf-8') for uid in df['user_id']]
    
    df = df.drop(['srch_ci', 'srch_co'], axis=1)
    
    if 'hotel_cluster' in df.columns:
        df['hotel_cluster'] = [bytes(str(uid), 'utf-8') for uid in df['hotel_cluster']]
        df = df.drop(['cnt'], axis=1)
    
    return df
    

def create_tf_datasets(df: pd.DataFrame):
            
    features = convert_features(df)
        
    # Create tf datasets
    dataset = tf.data.Dataset.from_tensor_slices((dict(features))) 
    hotels = tf.data.Dataset.from_tensor_slices([bytes(str(uid), 'utf-8') for uid in range(0, 99)])
    
    # Column selection
    dataset = dataset.map(lambda x: {
        "hotel_cluster": x["hotel_cluster"],
        "user_id": x["user_id"],
        "is_package" : x["is_package"],
    })
    
    return dataset, hotels, len(df)
    
def create_tf_test_dataset(df: pd.DataFrame):
    
    features = convert_features(df)
    
    dataset = tf.data.Dataset.from_tensor_slices((dict(features))) 

    return dataset.map(lambda x: {
        "user_id": x["user_id"],
        "is_package" : x["is_package"],
    })    

In [7]:
filename = 'train_sample' # 'train_sample'

df = pd.read_csv(data_dir + f'{filename}.csv')


dataset, hotels, n_records = create_tf_datasets(df)

## Colab load data

In [None]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

In [None]:
import io

df = pd.read_csv(io.StringIO(uploaded['train_sample.csv'].decode('utf-8')))

dataset, hotels, n_records = create_tf_datasets(df)

## Parse data

In [9]:
for x in dataset.take(1).as_numpy_iterator():
    pprint.pprint(x)

{'hotel_cluster': b'15', 'is_package': 0, 'user_id': b'472333'}


In [10]:
user_ids = dataset.batch(1_000_000).map(lambda x: x["user_id"])
hotel_clusters = hotels.batch(1_000_000)

unique_user_ids = np.unique(np.concatenate(list(user_ids)))
unique_hotel_clusters = np.unique(np.concatenate(list(hotel_clusters)))

unique_hotel_clusters[:10]

array([b'0', b'1', b'10', b'11', b'12', b'13', b'14', b'15', b'16', b'17'],
      dtype=object)

In [11]:
is_package = np.concatenate(list(dataset.map(lambda x: x["is_package"]).batch(100)))

## Model

Indicator columns and embedding columns never work on features directly

https://keras.io/guides/preprocessing_layers/

In [12]:
class UserModel(tf.keras.Model):
  
    def __init__(self):
        super().__init__()

        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32),
        ])

        self.package_encoder = (tf.keras
                                .layers.experimental
                                .preprocessing
                                .CategoryEncoding(output_mode="binary"))
        
        self.package_encoder.adapt(is_package)


    def call(self, inputs):

        return tf.concat([
            self.user_embedding(inputs["user_id"]),
            self.package_encoder(inputs["is_package"]),
        ], axis=1)

In [13]:
# Candidate tower
class HotelClusterModel(tf.keras.Model):
    
    def __init__(self):
        super().__init__()
        
        self.hotel_clusters = tf.keras.Sequential([
          tf.keras.layers.experimental.preprocessing.StringLookup(
          vocabulary=unique_hotel_clusters, mask_token=None),
          tf.keras.layers.Embedding(len(unique_hotel_clusters) + 1, embedding_dimension)
        ])
        
    def call(self, hotel_cluster):
        return self.hotel_clusters(hotel_cluster) 


In [14]:
embedding_dimension = 32

# top K categorical accuracy: how often the true candidate is in the top K candidates for a given query.


In [16]:
class ExpediaModel(tfrs.Model):

    def __init__(self):
        super().__init__()
        self.hotel_cluster_model: tf.keras.Model = HotelClusterModel()
                    
        self.user_model: tf.keras.Model = tf.keras.Sequential([
          UserModel(),
          tf.keras.layers.Dense(32)
        ])
        
        metrics = tfrs.metrics.FactorizedTopK(
          candidates=hotels.batch(128).map(self.hotel_cluster_model)
        )

        self.task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
          metrics=metrics
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model({
            "user_id": features["user_id"],
            "is_package": features["is_package"],
        })
        # And pick out the movie features and pass them into the movie model,
        # getting embeddings back.
        positive_hotel_clusters = self.hotel_cluster_model(features['hotel_cluster'])

        # The task computes the loss and the metrics.
        return self.task(user_embeddings, positive_hotel_clusters)

In [17]:
model = ExpediaModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [18]:
# TODO timesplit
tf.random.set_seed(42)
shuffled = dataset.shuffle(n_records, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(int(0.9 * n_records))
test = shuffled.skip(int(0.9* n_records)).take(int(0.1 * n_records))

cached_train = train.batch(8192).cache()
cached_test = test.batch(4096).cache()

In [19]:
fitted_model = model.fit(cached_train, 
          validation_data=cached_test,
          validation_freq=1,
          epochs=2)

Epoch 1/2
Epoch 2/2


In [20]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k': array([0.02544, 0.10766, 0.18688, 0.6587 , 1.     ], dtype=float32),
 'factorized_top_k/top_1_categorical_accuracy': 0.025439999997615814,
 'factorized_top_k/top_5_categorical_accuracy': 0.10766000300645828,
 'factorized_top_k/top_10_categorical_accuracy': 0.18688000738620758,
 'factorized_top_k/top_50_categorical_accuracy': 0.6586999893188477,
 'factorized_top_k/top_100_categorical_accuracy': 1.0,
 'loss': 6107.49609375,
 'regularization_loss': 0,
 'total_loss': 6107.49609375}

In [None]:
accuracy = fitted_model.history["val_factorized_top_k/top_5_categorical_accuracy"][-1]
print(f"Top-5 accuracy: {accuracy:.2f}.")

num_validation_runs = len(fitted_model.history["val_factorized_top_k/top_5_categorical_accuracy"])
epochs = [(x + 1)* 1 for x in range(num_validation_runs)]

plt.plot(epochs, fitted_model.history["val_factorized_top_k/top_5_categorical_accuracy"], label="simple model")
plt.title("Accuracy vs epoch")
plt.xlabel("epoch")
plt.ylabel("Top-5 accuracy");
plt.legend();

In [167]:
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

In [170]:
# Save model
query_path = os.path.join('./models/', f"{filename}_package_query_model")
model.user_model.save(query_path)

candidate_path = os.path.join('./models/', f"{filename}_package_candidate_model")
model.hotel_cluster_model.save(candidate_path)

In [None]:
# Load model
filename = 'train_sample'
query_path = os.path.join('./models/', f"{filename}_package_query_model")
user_model = tf.keras.models.load_model(query_path)

candidate_path = os.path.join('./models/', f"{filename}_package_candidate_model")
candidate_model = tf.keras.models.load_model(candidate_path)
query_embedding = user_model(tf.constant(["10"]))

## Predictions

In [None]:
df = pd.read_csv(data_dir + 'test.csv')

In [29]:
final_test_set = create_tf_test_dataset(df)
final_test_set_cached = final_test_set.batch(4096)

### Annoy

In [21]:
from annoy import AnnoyIndex

index = AnnoyIndex(embedding_dimension, "dot")

In [22]:
hotel_embeddings = hotels.enumerate().map(lambda idx, cluster: (idx, cluster, model.hotel_cluster_model(cluster)))
hotel_embeddings

<MapDataset shapes: ((), (), (32,)), types: (tf.int64, tf.string, tf.float32)>

In [23]:
# We unbatch the dataset because Annoy accepts only scalar (id, embedding) pairs.
for hotel_id, _, hotel_embedding in hotel_embeddings.as_numpy_iterator():
    index.add_item(hotel_id, hotel_embedding)

# Build a 10-tree ANN index.
index.build(10)

True

In [56]:
results = []

for i, row in enumerate(final_test_set.batch(1).take(len(final_test_set))):
    query_embedding = model.user_model(row)[0]
    results.append(index.get_nns_by_vector(query_embedding, 5))
    if i % 10_000 == 0:
        print(f"Parsed {i} users.")

Parsed 0 users.
Parsed 10000 users.
Parsed 20000 users.
Parsed 30000 users.
Parsed 40000 users.
Parsed 50000 users.
Parsed 60000 users.
Parsed 70000 users.
Parsed 80000 users.
Parsed 90000 users.
Parsed 100000 users.
Parsed 110000 users.
Parsed 120000 users.
Parsed 130000 users.
Parsed 140000 users.
Parsed 150000 users.
Parsed 160000 users.
Parsed 170000 users.
Parsed 180000 users.
Parsed 190000 users.
Parsed 200000 users.
Parsed 210000 users.
Parsed 220000 users.
Parsed 230000 users.


KeyboardInterrupt: 

In [60]:
import pickle

with open("./results_230_000.txt", "wb") as fp:   #Pickling
    pickle.dump(results, fp)

In [57]:
len(results)

238897

In [39]:
results

[[43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [80, 44, 73, 27, 84],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [36, 15, 43, 62, 81],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [80, 44, 73, 27, 84],
 [43, 93, 59, 90, 5],
 [80, 44, 73, 27, 84],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [89, 61, 20, 60, 14],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [43, 93, 59, 90, 5],
 [80, 44, 73, 27, 84],
 [83, 13, 16, 33, 23],
 [9

### Brute Force

In [41]:
# Create a model that takes in raw query features, and
index_bruteforce = tfrs.layers.ann.BruteForce(query_model=model.user_model)

hotel_cluster_embeddings = hotels.batch(100).map(model.hotel_cluster_model)

# recommends hotels from all hotel clusters.
index_bruteforce.index(candidates=hotel_cluster_embeddings, 
                      identifiers=hotels)

In [52]:
results_bruteforce = []

for row in final_test_set.batch(1).take(5_00):
    query_embedding = model.user_model(row)[0]
    _, hotel_cluster = index_bruteforce(queries=row)
    results_bruteforce.append([int(i) for i in hotel_cluster.numpy()[0][:5]])

In [53]:
results_bruteforce

[[43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [80, 44, 73, 27, 53],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [36, 15, 43, 62, 42],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [80, 44, 73, 27, 53],
 [43, 74, 93, 59, 90],
 [80, 44, 73, 27, 53],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [89, 93, 51, 79, 61],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [43, 74, 93, 59, 90],
 [80, 44, 7

## Archive

In [23]:
_, hotel_cluster = index(queries={'user_id':tf.constant(["472333"]), 
                                   'is_package': tf.constant([1])})
print(f"Recommendations for user 472333: {hotel_cluster[0, :3]}")

Consider rewriting this model with the Functional API.
Recommendations for user 472333: [b'15' b'10' b'76']


In [24]:
users_to_predict

Unnamed: 0,user_id,is_package
0,1,0
1,1,0
2,20,0
3,28,1
4,50,0
...,...,...
2528238,1198754,0
2528239,1198758,0
2528240,1198771,0
2528241,1198775,0


In [None]:
results = []

for i, user in users_to_predict.iterrows():
    _, hotel_clusters = index(queries={'user_id':tf.constant([str(user['user_id'])]), 
                                       'is_package': tf.constant([user['is_package']])})
    results.append({'user_id': user['user_id'], 'hotel cluster' : [int(i) for i in hotel_clusters[0, :5].numpy()]})
    if i % 1_000 == 0:
        print(f"Parsed {i} users.")

In [38]:
pd.DataFrame(results)

Unnamed: 0,user_id,hotel cluster
0,1,"[5, 83, 90, 9, 6]"
1,1,"[5, 83, 90, 9, 6]"
2,20,"[5, 83, 90, 9, 6]"
3,28,"[80, 73, 87, 52, 0]"
4,50,"[5, 83, 90, 9, 6]"
...,...,...
995,3212,"[5, 83, 90, 9, 6]"
996,3220,"[5, 83, 90, 9, 6]"
997,3221,"[3, 30, 61, 53, 29]"
998,3221,"[3, 53, 61, 30, 20]"
