# Wide-and-Deep ML: Model Preparation

In this notebook, we train and evaluate the wide-and-deep collaborative filtering recommender using features engineered in the prior notebook.

In [6]:
# !pip3 install tensorflow

In [2]:
# import required libraries

import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization, StringLookup
import math

In [3]:
# directories that save stuff I don't fully understand
CHECKPOINT_PATH = './tmp/model_checkpoint'
EXPORT_PATH = './tmp/model_export'

## 1. Prepare the data

### 1.1. Load the data

In [75]:
# save models
train_df = pd.read_csv('../data/user_movie_interaction_train.csv')
val_df = pd.read_csv('../data/user_movie_interaction_val.csv')
test_df = pd.read_csv('../data/user_movie_interaction_train.csv')

In [76]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,title,genres,avg_movie_rating,user_all_genres
0,23695,442,51662,0.4,300 (2007),action fantasy war imax,0.721622,fantasy sci-fi mystery animation documentary w...
1,37754,417,1027,0.4,Robin Hood: Prince of Thieves (1991),adventure drama,0.610526,fantasy sci-fi musical horror mystery western ...
2,18178,394,45499,0.5,X-Men: The Last Stand (2006),action sci-fi thriller,0.638095,sci-fi drama children thriller western film-no...
3,33268,271,60609,0.9,Death Note (2006),adventure crime drama horror mystery,0.9,sci-fi drama children thriller western film-no...
4,47465,489,3301,0.6,"Whole Nine Yards, The (2000)",comedy crime,0.641667,sci-fi drama children thriller western film-no...


In [77]:
# add dataframes to list for convenience
df_list = [train_df, val_df, test_df]

In [78]:
# drop unnecessary columns
for df in df_list:
    df.drop(['Unnamed: 0'], axis=1, inplace=True)

train_df.head()

Unnamed: 0,userId,movieId,rating,title,genres,avg_movie_rating,user_all_genres
0,442,51662,0.4,300 (2007),action fantasy war imax,0.721622,fantasy sci-fi mystery animation documentary w...
1,417,1027,0.4,Robin Hood: Prince of Thieves (1991),adventure drama,0.610526,fantasy sci-fi musical horror mystery western ...
2,394,45499,0.5,X-Men: The Last Stand (2006),action sci-fi thriller,0.638095,sci-fi drama children thriller western film-no...
3,271,60609,0.9,Death Note (2006),adventure crime drama horror mystery,0.9,sci-fi drama children thriller western film-no...
4,489,3301,0.6,"Whole Nine Yards, The (2000)",comedy crime,0.641667,sci-fi drama children thriller western film-no...


### 1.2. Preprocess raw features and make Embeddings with Keras preprocessing layers.

This process involves:
- normalizing numerical features
- turning categorical features into embeddings
- tokenizing textual features to translate them into embeddings

#### 1.2.1. Converting primary features into categorical data

In [86]:
# convert id features into string data to allow
# tokenization with keras
for df in df_list:
    id_cols = df.columns[df.columns.str.contains('Id')].tolist()
    for col in id_cols:
        df[col] = df[col].astype('category').astype('string')

train_df.dtypes

userId              string[python]
movieId             string[python]
rating                     float64
title                       object
genres                      object
avg_movie_rating           float64
user_all_genres             object
dtype: object

The `StringLookup` layer is a non-trainable layer and its *state*, the vocabulary, must be constructed and set before training in a step called "adaptation." It includes one or more unknown - or 'out of vocabulary,' OOV - tokens which allows the layer to handle categorical values that are not in the it, and consequently, ensures that the model can continue to learn using features that have not been seen during vocabulary construction.

In [None]:
# make a keras string lookup layer
userId_lookup_layer = StringLookup(mask_token=None)
movieId_lookup_layer = StringLookup(mask_token=None)

for df in df_list:
    userId_lookup_layer.adapt(df['userId'])
    movieId_lookup_layer.adapt(df['movieId'])

# verify tokenization
userId_lookup_layer.get_vocabulary()[:10]

In [None]:
userId_lookup_layer(train_df['userId'])

#### 1.2.2. Tokenize textual features and translate them into embeddings

In [47]:
# Keras TextVectorization layer turns raw string data into an encoded
# representation that can be read by an embedding or dense layer

# get all columns with string data
str_cols = df.select_dtypes(include=['object']).columns.tolist()
str_cols.remove('title')

for df in df_list:
    for col_name in str_cols:
        vectorizer = TextVectorization()
        vectorizer.adapt(df[col_name])

# verify tokenization
print(f'vocabulary[0:10]: {vectorizer.get_vocabulary()[:10]}')
vectorizer(train_df['genres'])

vocabulary[0:10]: ['', '[UNK]', 'thriller', 'drama', 'comedy', 'action', 'romance', 'adventure', 'crime', 'scifi']


<tf.Tensor: shape=(45794, 7), dtype=int64, numpy=
array([[ 5, 10, 12, ...,  0,  0,  0],
       [ 7,  3,  0, ...,  0,  0,  0],
       [ 5,  9,  2, ...,  0,  0,  0],
       ...,
       [ 5,  4,  9, ...,  0,  0,  0],
       [ 2,  0,  0, ...,  0,  0,  0],
       [ 4,  6,  0, ...,  0,  0,  0]])>

In [66]:
# combine the numerical and vectorized data to create a
# tensorflow dataset

tfd_list = ['train', 'test', 'val']
i = 0

for df in df_list:
    userId = userId_lookup_layer(df['userId'])
    movieId = userId_lookup_layer(df['movieId'])
    rating = df['rating']
    avg_movie_rating = df['avg_movie_rating']
    genres = vectorizer(df['genres'])
    user_all_genres = vectorizer(df['user_all_genres'])
    
    skibidi = (userId, movieId, rating, avg_movie_rating, genres, user_all_genres)
    
    globals()[f'{tfd_list[i]}_tf_dataset'] = tf.data.Dataset.from_tensor_slices(skibidi)
    i += 1

      userId movieId  rating  \
0        442   51662     0.4   
1        417    1027     0.4   
2        394   45499     0.5   
3        271   60609     0.9   
4        489    3301     0.6   
...      ...     ...     ...   
45789    170    2300     0.8   
45790     11   67295     0.8   
45791    422    5459     0.6   
45792    334     457     1.0   
45793    151    2706     0.2   

                                                   title  \
0                                             300 (2007)   
1                   Robin Hood: Prince of Thieves (1991)   
2                           X-Men: The Last Stand (2006)   
3                                      Death Note (2006)   
4                           Whole Nine Yards, The (2000)   
...                                                  ...   
45789                              Producers, The (1968)   
45790  Kung Fu Panda: Secrets of the Furious Five (2008)   
45791  Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...   
45792          

## 2. Create the model

In [37]:
# calculate number of examples
user_num = len(train_df['userId'].unique())
movie_num = len(train_df['movieId'].unique())
genre_num = len(train_df['genres'].unique())
user_all_genres_num = len(train_df['genres'].unique())

# calculate manually added embedding dimensions
user_dim = int(round(math.pow(user_num, 1/3)))
movie_dim = int(round(math.pow(movie_num, 1/3)))
genre_dim = int(round(math.pow(genre_num, 1/3)))
user_all_genres_dim = int(round(math.pow(user_all_genres_num, 1/3)))

# variables to define wide and deep columns from the dataset
LABEL_COL = 'title'

CATEGORICAL_COLS = [
    'userId',
    'movieId'
]

NUMERIC_COLS = [
    'rating',
    'avg_movie_rating'
]

TEXT_COLS = [
    'genres',
    'user_all_genres'
]

HASH_BUCKET_SIZES = {
    'userId': user_num,
    'movieId': movie_num,
    'genres': genre_num,
    'user_all_genres': user_all_genres_num
}

EMBEDDING_DIMENSIONS = {
    'userId': user_dim,
    'movieId': movie_dim,
    'genres': genre_dim,
    'user_all_genres': user_all_genres_dim,
}

# define wide and deep columns
def get_wide_and_deep_columns():
    wide_cols, deep_cols = [], []
    text_buckets = []
    numeric_cols, numeric_buckets = [], []
    cat_hash_bucket_size = genre_num * genre_dim
    l, r = (3/5, 4.5/5)

    # categorical embedding columns
    for col_name in CATEGORICAL_COLS:
        categorical_col = tf.feature_column.categorical_column_with_identity(
            col_name,
            num_buckets = HASH_BUCKET_SIZES[col_name])
        wrapped_col = tf.feature_column.embedding_column(
            categorical_col,
            dimension = EMBEDDING_DIMENSIONS[col_name],
            combiner = 'sqrtn')
        wide_cols.append(categorical_col)
        deep_cols.append(wrapped_col)

    # text data embedding
    for col_name in TEXT_COLS:
        text_col = tf.feature_column.categorical_column_with_identity(
            col_name,
            num_buckets = HASH_BUCKET_SIZES[col_name])
        wrapped_col = tf.feature_column.embedding_column(
            categorical_col,
            dimension = EMBEDDING_DIMENSIONS[col_name],
            combiner = 'sqrtn')
        text_buckets.append(col_name)
        wide_cols.append(text_col)
        deep_cols.append(wrapped_col)

    # numeric columns
    for col_name in NUMERIC_COLS:
        col_name = tf.feature_column.numeric_column(
            col_name,
            shape = (1,),
            dtype = tf.float32)
        col_buckets = tf.feature_column.bucketized_column(
            col_name,
            boundaries=[l, r])
        numeric_cols.append(col_name)
        numeric_buckets.append(col_buckets)
        deep_cols.append(col_name)

    # cross numeric columns, text data columns
    numeric_cols_crossed = tf.feature_column.crossed_column(numeric_buckets, 12)
    text_cols_crossed = tf.feature_column.crossed_column(text_buckets, cat_hash_bucket_size)

    # add buckets and crossed columns to set of wide columns
    wide_cols.extend([numeric_buckets, numeric_cols_crossed, text_cols_crossed])

    return wide_cols, deep_cols

In [38]:
wide_columns, deep_columns = get_wide_and_deep_columns()

In [39]:
wide_columns

[IdentityCategoricalColumn(key='userId', number_buckets=500, default_value=None),
 IdentityCategoricalColumn(key='movieId', number_buckets=6368, default_value=None),
 IdentityCategoricalColumn(key='genres', number_buckets=766, default_value=None),
 IdentityCategoricalColumn(key='user_all_genres', number_buckets=766, default_value=None),
 [BucketizedColumn(source_column=NumericColumn(key='rating', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(0.6, 0.9)),
  BucketizedColumn(source_column=NumericColumn(key='avg_movie_rating', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(0.6, 0.9))],
 CrossedColumn(keys=(BucketizedColumn(source_column=NumericColumn(key='rating', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(0.6, 0.9)), BucketizedColumn(source_column=NumericColumn(key='avg_movie_rating', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(0.6, 0.9)))

In [40]:
deep_columns

[EmbeddingColumn(categorical_column=IdentityCategoricalColumn(key='userId', number_buckets=500, default_value=None), dimension=8, combiner='sqrtn', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x7f62ec3174c0>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True, use_safe_embedding_lookup=True),
 EmbeddingColumn(categorical_column=IdentityCategoricalColumn(key='movieId', number_buckets=6368, default_value=None), dimension=19, combiner='sqrtn', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x7f62ec317b50>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True, use_safe_embedding_lookup=True),
 EmbeddingColumn(categorical_column=IdentityCategoricalColumn(key='movieId', number_buckets=6368, default_value=None), dimension=9, combiner='sqrtn', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x7f62ec317310>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=Non

### 1.2. Define the wide-and-deep model

- for some reason, we don't use brackets for the linear optimizers. Find out later (we do?)
- we're using the `Ftrl` and `Adagrad` optimizers that have their default learning rate and other parameters that we can modify later.

In [42]:
# adapted from https://www.tensorflow.org/api_docs/python/tf/estimator/DNNLinearCombinedClassifier
estimator = tf.estimator.DNNLinearCombinedClassifier(
    # wide settings
    linear_feature_columns=wide_columns,
    linear_optimizer=tf.keras.optimizers.Ftrl(),

    # deep settings
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[100, 50],
    dnn_optimizer=tf.keras.optimizers.Adagrad(),

    # warm-start settings
    model_dir=CHECKPOINT_PATH
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './tmp/model_checkpoint', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


### 1.3. Create the custom metric

In [43]:
# adapted from: https://github.com/NVIDIA/DeepLearningExamples/blob/master/TensorFlow/Recommendation/WideAndDeep/utils/metrics.py
def map_custom_metric(features, labels, predictions):
    user_ids = tf.reshape(features['userId'], [-1])
    predictions = predictions['probabilities'][:, 1]
    
    # Processing unique userIds, indices and counts
    # Sorting needed in case the same userId occurs in two different places
    sorted_ids = tf.argsort(user_ids)
    user_ids = tf.gather(user_ids, indices=sorted_ids)
    predictions = tf.gather(predictions, indices=sorted_ids)
    labels = tf.gather(labels, indices=sorted_ids)
    
    _, user_ids_idx, user_ids_movies_count = tf.unique_with_counts(user_ids, out_idx=tf.int64)
    pad_length = 30 - tf.reduce_max(user_ids_movies_count)
    pad_fn = lambda x: tf.pad(x, [(0, 0), (0, pad_length)])
    
    preds = tf.RaggedTensor.from_value_rowids(predictions, user_ids_idx).to_tensor()
    labels = tf.RaggedTensor.from_value_rowids(labels, user_ids_idx).to_tensor()
    
    labels = tf.argmax(labels, axis=1)
    
    return {
        'map': tf.compat.v1.metrics.average_precision_at_k(
            predictions=pad_fn(preds),
            labels=labels,
            k=5,
            name="streaming_map"
        )}

In [45]:
estimator = tf.estimator.add_metrics(estimator, map_custom_metric)

INFO:tensorflow:Using config: {'_model_dir': './tmp/model_checkpoint', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


### 3. Train the model

## Future development
- use libraries like `PorterStemmer` that allows counter vectorization to find word associations in big paragraphs.
- `sklearn`'s `CountVectorizer` does the vectorization, and `cosine_similarity` computes how closely the vectorized words relate.