# Hybrid NCF - Construcción

## Importando librerías

In [None]:
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras

## for machine learning
from sklearn import metrics, preprocessing
from tensorflow.keras.optimizers import Adam
## for deep learning
from tensorflow.keras import models, layers, utils , regularizers, initializers



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pyarrow
from sklearn.model_selection import train_test_split

## Lectura de datos

In [None]:
# Load the modelling dataset from its Parquet file using the pyarrow engine.
df = pd.read_parquet("/data/data_modelo_deep_learning.parquet", engine="pyarrow")

In [None]:
# Column names shared by the rest of the notebook.
df_user_idx = "user_idx"      # user index column (fed to the user embeddings)
df_item_idx = "item_idx"      # item index column (fed to the product embeddings)
df_target = "PURCHASE_PRED"   # regression target column
# Columns fed to the dense-feature input of the network.
# NOTE(review): the single '...' entry looks like a placeholder for the real
# feature list -- confirm before running.
deep_columns = ["..."]

## Entrenamiento de modelo

In [None]:
# Dataset split: hold out 20% of the rows as the test set, stratified on the
# user index column.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df[df_user_idx],
)
# Take 10% of the remaining rows as the validation set, again stratified on
# the user index column.
train, val = train_test_split(
    train,
    test_size=0.1,
    random_state=42,
    stratify=train[df_user_idx],
)

In [None]:
# Sizes used to build the deep-learning model: one more than the number of
# distinct user / item indices (embedding vocabulary sizes) and the number of
# dense feature columns.
usr = len(df[df_user_idx].unique()) + 1
prd = len(df[df_item_idx].unique()) + 1
deep = len(deep_columns)
print(usr, prd, deep)

18260 221 53


In [None]:
# Training hyperparameters.
batch_size = 64
EPOCHS = 10
embeddings_size = 50
# Lower and upper bounds of the cyclical learning-rate schedule.
initial_lr = 1e-5
max_lr = 1e-4

In [None]:
# Cyclical learning-rate schedule oscillating between initial_lr and max_lr.
# steps_per_epoch counts the whole batches that fit in the training split.
steps_per_epoch = len(train) // batch_size
clr = tfa.optimizers.CyclicalLearningRate(
    initial_learning_rate=initial_lr,
    maximal_learning_rate=max_lr,
    step_size=2 * steps_per_epoch,
    # Scale factor 1 / 2**(x - 1); x is whatever TFA passes to scale_fn
    # (presumably the cycle number -- confirm against the TFA docs).
    scale_fn=lambda x: 1 / (2.0 ** (x - 1)),
)

In [None]:
# Results are stored on a callback object so the per-epoch lists stay available
# after training finishes.
class CustomCallback(keras.callbacks.Callback):
    """Keras callback that records loss and RMSE at the end of every epoch.

    Attributes:
        train_epoch_loss: per-epoch value of ``logs["loss"]``.
        val_epoch_loss: per-epoch value of ``logs["val_loss"]``.
        train_epoch_error: per-epoch value of ``logs["root_mean_squared_error"]``.
        val_epoch_error: per-epoch value of ``logs["val_root_mean_squared_error"]``.

    A key missing from ``logs`` is recorded as ``None`` so the four lists
    always have the same length.
    """

    def __init__(self):
        # Let the Keras base class set up its own state before adding ours.
        super().__init__()
        self.train_epoch_error = []
        self.val_epoch_error = []
        self.train_epoch_loss = []
        self.val_epoch_loss = []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics (``None`` for absent keys) to the lists."""
        # ``logs`` defaults to None in the signature; treat that as "no metrics".
        logs = logs or {}
        self.train_epoch_loss.append(logs.get("loss"))
        self.val_epoch_loss.append(logs.get("val_loss"))
        self.train_epoch_error.append(logs.get("root_mean_squared_error"))
        self.val_epoch_error.append(logs.get("val_root_mean_squared_error"))

In [None]:
# Hybrid NCF architecture: a matrix-factorization branch (element-wise product
# of user and item embeddings) concatenated with a dense branch (separate
# user/item embeddings plus the dense feature columns), followed by a single
# linear output unit.

# Input layers: one user index, one item index, and `deep` dense features.
xusers_in = layers.Input(name="xusers_in", shape=(1,))
xproducts_in = layers.Input(name="xproducts_in", shape=(1,))
# Created once only: a second Input with the same name would be a dead duplicate.
xfeatures_deep = layers.Input(name="features_deep", shape=(deep,))

# A) Matrix factorization branch

# User embedding, reshaped from (1, embeddings_size) to (embeddings_size,)
xusers_emb = layers.Embedding(
    name="xusers_emb",
    input_dim=usr,
    output_dim=embeddings_size,
    embeddings_regularizer=keras.regularizers.l2(1e-6),
)(xusers_in)
xusers = layers.Reshape(name='xusers', target_shape=(embeddings_size,))(xusers_emb)

# Item embedding, reshaped from (1, embeddings_size) to (embeddings_size,)
xproducts_emb = layers.Embedding(
    name="xproducts_emb",
    input_dim=prd,
    output_dim=embeddings_size,
    embeddings_regularizer=keras.regularizers.l2(1e-6),
)(xproducts_in)
xproducts = layers.Reshape(name='xproducts', target_shape=(embeddings_size,))(xproducts_emb)

# Element-wise product of the user and item embeddings
xx = layers.multiply([xusers, xproducts])

# B) Neural-network branch

# Its own user embedding (not shared with the factorization branch)
nn_xusers_emb = layers.Embedding(
    name="nn_xusers_emb",
    input_dim=usr,
    output_dim=embeddings_size,
    embeddings_regularizer=keras.regularizers.l2(1e-6),
)(xusers_in)
nn_xusers = layers.Reshape(name='nn_xusers', target_shape=(embeddings_size,))(nn_xusers_emb)

# Its own item embedding
nn_xproducts_emb = layers.Embedding(
    name="nn_xproducts_emb",
    input_dim=prd,
    output_dim=embeddings_size,
    embeddings_regularizer=keras.regularizers.l2(1e-6),
)(xproducts_in)
nn_xproducts = layers.Reshape(name='nn_xproducts', target_shape=(embeddings_size,))(nn_xproducts_emb)

# Concatenate both embeddings with the dense features, then a 128-64-32 ReLU stack
nn_xx_1 = layers.Concatenate()([nn_xusers, nn_xproducts, xfeatures_deep])
nn_xx_2 = layers.Dense(
    name="nn_xx_1",
    units=128,
    activation='relu',
    kernel_initializer=initializers.GlorotUniform(seed=1),
    activity_regularizer=keras.regularizers.L2(1e-6),
)(nn_xx_1)
nn_xx_3 = layers.Dense(
    name="nn_xx_2",
    units=64,
    activation='relu',
    kernel_initializer=initializers.GlorotUniform(seed=2),
    activity_regularizer=keras.regularizers.L2(1e-6),
)(nn_xx_2)
nn_xx_4 = layers.Dense(
    name="nn_xx_3",
    units=32,
    activation='relu',
    kernel_initializer=initializers.GlorotUniform(seed=3),
    activity_regularizer=keras.regularizers.L2(1e-6),
)(nn_xx_3)

# C) Concatenate both branches and produce the output
nn_xx = layers.Concatenate()([nn_xx_4, xx])

# Single linear output unit
y_out = layers.Dense(name="y_out", units=1, activation='linear')(nn_xx)

# Compile: Adam driven by the cyclical schedule, Huber loss, RMSE metric
optimizer = Adam(learning_rate=clr)
# NOTE(review): this EarlyStopping callback is created here, but the visible
# model.fit call passes only callback_results; add `callback` to its
# `callbacks` list if early stopping is wanted.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)
# The model name avoids spaces so it is usable as a TensorFlow scope name.
model = models.Model(
    inputs=[xusers_in, xproducts_in, xfeatures_deep],
    outputs=y_out,
    name="Hybrid_NCF",
)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.Huber(delta=0.75),
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

In [None]:
# Create the CustomCallback instance that will collect the per-epoch
# loss / RMSE lists during training.
callback_results = CustomCallback()

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

Model: "CollaborativeFiltering"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 xusers_in (InputLayer)         [(None, 1)]          0           []                               
                                                                                                  
 xproducts_in (InputLayer)      [(None, 1)]          0           []                               
                                                                                                  
 nn_xusers_emb (Embedding)      (None, 1, 50)        913000      ['xusers_in[0][0]']              
                                                                                                  
 nn_xproducts_emb (Embedding)   (None, 1, 50)        11050       ['xproducts_in[0][0]']           
                                                                             

In [None]:
# Train for EPOCHS epochs, validating on the `val` split after each one and
# recording the per-epoch metrics through callback_results.
train_inputs = [train[df_user_idx], train[df_item_idx], train[deep_columns]]
val_inputs = [val[df_user_idx], val[df_item_idx], val[deep_columns]]
training = model.fit(
    x=train_inputs,
    y=train[df_target],
    validation_data=(val_inputs, val[df_target]),
    epochs=EPOCHS,
    batch_size=batch_size,
    shuffle=True,
    verbose=1,
    callbacks=[callback_results],
)
# Rebind `model` to the model attached to the object returned by fit.
model = training.model

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Evaluando resultados

In [None]:
# Evaluate the trained model on the held-out test split; `results` holds the
# compiled loss followed by the compiled metric.
eval_inputs = [test[df_user_idx], test[df_item_idx], test[deep_columns]]
results = model.evaluate(eval_inputs, test[df_target], batch_size=batch_size)
print("Prueba huber loss, Prueba rmse:", results)

Evaluate on test data
test huber loss, test rmse: [0.009726803749799728, 0.13473962247371674]


In [None]:
# Per-epoch training and validation results, one row per epoch.
# NOTE: this rebinds `df`, which until now held the dataset read from Parquet.
df = pd.DataFrame(
    {
        'train_loss': callback_results.train_epoch_loss,
        'val_loss': callback_results.val_epoch_loss,
        'train_error': callback_results.train_epoch_error,
        'val_error': callback_results.val_epoch_error,
    }
)

In [None]:
# Single-row frame with the test-set loss and error returned by evaluate.
test_loss, test_error = results[0], results[1]
df_2 = pd.DataFrame([(test_loss, test_error)], columns=['test_loss', 'test_error'])

In [None]:
# Model predictions for every row of the test split.
predict_inputs = [test[df_user_idx], test[df_item_idx], test[deep_columns]]
predicciones = model.predict(predict_inputs)

In [None]:
# Ranking-quality metrics for the model's per-user ordering.
def compute_predictions(df_pred, k=6, user_col=None, target_col=None, prediction_col=None):
    """Compute top-k ranking metrics comparing predicted and observed orderings.

    Within every user, rows are ranked by the observed target
    (``rank_by_client``) and by the model score (``rank_by_model``), both in
    descending order with ties broken by row order. Only users that have a
    row at observed rank ``2 * k`` are evaluated, and for those users only
    the rows whose observed rank is at most ``k``.

    Args:
        df_pred: DataFrame holding the user, target and prediction columns.
            It is not modified; the function works on a copy.
        k: Cut-off rank.
        user_col: User column name; defaults to the module-level ``df_user_idx``.
        target_col: Observed target column name; defaults to the module-level
            ``df_target``.
        prediction_col: Model score column name; defaults to the module-level
            ``column_estimation``.

    Returns:
        A list ``[precision_k, accuracy, penalized_ranking, ndcg_k, mrr_ratio]``:
        share of evaluated rows the model also ranks within the top k; share
        whose model rank equals the observed rank; mean of
        ``rank_by_model / rank_by_client`` (0 where the model ranks the row
        lower than observed); mean over users of discounted gain divided by
        ideal discounted gain; and mean per-user MRR divided by mean per-user
        ideal MRR. The values are NaN when no user has ``2 * k`` rows.
    """
    # Fall back to the notebook-level column names when none are supplied.
    user_col = df_user_idx if user_col is None else user_col
    target_col = df_target if target_col is None else target_col
    prediction_col = column_estimation if prediction_col is None else prediction_col

    # Work on a copy so the caller's frame does not gain the helper rank columns.
    ranked = df_pred.copy()
    ranked['rank_by_client'] = ranked.groupby(user_col)[target_col].rank(method='first', ascending=False)
    ranked['rank_by_model'] = ranked.groupby(user_col)[prediction_col].rank(method='first', ascending=False)
    ranked = ranked.sort_values([user_col, 'rank_by_client'], ascending=[True, True])

    # Keep users that reach observed rank 2*k, then their k best observed rows.
    eligible_users = ranked.loc[ranked['rank_by_client'] == int(k * 2), user_col].unique()
    keep = ranked[user_col].isin(eligible_users) & (ranked['rank_by_client'] <= k)
    top_k = ranked[keep].copy()

    actual_rank = top_k['rank_by_client']
    model_rank = top_k['rank_by_model']
    # Graded relevance: 1 for the best observed row, decreasing by 1/k per rank.
    relevance = (k + 1 - actual_rank) / k
    in_top_k = model_rank <= k
    exact_hit = model_rank == actual_rank

    top_k['Discounted_Gain'] = np.where(in_top_k, relevance / np.log2(model_rank + 1), 0)
    top_k['Ideal_Discounted_Gain'] = relevance / np.log2(actual_rank + 1)
    top_k['Precision_k'] = np.where(in_top_k, 1, 0)
    top_k['Accuracy'] = np.where(exact_hit, 1, 0)
    top_k['Penalized_Ranking'] = np.where(actual_rank < model_rank, 0, model_rank / actual_rank)
    top_k['MRR'] = np.where(exact_hit, 1 / actual_rank, 0)
    top_k['IMRR'] = 1 / actual_rank

    per_user = top_k.groupby(user_col).agg(
        {'MRR': 'sum', 'IMRR': 'sum', 'Ideal_Discounted_Gain': 'sum', 'Discounted_Gain': 'sum'}
    )
    per_user['ndcgk'] = per_user['Discounted_Gain'] / per_user['Ideal_Discounted_Gain']
    ndcgk_mean = per_user['ndcgk'].mean()
    mrr_ratio = per_user['MRR'].mean() / per_user['IMRR'].mean()
    return [
        top_k['Precision_k'].mean(),
        top_k['Accuracy'].mean(),
        top_k['Penalized_Ranking'].mean(),
        ndcgk_mean,
        mrr_ratio,
    ]

In [None]:
# Attach the model's test-set predictions to `test` as a new column.
column_estimation = 'Prediction'
test[column_estimation] = predicciones
# Mean absolute error between the observed target and the prediction, using
# the configured column names instead of hard-coded attribute access; as the
# last expression of the cell its value is what the notebook displays.
(test[df_target] - test[column_estimation]).abs().mean()

In [None]:
# Compute the ranking performance metrics on the test split with k=6
# (compute_predictions reads the prediction column named by column_estimation).
compute_predictions(test,k=6)

In [None]:
# Plot the training and validation loss recorded at the end of each epoch.
sns.set_theme()
train_losses = callback_results.train_epoch_loss
val_losses = callback_results.val_epoch_loss
# Each curve gets its own legend label so train and validation can be told apart.
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Hybrid NCF - Entrenamiento')
plt.plot(range(1, len(val_losses) + 1), val_losses, label='Hybrid NCF - Validación')
plt.title('Comparación modelos en entrenamiento - Huber Loss')
# The values are recorded once per epoch, and they are losses rather than scores.
plt.xlabel('Época')
plt.ylabel('Huber Loss')
plt.legend()
plt.show()