# Test RecSys for suggesting answers to customer questions, in order to help senior customer service executives

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing import text
from keras.models import Sequential
from keras.layers import Dense, Dropout, LeakyReLU
from numpy.random import randint, uniform
from tqdm import tqdm
import random
from keras import backend as K
from icecream import ic
from typing import Union
import jellyfish as jello
import pickle

In [None]:
# Business users explicitly asked to provide a list of topics that a client can ask about and the basic set of acceptable answers.
# Keys are topic names. buscar_temas() compares every word of a message against the
# strings in each list, so in practice the values act as the match keywords of the topic.
topics = {
    'topic1' : ['bla', 'blabla', 'blablabla'],
    'some question': ['generic answer', 'specific answer', 'customized answer']
}

In [None]:
# Load the conversation data set from a Parquet file.
input_df = pd.read_parquet("recommender_inputs/real_output.gzip") # one pair of question and answer with metadata per row

In [None]:
# Memo cache filled by buscar_temas(): message text -> topic already assigned to it.
topics_found = {}

In [None]:
def buscar_temas(seq: str) -> str:
    """Assign one of the predetermined topics to a customer message.

    Every space-separated word of ``seq`` is compared with the keywords of each
    entry of the module-level ``topics`` dict (a ``"default"`` key is skipped).
    A word matches a keyword when the keyword is a substring of the lower-cased
    word, or when their Damerau-Levenshtein distance is at most 10% of the word
    length (rounded down). A word votes for at most one topic (the first one
    that matches) and scanning stops after three votes in total.

    Results are memoised in the module-level ``topics_found`` dict.

    Args:
        seq: Raw message text.

    Returns:
        The topic with the most votes (ties go to the topic that received its
        first vote earliest), or ``"default"`` when nothing matched.
    """
    if seq in topics_found:
        return topics_found[seq]

    max_votes = 3
    votes = {}
    total_votes = 0
    for word in seq.split(" "):
        lowered = word.lower()
        # The tolerance is derived from the original word, not the lowered one.
        max_distance = int(len(word) * 0.1)
        for topic, keywords in topics.items():
            if topic == "default":
                continue
            if any(
                keyword in lowered
                or jello.damerau_levenshtein_distance(lowered, keyword) <= max_distance
                for keyword in keywords
            ):
                votes[topic] = votes.get(topic, 0) + 1
                total_votes += 1
                break  # a word votes for a single topic only
        if total_votes == max_votes:
            break

    # max() returns the first maximal key in insertion order, which is the same
    # winner a stable descending sort by vote count would put first.
    tema = max(votes, key=votes.get) if votes else "default"
    topics_found[seq] = tema
    return tema

In [None]:
# Label every row with a topic derived from the text in the "cliente" column.
input_df.loc[:, "Tema"] = input_df.loc[:, "cliente"].apply(buscar_temas)


In [None]:
# Quick visual check of the first rows, including the new "Tema" column.
input_df.head()

In [None]:
def generate_embeddings(output_df_atom:pd.DataFrame, embedding_size=250, vocab_size=10000):
    """Encode customer messages and executive actions as padded token-id matrices.

    One Keras ``Tokenizer`` capped at ``vocab_size`` words is fitted on the
    row-wise concatenation ``"<accion_ejecutivo> <mensaje_cliente>"``; both
    columns are then converted to id sequences and padded/truncated to
    ``embedding_size`` entries per row.

    Args:
        output_df_atom (pd.DataFrame): Frame with the string columns
            "mensaje_cliente" and "accion_ejecutivo".
        embedding_size (int, optional): Length of every padded sequence.
            Defaults to 250.
        vocab_size (int, optional): ``num_words`` given to the tokenizer.
            Defaults to 10000.

    Returns:
        tuple: ``(text_input, text_action, tokenizer)`` - the padded matrix of
        the customer messages, the padded matrix of the executive actions and
        the fitted tokenizer.
    """
    def _prepend_space(message):
        return " " + message

    def _encode(column_name):
        # Text -> id sequences -> fixed-width matrix.
        id_sequences = tokenizer.texts_to_sequences(output_df_atom[column_name])
        return sequence.pad_sequences(id_sequences, maxlen=embedding_size)

    actions = output_df_atom["accion_ejecutivo"]
    spaced_messages = output_df_atom["mensaje_cliente"].apply(_prepend_space)

    tokenizer = text.Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(actions + spaced_messages)

    text_input = _encode("mensaje_cliente")
    text_action = _encode("accion_ejecutivo")
    return text_input, text_action, tokenizer

In [None]:
def sequence_to_text(list_of_indices, word_map=None):
    """Convert a sequence of token ids back into its list of words.

    Args:
        list_of_indices: Iterable of token ids.
        word_map (dict, optional): Mapping ``token id -> word``. When omitted,
            the module-level ``reverse_word_map`` is used (the original
            behaviour), which must then exist before the call.

    Returns:
        list: The words of all ids found in the mapping, in input order. Ids
        without an entry are silently dropped.
    """
    if word_map is None:
        word_map = reverse_word_map
    return [word for word in map(word_map.get, list_of_indices) if word is not None]

In [None]:
def translate(id_msje:int) -> str:
    """Turn a message id back into its plain-text message.

    The id is looked up in the module-level ``originals`` mapping, the stored
    token sequence is cast to int32 and decoded with ``sequence_to_text``.

    Args:
        id_msje (int): Id that represents an encoded message.

    Returns:
        str: The decoded words joined by single spaces.
    """
    token_ids = originals[id_msje].astype(np.int32).tolist()
    return " ".join(sequence_to_text(token_ids))

## Parámetros

In [None]:
SAMPLE_LEN = 250       # width of the dense GAN layers; same value as the default padded length (embedding_size) in generate_embeddings
SAMPLE_SIZE = len(input_df)   # number of rows (question/answer pairs) in the training set
EPOCHS = 100  # passes of the outer GAN training loop
BATCH = 100 # batch size: rows drawn per training step (steps per epoch = SAMPLE_SIZE // BATCH)

## Datos de entrada / Input data

#### Las siguientes celdas son obligatorias

In [None]:
def _row_to_id(values) -> int:
    """Collapse a numeric row into the 9-digit pseudo-id used by ``create_dummy``.

    Every value is truncated to int and written in decimal, the pieces are
    concatenated, and the id is the slice running from 13 to 4 characters
    before the end of that string.
    """
    digits = "".join(str(int(value)) for value in values)
    return int(digits[len(digits) - 13:len(digits) - 4])


def create_dummy(input_df: pd.DataFrame, tema=None) -> "tuple[pd.DataFrame, text.Tokenizer, dict]":
    """Build a user/item/rating frame out of question-answer rows.

    Each customer message (plus its weekday/day/month metadata) plays the role
    of a "user" and each executive answer the role of an "item", so that any
    ordinary recommender can be trained on the result. Both are reduced to an
    integer id of at most nine decimal digits (it fits in a signed 32-bit
    integer) by ``_row_to_id``. Different rows can map to the same id; only the
    first token sequence seen for an id is kept in ``originals``.

    Args:
        input_df: Rows with the columns "mensaje_cliente", "accion_ejecutivo",
            "weekday", "day", "month", "Puntaje encuesta" and "Timestamp".
        tema: Optional topic. When given, only the rows whose "Tema" column
            equals it are used (the column must exist).

    Returns:
        ``(dummy, tokenizer, originals)``: the frame with the columns
        "UserId", "ActionId", "Rating", "Timestamp"; the fitted tokenizer; and
        a dict ``id -> padded token sequence`` covering user and action ids.
    """
    if tema is not None:
        input_df = input_df[input_df["Tema"] == tema]
    input_df = input_df.copy().reset_index()  # local copy with a 0..n-1 index

    text_input, text_action, tokenizer = generate_embeddings(input_df)
    metadata = input_df[["weekday", "day", "month"]].to_numpy()
    input_data = np.concatenate((text_input, metadata), axis=1)

    originals = {}

    # "Users": customer message tokens followed by the metadata columns.
    user_ids = []
    for row, tokens in zip(input_data, text_input):
        user_id = _row_to_id(row.tolist())
        user_ids.append(user_id)
        originals.setdefault(user_id, tokens)

    # "Items": executive answers. An id already taken keeps its first sequence.
    action_ids = []
    for tokens in text_action:
        action_id = _row_to_id(tokens.tolist())
        action_ids.append(action_id)
        originals.setdefault(action_id, tokens)

    dummy = pd.DataFrame(
        columns=["UserId", "ActionId", "Rating", "Timestamp"],
        index=range(input_data.shape[0]),
    )
    dummy["UserId"] = user_ids
    dummy["ActionId"] = action_ids
    dummy["Rating"] = input_df["Puntaje encuesta"]
    dummy["Timestamp"] = input_df["Timestamp"].astype(np.int32)
    return dummy, tokenizer, originals

In [None]:
# create_dummy returns three objects: the ratings frame, the fitted tokenizer and
# the id -> token-sequence map. Unpack them so `dummy` really is the DataFrame
# handed to Spark below and `originals` exists for translate().
dummy, tokenizer, originals = create_dummy(input_df)
# Inverse vocabulary (token id -> word) read by sequence_to_text().
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

# Implementación recsys basado en contenido

In [None]:
# set the environment path to find Recommenders
import sys
import pyspark
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.spark_splitters import spark_random_split
from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation
from recommenders.utils.spark_utils import start_or_get_spark

# Record the interpreter and Spark versions used for this run.
print("System version: {}".format(sys.version))
print("Spark version: {}".format(pyspark.__version__))

In [None]:
# top k items to recommend
TOP_K = 50

# Column names for the dataset (they match the columns produced by create_dummy)
COL_USER = "UserId"
COL_ITEM = "ActionId"
COL_RATING = "Rating"
COL_TIMESTAMP = "Timestamp"

In [None]:
# The following settings work well for debugging locally on a VM - change them when running on a cluster.
# Set up a single large executor with many threads and specify the memory cap.
spark = start_or_get_spark("ALS PySpark", memory="16g")
# Alternative kept for reference: connect to a remote Spark master instead.
# spark = SparkSession.builder.master("spark://192.168.68.63:8080").getOrCreate()
# spark.conf.set("spark.executor.memory", "16g")

In [None]:
# Explicit Spark schema for the four columns of the ratings frame.
schema = StructType(
    (
        StructField(COL_USER, IntegerType()),
        StructField(COL_ITEM, IntegerType()),
        StructField(COL_RATING, FloatType()),
        StructField(COL_TIMESTAMP, LongType()),
    )
)
# Enable Apache Arrow for the pandas -> PySpark DataFrame conversion.
# NOTE(review): newer Spark releases may expect a different key for this option - confirm for the cluster version.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
# Create the Spark DataFrame from the pandas ratings frame.
data = spark.createDataFrame(dummy, schema=schema) # .dropDuplicates(['UserId'])
# data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema)
data.show()

In [None]:
# Random 75/25 split with a fixed seed; cache both parts and print their row counts.
train, test = spark_random_split(data, ratio=0.75, seed=123)
print ("N train", train.cache().count())
print ("N test", test.cache().count())

In [None]:
# Column-name arguments shared with the ALS estimator.
header = {
    "userCol": COL_USER,
    "itemCol": COL_ITEM,
    "ratingCol": COL_RATING,
}


# Explicit-feedback ALS estimator. coldStartStrategy='drop' is passed so that
# rows the model cannot score are removed from its output - see the pyspark ALS docs.
als = ALS(
    rank=10,
    maxIter=15,
    implicitPrefs=False,
    regParam=0.05,
    coldStartStrategy='drop',
    nonnegative=False,
    seed=42,
    **header
)

In [None]:
# Fit ALS on the training split and report the wall-clock time.
with Timer() as train_time:
    model = als.fit(train)

print("Took {} seconds for training.".format(train_time.interval))

In [None]:
# Score every (user, item) combination seen in training and keep only the
# pairs that do not appear in the training set.
with Timer() as test_time:

    # Get the cross join of all user-item pairs and score them.
    users = train.select(COL_USER).distinct()
    items = train.select(COL_ITEM).distinct()
    user_item = users.crossJoin(items)
    # The id columns are renamed so they do not clash with the train columns in the join below.
    dfs_pred = model.transform(user_item).alias("pred").withColumnRenamed('UserId', 'User_id').withColumnRenamed('ActionId', 'Action_id')
    train = train.alias("train")

    # ic(users.count(), items.count(), dfs_pred.count())

    # Remove seen items: after the outer join, pairs that are absent from train have a null Rating.
    dfs_pred_exclude_train = dfs_pred.alias("pred").join(
        train.alias("train"),
        [dfs_pred.User_id.alias("User_id") == train.UserId, dfs_pred.Action_id.alias("Action_id") == train.ActionId],
        how='outer'
    ).select("User_id", "Rating", "Action_id", "prediction")# .withColumnRenamed(train.ActionId, 'Action_Id')

    # Restore the canonical column names.
    dfs_pred_exclude_train = dfs_pred_exclude_train.withColumnRenamed('User_id', COL_USER).withColumnRenamed('Action_id', COL_ITEM).alias("pred")

    # Keep the rows with a null Rating, i.e. the unseen pairs.
    top_all = dfs_pred_exclude_train.filter(dfs_pred_exclude_train[f"{COL_RATING}"].isNull()) \
        .select('pred.' + COL_USER, 'pred.' + COL_ITEM, 'pred.' + "prediction")

    # In Spark, transformations are lazily evaluated.
    # Use an action to force execution and measure the test time.
    top_all.cache().count()

print("Took {} seconds for prediction.".format(test_time.interval))

In [None]:
# Preview the first 20 scored, unseen user-item pairs.
top_all.show(20)

In [None]:
# Preview the held-out split used for evaluation.
test.show()

In [None]:
# Ranking evaluator: compares the held-out split with the scored unseen pairs, at k = TOP_K.
rank_eval = SparkRankingEvaluation(test, top_all, k = TOP_K, col_user=COL_USER, col_item=COL_ITEM,
                                    col_rating=COL_RATING, col_prediction="prediction",
                                    relevancy_method="top_k")

In [None]:
# Print the ranking metrics, one per line.
print("Model:\tALS",
      "Top K:\t%d" % rank_eval.k,
      "MAP:\t%f" % rank_eval.map_at_k(),
      "NDCG:\t%f" % rank_eval.ndcg_at_k(),
      "Precision@K:\t%f" % rank_eval.precision_at_k(),
      "Recall@K:\t%f" % rank_eval.recall_at_k(), sep='\n')

In [None]:
# Generate predicted ratings for the held-out pairs; cache the result and display a sample.
prediction = model.transform(test)
prediction.cache().show()

In [None]:
pd_predictions = prediction.toPandas()
# Replace the numeric ids by their decoded text. The columns are replaced
# outright instead of written through `.loc[:, col]`, because storing strings
# in place into an integer column is deprecated upcasting in recent pandas.
pd_predictions["ActionId"] = pd_predictions["ActionId"].apply(translate)
pd_predictions["UserId"] = pd_predictions["UserId"].apply(translate)

In [None]:
# Inspect the decoded question / answer / predicted rating rows.
pd_predictions.head()

In [None]:
# Rating evaluator: compares the true ratings of the held-out split with the predicted ones.
rating_eval = SparkRatingEvaluation(test, prediction, col_user=COL_USER, col_item=COL_ITEM, 
                                    col_rating=COL_RATING, col_prediction="prediction")

# Print the rating-prediction metrics, one per line.
print("Model:\tALS rating prediction",
      "RMSE:\t%f" % rating_eval.rmse(),
      "MAE:\t%f" % rating_eval.mae(),
      "Explained variance:\t%f" % rating_eval.exp_var(),
      "R squared:\t%f" % rating_eval.rsquared(), sep='\n')

# Implementación Bi VAE (adapted from docs)

In [None]:
import sys
import os
import torch
import cornac
import papermill as pm
import scrapbook as sb
import pandas as pd
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED

# Record the interpreter, PyTorch and Cornac versions used for this run.
print("System version: {}".format(sys.version))
print("PyTorch version: {}".format(torch.__version__))
print("Cornac version: {}".format(cornac.__version__))

In [None]:

# top k items to recommend (re-declares the value set for the ALS section)
TOP_K = 50

# Model parameters, forwarded to cornac.models.BiVAECF inside BiVAE_suite
LATENT_DIM = 50            # passed as k
ENCODER_DIMS = [100]       # passed as encoder_structure
ACT_FUNC = "tanh"          # passed as act_fn
LIKELIHOOD = "pois"        # passed as likelihood
NUM_EPOCHS = 500           # passed as n_epochs
BATCH_SIZE = 128           # passed as batch_size
LEARNING_RATE = 0.001      # passed as learning_rate

In [None]:
def _dump_pickle(obj, path: str) -> None:
    """Pickle ``obj`` to ``path``; the file is closed even if dumping fails."""
    with open(path, 'wb') as file:
        pickle.dump(obj, file)


def BiVAE_suite(input_df:pd.DataFrame, save: bool, predict: bool) -> dict:
    """Train one BiVAE model per conversation topic.

    For every key of the module-level ``topics`` dict, the rows of ``input_df``
    whose "Tema" column equals that topic are encoded with ``create_dummy``,
    split 75/25 and used to fit a ``cornac.models.BiVAECF`` configured from the
    module-level hyper-parameters. Topics with fewer than 4 rows are skipped.

    Args:
        input_df (pd.DataFrame): Preprocessed data; must contain a "Tema" column.
        save (bool): True to pickle the model, tokenizer and id map of each
            topic into the working directory (do NOT use in Colab).
        predict (bool): True to rank all items with the freshly trained model
            and print MAP / NDCG / Precision@K / Recall@K (memory hungry).

    Returns:
        dict: ``topic -> (bivae, tokenizer, originals)`` where ``tokenizer`` is
        the one used to encode the sentences and ``originals`` maps the
        shortened ids back to token sequences.
    """
    column_names = {'UserId': 'userID', 'Rating': 'rating', 'ActionId': 'itemID'}
    suite = {}
    for tema in topics:
        print("---------------------- Topic to train: {} ----------------------".format(tema))
        # Filter before the call so create_dummy is used in its one-argument form.
        topic_rows = input_df[input_df["Tema"] == tema]
        if len(topic_rows) < 4:
            print("Topic {} has less than 4 message pairs...".format(tema))
            continue

        dummy, tokenizer, originals = create_dummy(topic_rows)
        train, test = python_random_split(dummy, 0.75)
        # rename() returns new frames, so the split results are not mutated in place.
        train = train.rename(columns=column_names)
        test = test.rename(columns=column_names)
        train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=SEED)

        print('Number of users: {}'.format(train_set.num_users))
        print('Number of items: {}'.format(train_set.num_items))
        bivae = cornac.models.BiVAECF(
            k=LATENT_DIM,
            encoder_structure=ENCODER_DIMS,
            act_fn=ACT_FUNC,
            likelihood=LIKELIHOOD,
            n_epochs=NUM_EPOCHS,
            batch_size=BATCH_SIZE,
            learning_rate=LEARNING_RATE,
            seed=SEED,
            use_gpu=torch.cuda.is_available(),
            verbose=True
        )

        with Timer() as t:
            bivae.fit(train_set)
        print("Topic {}: Took {} seconds for training.".format(tema, t))

        if save:
            _dump_pickle(bivae, 'BiVAE_{}.pkl'.format(tema))
            _dump_pickle(tokenizer, 'tokenizer_{}.pkl'.format(tema))
            _dump_pickle(originals, 'originals_{}.pkl'.format(tema))
        suite[tema] = (bivae, tokenizer, originals)

        if predict:
            with Timer() as t:
                all_predictions = predict_ranking(bivae, train, usercol='userID', itemcol='itemID', remove_seen=True)
            print("Took {} seconds for prediction.".format(t))
            # NOTE(review): truncating the scores to integers creates ties in
            # the ranking - confirm this is intended before relying on the metrics.
            all_predictions.loc[:,"prediction"] = all_predictions.prediction.astype(np.int32)
            all_predictions = all_predictions.rename(columns={'ActionId': 'itemID', 'UserId': 'userID'})
            test.loc[:,"rating"] = test["rating"].astype(np.int32)
            eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
            eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
            eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
            eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)

            print("MAP:\t%f" % eval_map,
                    "NDCG:\t%f" % eval_ndcg,
                    "Precision@K:\t%f" % eval_precision,
                    "Recall@K:\t%f" % eval_recall, sep='\n')
    return suite

In [None]:
# Train one model per topic without saving anything to disk (save=False) and print the ranking metrics (predict=True).
BiVAE_suite(input_df, False, True)

# Implementación GAN (basic proof of concept for automatic phrase suggestions)

## Discriminador

### Versión más básica (usar esta por mientras)

In [None]:
DROPOUT = Dropout(0.4)        # Empirical hyperparameter
discriminator = Sequential()
# Five identical hidden stages: a ReLU dense layer of width SAMPLE_LEN followed
# by the single shared dropout layer.
for hidden_stage in range(5):
    discriminator.add(Dense(SAMPLE_LEN, activation="relu"))
    discriminator.add(DROPOUT)
# One sigmoid unit as the output layer, trained with binary cross-entropy.
discriminator.add(Dense(1, activation = "sigmoid"))
discriminator.compile(
    optimizer = tf.keras.optimizers.Adam(0.001),
    loss = "binary_crossentropy",
    metrics = ["accuracy"],
)

In [None]:
# The discriminator is declared without an input shape, so it has no weights
# yet and summary() would fail. Build it first for rows of width SAMPLE_LEN,
# the width of the generator's final Dense layer.
if not discriminator.built:
    discriminator.build(input_shape=(None, SAMPLE_LEN))
discriminator.summary()

## Generador (choose one version and execute only that cell)

### Version 1 (más simple, peor en entrenamiento)

In [None]:
LEAKY_RELU = LeakyReLU(0.2)   # Empirical hyperparameter
# NOTE(review): `in_len` (length of one generator input row) must already be
# defined when this cell runs - confirm where it is set.
generator = Sequential([
    # Token ids -> dense vectors of size 2 * SAMPLE_LEN.
    tf.keras.layers.Embedding(2*SAMPLE_LEN**2, SAMPLE_LEN*2, input_length = in_len),
    tf.keras.layers.SpatialDropout1D(0.2),
    # The bidirectional LSTM returns only its final state (no return_sequences).
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(SAMPLE_LEN, dropout=0.05, recurrent_dropout=0.2)),
    tf.keras.layers.Activation('softmax'),
    Dense(512),
    LEAKY_RELU,
    # Output row of width SAMPLE_LEN with values in [-1, 1].
    Dense(SAMPLE_LEN, activation = "tanh"),
])
generator.compile(optimizer = "adam", loss = "mse", metrics = ["accuracy"])

### Version 2 (más compleja, captura más datos)

In [None]:
def loss_function(real, pred):
    """
    Sparse categorical cross-entropy that ignores padded positions.

    Positions where ``real`` equals 0 (the padding value) contribute a loss of
    zero, so the model is not rewarded for predicting the padding id. The
    result is the mean over all positions, with padded ones counted as zero.

    :param real: the true token ids
    :param pred: the predictions, given as logits
    :return: scalar masked loss
    """
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                                  reduction='none')
    per_position = cross_entropy(real, pred)
    # 1.0 where a real token is present, 0.0 on padding.
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_position.dtype)
    return tf.reduce_mean(per_position * keep)

In [None]:
LEAKY_RELU = LeakyReLU(0.2)   # Empirical hyperparameter
# NOTE(review): `in_len` (length of one generator input row) must already be
# defined when this cell runs - confirm where it is set.
_input = tf.keras.layers.Input(shape=[in_len], dtype='int32')
generate = tf.keras.layers.Embedding(
    2*SAMPLE_LEN**2, SAMPLE_LEN*2, input_length = in_len, mask_zero=True
)(_input)

# Three stacked stages of spatial dropout + bidirectional LSTM; every LSTM
# returns the full sequence so the attention layer can look at all steps.
for encoder_stage in range(3):
    generate = tf.keras.layers.SpatialDropout1D(0.2)(generate)
    generate = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(SAMPLE_LEN, dropout=0.05, recurrent_dropout=0.2, return_sequences=True)
    )(generate)

# Padding mask for the encoder output: True wherever the input id is not 0.
encoding_padding_mask = tf.math.logical_not(tf.math.equal(_input, 0))

# Self-attention: the same tensor is passed as query and value.
att = tf.keras.layers.Attention(use_scale=False, causal=True,
                                name='attention')(inputs=[generate, generate],
                                                    mask=[encoding_padding_mask,
                                                        encoding_padding_mask])

# Average over the time axis for both branches, then join them.
query_encoding = tf.keras.layers.GlobalAveragePooling1D()(generate)
att = tf.keras.layers.GlobalAveragePooling1D()(att)
RNN_attention_parallel = tf.keras.layers.Concatenate()([query_encoding, att])

# Output row of width SAMPLE_LEN with values in [-1, 1].
generator_tail = Dense(SAMPLE_LEN, activation = "tanh")(RNN_attention_parallel)

generator = tf.keras.Model(inputs=_input, outputs=generator_tail)
generator.compile(
    optimizer = tf.keras.optimizers.Adam(0.001),
    loss="categorical_crossentropy",
    metrics = ['sparse_categorical_accuracy'],
)

In [None]:
# Show the layers and parameter counts of whichever generator version was executed.
generator.summary()

### Entrenar / Train

In [None]:
# Freeze the discriminator BEFORE compiling the stacked model. Keras keeps the
# `trainable` values that were in force at compile time, so a stacked model
# compiled with a trainable discriminator would also update the discriminator
# during generator steps.
discriminator.trainable = False
gan = Sequential()
gan.add(generator)
gan.add(discriminator)
gan.compile(optimizer = "adam", loss="binary_crossentropy", metrics = ["accuracy"])
# Restore the flag so the stand-alone discriminator keeps training on its own batches.
discriminator.trainable = True

In [None]:
# Show the stacked generator + discriminator model.
gan.summary()

In [None]:
# Label vectors: 1 = real executive answer, 0 = generated answer.
ONES = np.ones((SAMPLE_SIZE))
ZEROS = np.zeros((SAMPLE_SIZE))

# NOTE(review): `input_data` (generator inputs) and `text_action` (real answers)
# must exist as module-level arrays with SAMPLE_SIZE rows each - confirm where
# they are created before running this cell.

print("epoch | dis. loss | dis. acc | gen. loss | gen. acc")
print("------+-----------+----------+-----------+----------")

# Run at least one step per epoch, even when the data set is smaller than a
# batch; otherwise the epoch report below would have nothing to print.
steps_per_epoch = max(1, SAMPLE_SIZE // BATCH)

for e in tqdm(range(EPOCHS)):
    for k in range(steps_per_epoch):
        # Draw BATCH row indices (with replacement) among ALL rows of the data set.
        n = randint(0, SAMPLE_SIZE, size = BATCH)
        # Discriminator step: real answers labelled 1, generated answers labelled 0.
        p = generator.predict(input_data[n])
        x = np.concatenate((text_action[n], p))
        y = np.concatenate((ONES[n], ZEROS[n]))
        d_result = discriminator.train_on_batch(x, y)
        # Generator step through the stacked model: the generator is trained to
        # make the discriminator answer "real" (1) for its outputs.
        discriminator.trainable = False
        g_result = gan.train_on_batch(input_data[n], ONES[n])
        discriminator.trainable = True
    # Report the metrics of the last step of the epoch.
    print(f" {e:04n} |  {d_result[0]:.5f}  |  {d_result[1]:.5f} |  {g_result[0]:.5f}  |  {g_result[1]:.5f}")