In [18]:
import pandas as pd
import numpy as np
from automation_process_dataV1 import united_functions
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
from scipy import sparse

In [2]:
# Load the training dataset from its parquet file.
train_parquet_path = '../../data/raw/data_model/dataset_process_trainV1.0.parquet'
df_ = pd.read_parquet(train_parquet_path)

In [3]:
df_processed = united_functions(df_)  # Project data-preprocessing script applied to the raw training frame

In [5]:
# Print a summary of the processed frame (columns, dtypes, memory usage).
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15839932 entries, 0 to 15839931
Data columns (total 15 columns):
 #   Column       Dtype  
---  ------       -----  
 0   userId       int64  
 1   movieId      int64  
 2   rating       float64
 3   title        object 
 4   genres       object 
 5   tagId        int64  
 6   relevance    float64
 7   tag          object 
 8   year_rate    int32  
 9   month_rate   int32  
 10  day_rate     int32  
 11  launch_year  int32  
 12  len_title    int64  
 13  main_genre   object 
 14  valoration   object 
dtypes: float64(2), int32(4), int64(4), object(5)
memory usage: 1.5+ GB


In [12]:
def extract_numerics_columns(X):
    """Return a DataFrame with only the numeric-dtype columns of ``X``.

    Column order is the one ``X`` already has; non-numeric columns are dropped.
    """
    return X.select_dtypes(include='number')

In [13]:
def extract_categorical_columns(X):
    """Return a DataFrame with only the non-numeric columns of ``X``.

    Every column whose dtype is numeric is excluded; the remaining columns
    keep their original order.
    """
    return X.select_dtypes(exclude='number')

In [14]:
# Scaling of the numeric variables
def numeric_normalize(X):
    """Standardise the numeric columns of ``X`` with a ``StandardScaler``.

    The numeric columns are selected with ``extract_numerics_columns`` and a
    brand-new scaler is fitted on them on every call, so the scaling
    statistics always come from the frame that is passed in.

    Returns:
        The ndarray produced by ``StandardScaler.fit_transform``.
    """
    numeric_part = extract_numerics_columns(X)
    return StandardScaler().fit_transform(numeric_part)

In [15]:
# Encoding of the categorical variables
def categorical_encoder(X):
    """Integer-encode every non-numeric column of ``X``.

    Each column selected by ``extract_categorical_columns`` is converted with
    ``pd.Categorical`` and replaced by its ``codes`` array. The codes are
    derived from the values present in ``X`` itself.

    Returns:
        dict mapping column name -> array of integer category codes.
    """
    non_numeric = extract_categorical_columns(X)
    # Iterating a DataFrame yields its column labels.
    return {name: pd.Categorical(X[name]).codes for name in non_numeric}

In [19]:
# Concatenate the encodings of the numeric and categorical values
def concat_types(X):
    """Build the model input matrix from a DataFrame.

    The numeric columns are standardised by ``numeric_normalize`` and the
    non-numeric columns are integer-encoded by ``categorical_encoder``; both
    parts are joined column-wise, numeric columns first.

    Args:
        X: DataFrame holding numeric and/or non-numeric columns.

    Returns:
        2-D ndarray with one row per row of ``X`` and one column per
        numeric column followed by one column per non-numeric column.
    """
    numeric = numeric_normalize(X)
    categorical = categorical_encoder(X)

    # Nothing to append when the frame has no non-numeric columns.
    if not categorical:
        return numeric

    # categorical_encoder returns a dict {column: codes}; np.concatenate cannot
    # join a dict with a 2-D array, so stack the code vectors as columns first
    # (dict order is the column order of X).
    categorical_matrix = np.column_stack(list(categorical.values()))

    return np.concatenate((numeric, categorical_matrix), axis=1)

In [20]:
X_processed=concat_types(df_processed) # Concatenation of the scaled numeric and encoded categorical features

Creación del modelo autoencoder V1

In [None]:
# Convert the numpy matrix to a float32 tensor
X_processed_tensor = tf.convert_to_tensor(X_processed, dtype=tf.float32)

Eliminar variables que ya no van a ser utilizadas (liberamos memoria)

In [None]:
# Drop the references to the large intermediates that are no longer needed,
# so their memory can be reclaimed.
del df_processed, X_processed, df_

In [6]:
# Tie weights between layers with Keras (custom layer)
class DenseTranspose(tf.keras.layers.Layer):
    """Layer that applies the transposed weights of another layer.

    The layer owns only a bias vector; the matrix it multiplies by is
    ``dense_layer.weights[0]``, transposed, so it is shared with
    ``dense_layer`` rather than learned separately.

    Args:
        dense_layer: Layer whose first weight is reused (transposed). It must
            already be built when this layer is built, because its
            ``input_shape`` is read in ``build``.
        activation: Name or callable resolved through
            ``tf.keras.activations.get``; ``None`` resolves to linear.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, dense_layer, activation=None, **kwargs):
        super().__init__(**kwargs)
        self.dense_layer = dense_layer
        self.activation = tf.keras.activations.get(activation)

    def build(self, batch_input_shape):
        """Create the bias vector, one entry per input feature of ``dense_layer``."""
        # add_weight documents ``shape`` as a shape tuple, so pass a 1-tuple
        # instead of a bare integer.
        self.biases = self.add_weight(name='bias',
                                      shape=(self.dense_layer.input_shape[-1],),
                                      initializer='zeros')
        super().build(batch_input_shape)

    def call(self, inputs):
        """Return ``activation(inputs @ W^T + biases)`` with ``W = dense_layer.weights[0]``."""
        # transpose_b=True multiplies by the transpose of the shared weight matrix.
        Z = tf.matmul(inputs, self.dense_layer.weights[0], transpose_b=True)
        return self.activation(Z + self.biases)

In [7]:
# Build the autoencoder: define its structure and network
# dense_1 and dense_2 are created once and shared: the encoder uses them
# directly and the decoder wraps them in DenseTranspose to reuse their weights.
dense_1 = tf.keras.layers.Dense(100, activation='relu')
dense_2 = tf.keras.layers.Dense(30, activation='relu')

# Encoder: input -> 100 units -> 30 units
tied_encoder = tf.keras.Sequential([
    tf.keras.layers.Flatten(),
    dense_1,
    dense_2
])

# Decoder: mirrors the encoder in reverse order and reshapes the output to
# the 15 features of the input data.
tied_decoder = tf.keras.Sequential([
    DenseTranspose(dense_2, activation='relu'),
    DenseTranspose(dense_1, activation='linear'),
    tf.keras.layers.Reshape((15,))
])

# Full autoencoder: encoder followed by decoder
tied_ae = tf.keras.Sequential([tied_encoder, tied_decoder])




In [8]:
# Compile the autoencoder model: Adam optimizer, mean squared error as loss
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
tied_ae.compile(optimizer=optimizer, loss='mse')

In [None]:
# The input data has shape (15839932, 15)
# Train the autoencoder model; the input tensor is also passed as the target
batch_size = 64
epochs = 40
tied_ae.fit(X_processed_tensor, X_processed_tensor, batch_size=batch_size, epochs=epochs)

In [27]:
# Save the model in SavedModel format
tied_ae.save('models/model_ae_V1.0', save_format='tf')

INFO:tensorflow:Assets written to: models/model_ae_V1.0\assets


INFO:tensorflow:Assets written to: models/model_ae_V1.0\assets


In [2]:
# Load the model back from the SavedModel directory
tied_ae = tf.keras.models.load_model('models/model_ae_V1.0')





Evaluar el modelo entrenado con el conjunto de test

In [4]:
# Load the test dataset from its parquet file.
test_parquet_path = '../../data/raw/data_model/dataset_process_testV1.0.parquet'
df_test = pd.read_parquet(test_parquet_path)

In [5]:
df_processed_test = united_functions(df_test)  # Project data-preprocessing script applied to the raw test frame

In [12]:
# NOTE(review): concat_types fits a new StandardScaler and derives new category
# codes from the frame it receives, so the test features are scaled/encoded
# from the test data itself rather than with the training statistics - confirm
# this is intended.
X_processed_test=concat_types(df_processed_test) # Concatenation of the scaled numeric and encoded categorical features

In [13]:
# Convert the numpy matrix to a float32 tensor
X_processed_test_tensor = tf.convert_to_tensor(X_processed_test, dtype=tf.float32)

In [14]:
# Evaluation: reconstruction loss (MSE) of the autoencoder on the test set.
# batch_size=1 makes Keras run one step per row; the loss is averaged over all
# samples, so a larger batch yields the same value (up to floating-point
# rounding) while evaluating far faster.
test_loss = tied_ae.evaluate(X_processed_test_tensor, X_processed_test_tensor, batch_size=1024)
print(f"Pérdida en el conjunto de prueba: {test_loss}")


Pérdida en el conjunto de prueba: 0.8477622866630554


El error (MSE) obtenido es de aproximadamente 0.85, más bajo que el de la baseline (1.07)