In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from gensim import downloader
from sklearn.metrics import mean_squared_error
import datetime

In [3]:
%load_ext nb_black

In [4]:
def vectorize(description, vectorizer):
    arr = description.split(" ")
    r = None
    for a in arr:
        try:
            if r is None:
                r = np.array([vectorizer[a]])
            else:
                r = np.vstack([r, vectorizer[a]])
        except KeyError:
            pass
    if r is None:
        return np.array([])
    return r


def padArr(arr, arr_size, tot_size):
    if len(arr) == 0:
        return np.zeros((tot_size, arr_size))
    for i in range(len(arr), tot_size):
        arr = np.vstack([arr, np.zeros(arr_size)])
    return arr


def padSer(ser, arr_size):
    tot_size = ser.apply(len).max()
    return ser.apply(lambda arr: padArr(arr, arr_size, tot_size))

In [5]:
word_2_vec = downloader.load("word2vec-google-news-300")

In [6]:
w2v_len = word_2_vec["hola"].shape[0]

In [7]:
train = catalog.load("train_contract_value_rnn")
cv = catalog.load("cv_contract_value_rnn")

In [8]:
inputs = keras.layers.Input(
    shape=(None, len(train.columns) + w2v_len - 3), dtype=tf.float64
)
x = keras.layers.Masking(
    mask_value=0.0,
    input_shape=(None, len(train.columns) + w2v_len - 3),
)(inputs)
x = keras.layers.LSTM(50, dropout=0.3, return_sequences=True)(x)
x = keras.layers.LSTM(100, dropout=0.3)(x)
x = keras.layers.Dense(10, activation="relu")(x)
outputs = keras.layers.Dense(1)(x)
model = keras.Model(inputs, outputs)

Metal device set to: Apple M1 Max


2022-08-13 23:52:16.872633: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-08-13 23:52:16.872823: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [9]:
model.compile(
    optimizer=keras.optimizers.RMSprop(),
    loss=keras.losses.MeanSquaredError(),
    metrics=[keras.metrics.RootMeanSquaredError()],
)

In [10]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, 327)]       0         
                                                                 
 masking (Masking)           (None, None, 327)         0         
                                                                 
 lstm (LSTM)                 (None, None, 50)          75600     
                                                                 
 lstm_1 (LSTM)               (None, 100)               60400     
                                                                 
 dense (Dense)               (None, 10)                1010      
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 137,021
Trainable params: 137,021
Non-trainable

In [11]:
num_epochs = 50
batch_size = 1000
num_batches = np.ceil(len(train) / batch_size)
num_batches_cv = np.ceil(len(cv) / batch_size)
w2v_len = word_2_vec["hola"].shape[0]

In [13]:
num_batches

In [12]:
rmses = []
rmses_cv = []
min_rmse = np.inf
cont = 0
for ep in range(num_epochs):
    for i in range(int(num_batches)):
        train_other = (
            train.drop(
                ["index", "full_contract_description", "log_valor_del_contrato"], axis=1
            )
            .iloc[i * batch_size : (i + 1) * batch_size]
            .values
        )
        train_text = (
            train["full_contract_description"]
            .iloc[i * batch_size : (i + 1) * batch_size]
            .apply(lambda x: vectorize(x, word_2_vec))
        )
        train_text = np.stack(padSer(train_text, w2v_len))
        x_train = np.concatenate(
            [
                train_text,
                np.tile(
                    train_other.reshape(
                        (train_other.shape[0], 1, train_other.shape[1])
                    ),
                    (1, train_text.shape[1], 1),
                ),
            ],
            axis=2,
        )
        y_train = (
            train["log_valor_del_contrato"]
            .iloc[i * batch_size : (i + 1) * batch_size]
            .values
        )
        rmses += [
            model.fit(x_train, y_train, verbose=0).history["root_mean_squared_error"][0]
        ]

        if i % 20 == 0:
            print(
                f"Ep:{ep} - it:{i} - rmse:{np.mean(rmses[-20:])} - {datetime.datetime.today().strftime('%H:%M:%S')}"
            )

    cv_pred = np.array([])
    print(f"Ep:{ep} - {datetime.datetime.today().strftime('%H:%M:%S')}")
    for i in range(int(num_batches_cv)):
        cv_other = (
            cv.drop(
                ["index", "full_contract_description", "log_valor_del_contrato"], axis=1
            )
            .iloc[i * batch_size : (i + 1) * batch_size]
            .values
        )
        cv_text = (
            cv["full_contract_description"]
            .iloc[i * batch_size : (i + 1) * batch_size]
            .apply(lambda x: vectorize(x, word_2_vec))
        )
        cv_text = np.stack(padSer(cv_text, w2v_len))
        x_cv = np.concatenate(
            [
                cv_text,
                np.tile(
                    cv_other.reshape((cv_other.shape[0], 1, cv_other.shape[1])),
                    (1, cv_text.shape[1], 1),
                ),
            ],
            axis=2,
        )
        if len(cv_pred) > 0:
            cv_pred = np.concatenate([cv_pred, model(x_cv)])
        else:
            cv_pred = model(x_cv)

    y_cv = cv["log_valor_del_contrato"].values
    rmses_cv += [mean_squared_error(y_cv, cv_pred)]
    print(f"Ep:{ep} - cv rmse:{rmses_cv[-1]}")
    if rmses_cv[-1] < min_rmse:
        min_rmse = rmses_cv[-1]
        cont = 0
        model.save("/Volumes/TOSHIBA EXT/Secop/data/06_models/rnn")
    else:
        cont += 1
    if cont > 8:
        break

2022-08-13 23:52:36.860023: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-08-13 23:52:39.542228: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-13 23:52:40.257123: W tensorflow/core/common_runtime/forward_type_inference.cc:231] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_LEGACY_VARIANT
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	while inferring type of node 'cond_40/output/_24'
2022-08-13 23:52:40.261307: I tensorflow/core/grappler/optimizers/cu

Ep:0 - it:0 - rmse:8.580056190490723 - 23:52:42


2022-08-13 23:52:43.694358: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-13 23:52:44.383848: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-13 23:52:44.602695: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-13 23:52:44.844890: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-13 23:52:45.116454: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Ep:0 - it:20 - rmse:1.2949955463409424 - 23:53:05
Ep:0 - 23:53:05
Ep:0 - cv rmse:1.1970998048782349


Ep:1 - it:0 - rmse:1.2871843457221985 - 23:53:16
Ep:1 - it:20 - rmse:1.246018075942993 - 23:53:31
Ep:1 - 23:53:31
Ep:1 - cv rmse:0.9838765859603882
Ep:2 - it:0 - rmse:1.2383159279823304 - 23:53:32
