In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from gensim import downloader
import datetime

In [3]:
%load_ext nb_black

In [4]:
def vectorize(description, vectorizer):
    arr = description.split(" ")
    r = None
    for a in arr:
        try:
            if r is None:
                r = np.array([vectorizer[a]])
            else:
                r = np.vstack([r, vectorizer[a]])
        except KeyError:
            pass
    if r is None:
        return np.array([])
    return r


def padArr(arr, arr_size, tot_size):
    if len(arr) == 0:
        return np.zeros((tot_size, arr_size))
    for i in range(len(arr), tot_size):
        arr = np.vstack([arr, np.zeros(arr_size)])
    return arr


In [5]:
word_2_vec = downloader.load("word2vec-google-news-300")

In [6]:
w2v_len = word_2_vec["hola"].shape[0]

In [7]:
train = catalog.load("train_contract_value_rnn")
cv = catalog.load("cv_contract_value_rnn")

In [8]:
class ContractDataset:
    def __init__(
        self,
        df,
        vectorizer,
    ):
        self.df = df
        self.vectorizer = vectorizer
        self.vec_len = vectorizer["hola"].shape[0]
        self.max_len = df["full_contract_description"].apply(len).max()
        self.size = len(df)
        self.cols = [
            c
            for c in self.df.columns
            if c
            not in [
                "index",
                "full_contract_description",
                "log_valor_del_contrato",
            ]
        ]

    def __len__(self):
        return self.size

    def __getitem__(self, idx):

        features_other = (self.df.iloc[idx][self.cols]).values
        features_text = vectorize(
            self.df.iloc[idx]["full_contract_description"], self.vectorizer
        )
        features_text = padArr(features_text, self.vec_len, self.max_len)
        return (
            np.concatenate(
                [
                    np.tile(
                        features_other.reshape(1, len(features_other)),
                        (features_text.shape[0], 1),
                    ),
                    features_text,
                ],
                axis=1,
            ),
            self.df.iloc[idx]["log_valor_del_contrato"],
        )

    def __call__(self):
        for i in range(self.__len__()):
            yield self.__getitem__(i)

In [9]:
train_generator = ContractDataset(train, word_2_vec)
cv_generator = ContractDataset(cv, word_2_vec)

In [10]:
batch_size = 500

In [11]:
ot = (tf.float32, tf.float32)
os = (
    tf.TensorShape([None, len(train_generator.cols) + train_generator.vec_len]),
    tf.TensorShape([]),
)

In [12]:
train_ds = tf.data.Dataset.from_generator(
    train_generator, output_types=ot, output_shapes=os
).batch(batch_size)
cv_ds = tf.data.Dataset.from_generator(
    cv_generator, output_types=ot, output_shapes=os
).batch(batch_size)

Metal device set to: Apple M1 Max


2022-08-13 20:58:37.699736: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-08-13 20:58:37.700230: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [13]:
for x, y in train_ds:
    break

2022-08-13 20:58:45.808006: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [15]:
x.shape

In [16]:
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=20, restore_best_weights=True
)

In [24]:
n_out = np.random.choice([10, 20, 50, 100])
dropout = np.random.random() * 0.8
# Model
inputs = keras.layers.Input(
    shape=(None, len(train_generator.cols) + train_generator.vec_len), dtype=tf.float64
)
x = keras.layers.Masking(
    mask_value=0.0,
    input_shape=(None, len(train_generator.cols) + train_generator.vec_len),
)(inputs)
x = keras.layers.LSTM(n_out, dropout=dropout)(x)
x = keras.layers.Dense(10, activation="sigmoid")(x)
outputs = keras.layers.Dense(1)(x)
model = keras.Model(inputs, outputs)
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.MeanSquaredError(),
    metrics=[keras.metrics.RootMeanSquaredError()],
)

In [None]:
model.fit(
            train_ds,
            validation_data=cv_ds,
            epochs=1000,
            callbacks=[early_stopping],)

Epoch 1/1000


2022-08-13 20:37:25.218592: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-08-13 20:37:26.721335: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-13 20:37:42.245662: W tensorflow/core/common_runtime/forward_type_inference.cc:231] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_LEGACY_VARIANT
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	while inferring type of node 'cond_40/output/_19'
2022-08-13 20:37:42.249957: I tensorflow/core/grappler/optimizers/cu

     66/Unknown - 1130s 17s/step - loss: 223.2881 - root_mean_squared_error: 14.9428

In [None]:
n_cv = 2
best_rmse = np.inf
best_model = None
fit_hists = []
for i in range(n_cv):
    print(
        f"Iteration {i+1} - {n_cv}: CV - {datetime.datetime.today().strftime('%H:%M:%S')}"
    )
    n_out = np.random.choice([10, 20, 50, 100])
    dropout = np.random.random() * 0.8
    # Model
    inputs = keras.layers.Input(shape=(None, len(train_generator.cols) + train_generator.vec_len), dtype=tf.float64)
    x = keras.layers.Masking(mask_value=0.0, input_shape=(None, len(train_generator.cols) + train_generator.vec_len))(inputs)
    x = keras.layers.LSTM(n_out, dropout=dropout)(x)
    x = keras.layers.Dense(10, activation="sigmoid")(x)
    outputs = keras.layers.Dense(1)(x)
    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.MeanSquaredError(),
        metrics=[keras.metrics.RootMeanSquaredError()],
    )
    # Fit
    fit_hists += [
        model.fit(
            train_ds,
            validation_data=cv_ds,
            epochs=1000,
            callbacks=[early_stopping],
            verbose=0,
        )
    ]
    cv_rmse = fit_hists[-1]["val_loss"][-1]
    if best_rmse > cv_rmse:
        print(
            f"Iteration {i+1}, n_out: {n_out}, dropout: {dropout} - F1 score :{f1_cv}"
        )
        best_rmse = cv_rmse
        best_model = keras.models.clone_model(model)
        model.save("./RNN_CV")

Iteration 1 - 2: CV - 20:33:51


2022-08-13 20:33:51.626938: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-08-13 20:33:53.103716: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-13 20:34:08.600439: W tensorflow/core/common_runtime/forward_type_inference.cc:231] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_INT32
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	while inferring type of node 'cond_40/output/_23'
2022-08-13 20:34:08.605000: I tensorflow/core/grappler/optimizers/custom_grap