# 3.1.2 Classifier Model Training

Use TensorFlow to fit a dense neural network classifier to the data.

## Load packages and data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import re
import shutil
import string
import tensorflow as tf
import keras_tuner

2024-02-10 20:01:40.237931: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the text arrays and label vectors for the train / test / validation splits.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays; only use it
# on archives from a trusted source.
with np.load('2.1.2.TrainTestValData.npz', allow_pickle=True) as data:
    # Text arrays, one per split.
    train_ds_import, test_ds_import, val_ds_import = (
        data[split] for split in ('train', 'test', 'val')
    )
    # Matching label vectors, cast to int32.
    train_labels, test_labels, val_labels = (
        data[f'{split}_targets'].astype(np.int32) for split in ('train', 'test', 'val')
    )


In [4]:
# Display the shapes of the training text array and its label vector.
train_ds_import.shape, train_labels.shape

((347, 1), (347,))

In [5]:
# Peek at the first ten training labels.
train_labels[:10]

array([0, 0, 0, 0, 1, 0, 1, 1, 1, 0], dtype=int32)

In [6]:
# Pair each split's text array with its labels as a tf.data pipeline of
# (text, label) elements.
raw_train_ds, raw_test_ds, raw_val_ds = (
    tf.data.Dataset.from_tensor_slices(text_and_labels)
    for text_and_labels in (
        (train_ds_import, train_labels),
        (test_ds_import, test_labels),
        (val_ds_import, val_labels),
    )
)

2024-02-10 20:08:02.851279: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-10 20:08:02.907338: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [7]:
batch_size = 10
buffer_size = 10000

# NOTE: this cell rebinds raw_train_ds / raw_test_ds / raw_val_ds, so re-running it
# without first re-running the cell that creates them batches already-batched data.

# Training data: shuffle, then group into mini-batches of `batch_size`.
shuffled_train = raw_train_ds.shuffle(buffer_size)
raw_train_ds = shuffled_train.batch(batch_size)

# Test / validation data: no shuffling; each split becomes a single batch
# containing every row of that split.
n_test_rows = test_ds_import.shape[0]
n_val_rows = val_ds_import.shape[0]
raw_test_ds = raw_test_ds.batch(n_test_rows)
raw_val_ds = raw_val_ds.batch(n_val_rows)

## Remove punctuation and tokenize the data

In [8]:
# Show the first abstract and label of one training batch.
text_batch, label_batch = next(iter(raw_train_ds.take(1)))
batch_texts = text_batch.numpy()
batch_labels = label_batch.numpy()
print("TA", batch_texts[0])
print("Label", batch_labels[0])

TA [b'Hydroxy functional alkyl polyurea crosslinkers \rA hydroxy functional alkyl polyurea is disclosed having the formula presented in claim 1, wherein R comprises an isocyanurate moiety, biuret moiety, allophonate moiety, glycoluril moiety, benzoguanamine moiety, polyetheramine moiety, and/or polymeric moiety different from a polyetheramine and having an Mn of 500 or greater; wherein each RI is independently a hydrogen, alkyl having at least 1 carbon, or a hydroxy functional alkyl having 2 or more carbons and at least one R1 is a hydroxy functional alkyl having 2 or more carbons; and n is 2-6. Further disclosed is a coating comprising: a film-forming resin; and a hydroxy functional alkyl polyurea crosslinker having the formula presented in claim 4, wherein R2 is a substituted or unsubstituted C1 to C36 alkyl group, an aromatic group, an isocyanurate moiety, biuret moiety, allophonate moiety, glycoluril moiety, benzoguanamine moiety, polyetheramine moiety, and/or polymeric moiety diff

In [9]:
def custom_standardization(input_data):
    """Normalize raw text before tokenization.

    Lowercases `input_data`, replaces every carriage return with a space,
    then deletes every character listed in `string.punctuation`.
    Returns the cleaned string tensor.
    """
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '\r', ' ')
    text = tf.strings.regex_replace(text, punctuation_class, '')
    return text

In [10]:
# Show the first raw training abstract, its label, and the standardized text.
# (The sample comes straight from the imported arrays, so no batch needs to be
# pulled from the shuffled dataset here.)
first_patent, first_label = train_ds_import[0], train_labels[0]
print("TA", first_patent)
print("Label", first_label)
print("Standardized TA", custom_standardization(first_patent))

TA ['Formation of coating film \rPURPOSE:To obtain a coating film excellent in smoothness, gloss and image clarity with a shortened coating stage in the 2-coat-1-bake method by a cationic electrodeposition coating and an org.-solvent coating by using an electrodeposition coating specified with the loss of coating film when heated and hardened. CONSTITUTION:A cationic electrodeposition coating A is applied to form a coating film, the surface of the unhardened film is coated with an org.-solvent coating, and both coating films are simultaneously hardened to form a double- layer coating film. The coating A with the coating film loss X controlled to 10wt.% when the dehydrated electrodeposition coating film is heated and hardened is used in this method. The electrodeposited film is not bled and the film is smoothly hardened by this method, and the desired coating film is obtained. The loss X is obtained by performing cationic electrodeposition under ordinary conditions, pulling up the obtai

In [11]:
max_features = 10000
sequence_length = 500

# Text-to-integer layer: applies custom_standardization, limits the vocabulary to
# `max_features` tokens, and emits integer sequences of length `sequence_length`.
vectorizer_options = dict(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length,
)
vectorize_layer = tf.keras.layers.TextVectorization(**vectorizer_options)

In [12]:
# Build the vocabulary from the training text only: drop the labels, then adapt.
def _text_only(text, label):
    """Return just the text component of a (text, label) element."""
    return text


train_text = raw_train_ds.map(_text_only)
vectorize_layer.adapt(train_text)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [15]:
# Inspect a slice (entries 40-79) of the vocabulary learned by the vectorizer.
vocab = vectorize_layer.get_vocabulary()
vocab[40:80]

['acid',
 'are',
 'which',
 'groups',
 'be',
 'parts',
 'on',
 'containing',
 'coated',
 'has',
 'comprises',
 'group',
 'wherein',
 'can',
 'aqueous',
 'relates',
 'metal',
 'corrosion',
 'primer',
 'material',
 'more',
 'c',
 'curable',
 'thereof',
 'functional',
 'surface',
 'excellent',
 'amine',
 'adhesion',
 '1',
 'such',
 'layer',
 'anticorrosive',
 'andor',
 'hydroxyl',
 'first',
 'applied',
 'substrates',
 'coatings',
 'forming']

In [24]:
def vectorize_text(text, label):
  """Map a (text, label) pair to (token ids, label) using `vectorize_layer`."""
  token_ids = vectorize_layer(text)
  return token_ids, label

In [25]:
# Pull one batch from the training set and show its first abstract, that
# abstract's label, and the result of vectorizing the pair.
text_batch, label_batch = next(iter(raw_train_ds))
first_patent = text_batch[0]
first_label = label_batch[0]
print("TA", first_patent)
print("Label", first_label)
vectorized_pair = vectorize_text(first_patent, first_label)
print("Vectorized TA", vectorized_pair)

TA tf.Tensor([b'Compositions of epoxy curing agent incorporating naphthol and naphthol derivatives \rThe present invention relates to epoxy curing agent compositions comprising naphthol and naphthol derivatives in combination with at least one polyamine having three or more active amine hydrogens, and use of these curing agents as hardener for epoxy resins. These curing agent compositions may be used to cure, harden and/or crosslink an epoxy resin.\r'], shape=(1,), dtype=string)
Label tf.Tensor(0, shape=(), dtype=int32)
Vectorized TA (<tf.Tensor: shape=(1, 500), dtype=int64, numpy=
array([[  32,    5,   12,   30,   19,  676,  986,    4,  986,  692,    3,
          34,   24,   55,    7,   12,   30,   19,   32,   28,  986,    4,
         986,  692,   13,  302,   17,   16,   21,   25,  188,   22,  478,
          14,   60,  277,   67, 1644,    4,   95,    5,  583,   30,  353,
          26,  607,   15,   12,  219,  583,   30,   19,   32,  111,   44,
         106,    7,  276, 2724,   73, 129

In [26]:
# Apply the text vectorizer to every split so each element becomes (token ids, label).
train_ds, val_ds, test_ds = (
    raw_split.map(vectorize_text)
    for raw_split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

In [27]:
# Inspect the first vectorized example (and its label) of one training batch.
text_batch, label_batch = next(iter(train_ds))
first_review = text_batch[0]
first_label = label_batch[0]
print("Review", first_review)
print(first_label)

Review tf.Tensor(
[ 461  619  131    6    4   81   23   63    3   24  232    2  461  619
  131    6    4    2   81   23   63    3  461  619  131    6   50    8
    2   20    4    2   29   20   52    3  343  159    5    3    2   20
  333  493   40  143  140   10  522   19  693   19  505   19  476  854
 1854   96  167  135 2147    4   97   52    3   20   29   11   30   19
    3  121   86    5    3  461  619  131    6  122   18    3   24   53
  486   60  112  637    3  461  619  131    6   49    3 1772    5   99
  121   86    4    3  288    5    2   27   98   18 1558  397   46    2
   56   33   53  486 3571  988  453  847    3  491  370    5    3   35
   53  486   60  112  578 1228    4    3  535    5    3  619   35   11
 1564  163    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    

In [28]:
AUTOTUNE = tf.data.AUTOTUNE

# The training pipeline was shuffled and batched before vectorization, so caching
# it directly would store one fixed shuffle and replay the exact same batches in
# the same order every epoch. Cache the individual vectorized examples instead,
# then shuffle and re-batch *after* the cache so every epoch sees a fresh order.
train_ds = (
    train_ds.unbatch()
    .cache()
    .shuffle(buffer_size)
    .batch(batch_size)
    .prefetch(buffer_size=AUTOTUNE)
)

# Validation and test data are unshuffled single batches; caching them as-is is fine.
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)


## Tune hyperparameters of the neural network using Keras Tuner

@misc{omalley2019kerastuner,
    title        = {KerasTuner},
    author       = {O'Malley, Tom and Bursztein, Elie and Long, James and Chollet, Fran\c{c}ois and Jin, Haifeng and Invernizzi, Luca and others},
    year         = 2019,
    howpublished = {\url{https://github.com/keras-team/keras-tuner}}
}

In [29]:
def build_model(hp):
    """Build and compile one candidate classifier for KerasTuner.

    Architecture: Embedding -> optional Dropout(0.5) -> GlobalAveragePooling1D
    -> 1 to 3 Dense layers -> optional Dropout(0.5) -> Dense(1).
    The final layer has no activation, so the model outputs a logit; the loss
    (`from_logits=True`) and the accuracy threshold (0.0) are set accordingly.

    Args:
        hp: hyperparameter container supplied by KerasTuner. The following
            hyperparameters are registered on it: "embedding_dim", "dropout_1",
            "num_layers", "units_{i}", "activation_{i}", "dropout_2", "lr".

    Returns:
        A compiled `tf.keras.Sequential` model.
    """
    model = tf.keras.Sequential()

    # Token embedding; vocabulary size comes from the module-level `max_features`.
    embedding_dim = hp.Int("embedding_dim", min_value=16, max_value=128, step=16)
    model.add(tf.keras.layers.Embedding(input_dim=max_features, output_dim=embedding_dim))
    if hp.Boolean("dropout_1"):
        model.add(tf.keras.layers.Dropout(rate=0.5))
    model.add(tf.keras.layers.GlobalAveragePooling1D())

    # Tune the number of hidden layers; units and activation are tuned per layer.
    for i in range(hp.Int("num_layers", 1, 3)):
        model.add(
            tf.keras.layers.Dense(
                units=hp.Int(f"units_{i}", min_value=32, max_value=512, step=32),
                activation=hp.Choice(f"activation_{i}", ["relu", "tanh", "sigmoid"]),
            )
        )
    if hp.Boolean("dropout_2"):
        model.add(tf.keras.layers.Dropout(rate=0.5))

    # Single-unit output with no activation (logit).
    model.add(tf.keras.layers.Dense(1))

    learning_rate = hp.Float("lr", min_value=1e-3, max_value=1e-2, sampling="log")
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        # Pass metrics as a list: newer Keras versions reject a bare metric object.
        # threshold=0.0 because predictions are logits, not probabilities.
        metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0)],
    )
    return model

# Early-stopping callback: watches validation binary accuracy with a patience
# of 5 and restores the best weights when training stops.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_binary_accuracy',
    patience=5,
    restore_best_weights=True,
)

# Smoke test: build one model from a fresh HyperParameters container.
default_hp = keras_tuner.HyperParameters()
build_model(default_hp)


<keras.engine.sequential.Sequential at 0x7f1c4c78ee30>

In [30]:
# Random-search tuner over the space defined in build_model: up to 100 trials,
# one training run per trial, ranked by validation binary accuracy.
# NOTE(review): overwrite=True discards any earlier results stored under
# directory/project_name, so re-running this notebook repeats the whole search.
# NOTE(review): no seed is passed, so the sampled trials differ between runs.
tuner = keras_tuner.RandomSearch(
    hypermodel=build_model,
    objective="val_binary_accuracy",
    max_trials=100,
    executions_per_trial=1,
    overwrite=True,
    directory="3.1.2.Classifier",
    project_name="Article_classifier",
)

In [31]:
# Run the hyperparameter search: each trial trains for at most 30 epochs on
# train_ds, validates on val_ds, and uses the early_stopping callback.
tuner.search(train_ds, 
             epochs=30, 
             validation_data=val_ds,
             callbacks=[early_stopping]
            )

Trial 100 Complete [00h 02m 21s]
val_binary_accuracy: 0.7631579041481018

Best val_binary_accuracy So Far: 0.7894737124443054
Total elapsed time: 04h 07m 12s


## Examine the test accuracy of the top 10 models

In [34]:
# Evaluate the ten best models from the search on the test set.
# NOTE(review): choosing a model based on these numbers uses the test set for
# model selection, so the chosen model's test accuracy is an optimistic estimate.
models = tuner.get_best_models(num_models=10)
for rank, candidate in enumerate(models):
    loss, accuracy = candidate.evaluate(test_ds)
    print(f'Model {rank} accuracy:', accuracy)

Model 0 accuracy: 0.7209302186965942
Model 1 accuracy: 0.7674418687820435
Model 2 accuracy: 0.8372092843055725
Model 3 accuracy: 0.7441860437393188
Model 4 accuracy: 0.7674418687820435
Model 5 accuracy: 0.7674418687820435
Model 6 accuracy: 0.7906976938247681
Model 7 accuracy: 0.7209302186965942
Model 8 accuracy: 0.6976743936538696
Model 9 accuracy: 0.7209302186965942


## Models 2 and 6 have ~80% test accuracy; package them and test them on sample text

In [38]:
def _with_sigmoid_head(logit_model):
    """Wrap a logit-output model so it emits probabilities, and compile it.

    Args:
        logit_model: a trained model whose output is a raw logit.

    Returns:
        A compiled `tf.keras.Sequential` made of `logit_model` followed by a
        sigmoid activation. The loss uses `from_logits=False` because the
        wrapped model now outputs probabilities.
    """
    wrapped = tf.keras.Sequential([
        logit_model,
        tf.keras.layers.Activation('sigmoid'),
    ])
    wrapped.compile(
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        optimizer="adam",
        metrics=['accuracy'],
    )
    return wrapped


# Package the two selected models (ranks 2 and 6) for export.
export_model = _with_sigmoid_head(models[2])
export_model_6 = _with_sigmoid_head(models[6])

In [41]:
# Short example phrases used to sanity-check the packaged models.
examples = [
    'fast curing direct-to-metal corrosion resistant primer',
    'powder coating composition for metal substrates',
    'clear coat with high gloss and abrasion resistance',
    'electrodeposition coating',
]

# Convert the phrases to integer token sequences with the fitted vectorizer.
examples_vector = vectorize_layer(examples)

In [42]:
# Show the vectorized example phrases.
print(examples_vector)

tf.Tensor(
[[394  30 523 ...   0   0   0]
 [ 90   6   9 ...   0   0   0]
 [216 202  17 ...   0   0   0]
 [193   6   0 ...   0   0   0]], shape=(4, 500), dtype=int64)


In [43]:
# Predict on the example phrases with both packaged models (outputs pass through
# the sigmoid activation added when the models were wrapped).
export_model.predict(examples_vector), export_model_6.predict(examples_vector)



(array([[0.6304569 ],
        [0.44846344],
        [0.45617196],
        [0.41814595]], dtype=float32),
 array([[0.8257912 ],
        [0.52622527],
        [0.54823023],
        [0.4800891 ]], dtype=float32))

## Save the models, the vectorization layer, and the vocabulary

In [44]:
# Persist both packaged models to .keras files.
for packaged_model, filename in (
    (export_model, "3.1.2.Model2.keras"),
    (export_model_6, '3.1.2.Model6.keras'),
):
    packaged_model.save(filename)

In [45]:
import pickle

In [46]:
# Persist the vectorization layer as its config plus its weights.
# NOTE(review): the layer was built with standardize=custom_standardization; if
# get_config() includes that callable, pickle will store it by reference and
# loading will require the same function to be defined — confirm before relying
# on this file from another script.
vectorizer_state = {
    'config': vectorize_layer.get_config(),
    'weights': vectorize_layer.get_weights(),
}

# Use a context manager so the file handle is flushed and closed deterministically.
with open("3.1.2ClassifierVectorizeLayer.pkl", "wb") as pkl_file:
    pickle.dump(vectorizer_state, pkl_file)

In [47]:
# Write the vectorizer's vocabulary to disk, one row per token, with neither an
# index column nor a header row.
vocab_tokens = vectorize_layer.get_vocabulary()
vocab = pd.DataFrame(vocab_tokens)
vocab.to_csv('3.1.2.vocab.txt', header=False, index=False)