In [1]:
import tensorflow as tf
import keras_nlp
import pandas as pd
import numpy as np

2023-08-06 21:54:14.735893: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-06 21:54:14.762809: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Define classifier & model hyperparameters:

In [2]:
model_preset = "gpt2_base_en"
model_penalty = 0.8
clf_vocab_size = 32000
clf_maxlen = 200

Load GPT2 model

In [3]:
gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(model_preset)
tokenizer = keras_nlp.models.GPT2Tokenizer.from_preset(model_preset)

2023-08-06 21:54:16.158251: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-06 21:54:16.172698: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-06 21:54:16.172812: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf



Read the dataset and extract text data from it

In [4]:
data = pd.read_csv("datasets/original.csv")
inputs = data.comment_text

This dataset contains float toxicity scores, but only 0 and 1 are needed to train the classifier, and its necessary to split toxicity values into two columns

In [5]:
conditions = [
    (data['severe_toxicity'] == 0), 
    (data['severe_toxicity'] > 0.0001),
    ]

val = [1, 0]

data['not_toxic'] = np.select(conditions, val)

Join toxicity columns in one dataframe

In [6]:
t = data.pop("severe_toxicity")
nt = data.pop("not_toxic")
targets = pd.DataFrame(t).join(pd.DataFrame(nt))

Replace the remaining values with 1

In [7]:
targets["severe_toxicity"].loc[targets["severe_toxicity"] > 0.0001] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targets["severe_toxicity"].loc[targets["severe_toxicity"] > 0.0001] = 1


In [8]:
targets = targets.astype(int)

In [9]:
targets.head()

Unnamed: 0,severe_toxicity,not_toxic
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0


Define classifier model:

In [10]:
vectorize_layer = tf.keras.layers.TextVectorization(
 max_tokens=clf_vocab_size,
 output_mode='int',
 output_sequence_length=clf_maxlen)

In [11]:
vectorize_layer.adapt(inputs[:2000])

In [12]:
classifier = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(1,), dtype=tf.string),
    vectorize_layer,
    tf.keras.layers.Embedding(input_dim=clf_vocab_size+1, output_dim=256),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Conv1D(256, 5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(2, activation='sigmoid')
])

In [13]:
classifier.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 200)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 200, 256)          8192256   
                                                                 
 dropout (Dropout)           (None, 200, 256)          0         
                                                                 
 conv1d (Conv1D)             (None, 196, 256)          327936    
                                                                 
 global_max_pooling1d (Globa  (None, 256)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout_1 (Dropout)         (None, 256)               0

In [14]:
classifier.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), loss=tf.keras.losses.BinaryCrossentropy(from_logits=False))

In [15]:
classifier.fit(inputs[:20000], targets[:20000], epochs=12)

Epoch 1/12


2023-08-06 21:54:26.238503: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600
2023-08-06 21:54:26.366047: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-08-06 21:54:26.368812: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f41ae0d4a30 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-08-06 21:54:26.368826: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2023-08-06 21:54:26.371383: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-08-06 21:54:26.444668: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the p

Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f45b0134d90>

Test classifier

In [16]:
p = classifier.predict(tf.Variable(["Hello, world!"]))
if p[0][0] > p[0][1]:
    print("toxic")
else:
    print("normal")

normal


2023-08-06 21:55:51.617066: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]


Define word-wise crossentropy

In [17]:
@tf.function
def word_loss(x,y,p=None):
    x = x[0][y]
    if p is not None:
        return -tf.math.log(tf.nn.softmax(x)[y]) + p #Add punishment value, if its given
    return -tf.math.log(tf.nn.softmax(x)[y])

Execute classifier on a given sentence to get its toxicity score

In [18]:
@tf.function
def get_score(sentence):
    result = classifier(tf.Variable([sentence]))
    if result[0][0] > result[0][1]:
        return 0 #0 = toxic
    return 1 #1 = normative

Compute model sentence loss considering score of classifier

In [19]:
@tf.function
def sentence_loss(score, x, y, penalty, i):
    score = tf.cast(score, tf.float32)
    mask = tf.range(start=0, limit=y.shape[1], delta=1)
    
    pd = 1 - i * 0.05 #Value to control penalty
    
    v = tf.map_fn(lambda j: word_loss(x, j), mask, dtype=tf.float32)
    out = 1 / y.shape[1] * tf.reduce_sum(v)
    
    #Punishment function
    pe = tf.cast(penalty, tf.float32) * tf.cast(pd, tf.float32) * (1.0 - score) * tf.cast(out, tf.float32)
    
    #Sentence loss
    l = tf.map_fn(lambda j: word_loss(x, j, pe), mask, dtype=tf.float32)
    out2 = 1 / y.shape[1] * tf.reduce_sum(l)
    
    return out2


Define optimizer and training function

In [20]:
optimizer = tf.keras.optimizers.AdamW(learning_rate=4e-5, epsilon=1e-7, weight_decay=0.0)

In [21]:
@tf.function
def train_step(sentence, step_n):
    sl = len(sentence)
    s2 = gpt2_lm.generate(sentence, max_length=60) #Generate continuation sentence
    
    if len(sentence) >= len(s2):
        result = sentence
    else:
        result = s2[sl+1:]
    
    score = get_score(result) #Get classifier score
    
    if result:
        with tf.GradientTape() as tape:
            result = tf.expand_dims(tokenizer.tokenize(result),0)

            mask = np.not_equal(result, 0) #Create padding mask
            logits = gpt2_lm([result, mask])

            loss = sentence_loss(score, logits, result, model_penalty, step_n)

        grads = tape.gradient(loss, gpt2_lm.trainable_variables)
        optimizer.apply_gradients(zip(grads, gpt2_lm.trainable_variables))

        return loss, score

In [22]:
epochs = 10
c = 1

In [23]:
for i in range(1,epochs+1):
    step = 1
    for sentence in inputs[0:400]:
        loss, score = train_step(sentence, c)
        print("Epoch: "+str(i)+" step: "+str(step)+" loss: "+str(loss.numpy()) + " score: "+str(score) + " ", end='\r')
        c = c + 1
        step = step + 1



Epoch: 1 step: 82 loss: 2.1192818 score: 1  

2023-08-06 21:58:23.818283: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 40.07MiB (rounded to 42014976)requested by op StridedSliceGrad
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-08-06 21:58:23.818578: I tensorflow/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2023-08-06 21:58:23.818604: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 2828, Chunks in use: 2828. 707.0KiB allocated for chunks. 707.0KiB in use in bin. 21.7KiB client-requested in use in bin.
2023-08-06 21:58:23.818621: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 42, Chunks in use: 41. 22.8KiB allocated for chunks. 22.0KiB in use in bin. 16.0KiB client-requested in use in bin.
2023-08-06 21:58:23.818637: I tensorflo

ResourceExhaustedError: {{function_node __wrapped__StridedSliceGrad_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[209,50257] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:StridedSliceGrad]

In [None]:
gpt2_lm.generate("hello", max_length=60)