# MiniGPT For Generating Synthetic Text Data

by Kris Smith

# ***WARNING*** 

## The data required to train the model for this task is known to be vulgar, offensive, toxic, racist, and otherwise not pleasant.

## Problem Statement

Toxic comments online come in many forms and in many arenas. There are currently several ways to mitigate these comments (for those organizations that wish to do so). These include employing human moderators and training machine learning models to detect toxicity in online comments.

The issue with human moderators is that some of these platforms have grown so large so quickly that there are not nearly enough moderators to achieve any sense of control over most of these comments. The sheer volume of toxicity and bots online makes it unrealistic to think we could do this job with humans at this point.

Many companies are employing machine learning to assist with identifying toxic comments online automatically. The problem with this approach is the lack of labeled training data to train the models on.

This is the problem I am going to solve using generative deep learning techniques. 

## Data

The data I will be using to train the generative model was released on Kaggle as part of an ongoing series of competitions sponsored by the [Google company Jigsaw](https://en.wikipedia.org/wiki/Jigsaw_(company)).

The data consists of online comments with various levels of toxicity severity.

## Setup

In [1]:
import os
import string
import random
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


## Implement a Transformer block as a layer

In [2]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """
    Build a causal (look-back only) mask for self attention.

    Entry (i, j) of the mask is set when `i >= j - n_src + n_dest`, i.e. the
    ones fill the lower triangle counted from the lower right corner, so a
    destination position never attends to a later source position.

    Arguments:
        batch_size: Scalar integer tensor, number of copies of the mask.
        n_dest: Number of destination (query) positions.
        n_src: Number of source (key) positions.
        dtype: dtype the boolean comparison result is cast to.

    Returns:
        Tensor of shape [batch_size, n_dest, n_src].
    """
    dest_positions = tf.range(n_dest)[:, None]
    src_positions = tf.range(n_src)
    visible = dest_positions >= src_positions - n_src + n_dest
    single_mask = tf.reshape(tf.cast(visible, dtype), [1, n_dest, n_src])
    # Repeat along the batch axis only; the two mask axes are tiled once.
    repeats = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(single_mask, repeats)


class TransformerBlock(layers.Layer):
    """Single transformer block: causal self attention followed by a feed forward net.

    Each of the two sub-layers is wrapped as dropout -> residual add -> layer
    normalization.

    Arguments:
        embed_dim: Size of the token embeddings; also passed to
            `MultiHeadAttention` as its second positional argument.
        num_heads: Number of attention heads.
        ff_dim: Hidden size of the feed forward network.
        rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads, embed_dim)
        feed_forward_layers = [
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
        self.ffn = keras.Sequential(feed_forward_layers)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        """Apply masked self attention and the feed forward net to `inputs`."""
        shape = tf.shape(inputs)
        n_batch, n_tokens = shape[0], shape[1]
        # Queries and keys are the same sequence, so the mask is square.
        mask = causal_attention_mask(n_batch, n_tokens, n_tokens, tf.bool)
        attended = self.dropout1(self.att(inputs, inputs, attention_mask=mask))
        attention_block = self.layernorm1(inputs + attended)
        projected = self.dropout2(self.ffn(attention_block))
        return self.layernorm2(attention_block + projected)

## Implement an embedding layer

Create two separate embedding layers: one for tokens and one for token index (positions).

In [3]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Arguments:
        maxlen: Number of rows in the position embedding table.
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in `x` and add the embedding of each position."""
        seq_len = tf.shape(x)[-1]
        position_vectors = self.pos_emb(tf.range(start=0, limit=seq_len, delta=1))
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

## Implement the miniature GPT model

In [4]:
vocab_size = 20000  # Only consider the top 20k words
maxlen = 80  # Max sequence size
embed_dim = 256  # Embedding size for each token
num_heads = 2  # Number of attention heads
feed_forward_dim = 256  # Hidden layer size in feed forward network inside transformer


def create_model():
    """Build and compile the miniature GPT model.

    The model maps `maxlen` token ids to two outputs: the next-token logits
    over the vocabulary and the transformer block output they are computed
    from. Only the logits are given a loss.
    """
    token_ids = layers.Input(shape=(maxlen,), dtype=tf.int32)
    hidden = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(token_ids)
    hidden = TransformerBlock(embed_dim, num_heads, feed_forward_dim)(hidden)
    logits = layers.Dense(vocab_size)(hidden)
    model = keras.Model(inputs=token_ids, outputs=[logits, hidden])
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # No loss and optimization based on word embeddings from transformer block
    model.compile("adam", loss=[cross_entropy, None])
    return model

## Prepare the data for word-level language modelling

Download the IMDB dataset and combine the training and test sets for a text generation task.

In [5]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

curl: /opt/conda/lib/libcurl.so.4: no version information available (required by curl)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  9419k      0  0:00:08  0:00:08 --:--:-- 17.8M


In [6]:
batch_size = 128

# The dataset contains each review in a separate text file
# The text files are present in four different folders
# Create a list of all files
filenames = []
directories = [
    "aclImdb/train/pos",
    "aclImdb/train/neg",
    "aclImdb/test/pos",
    "aclImdb/test/neg",
]
# `directory` rather than `dir`, so the builtin `dir` is not shadowed for the
# remainder of the notebook session.
for directory in directories:
    filenames.extend(
        os.path.join(directory, name) for name in os.listdir(directory)
    )

print(f"{len(filenames)} files")

# Create a dataset from text files: shuffle the file order, read the files
# line by line, shuffle again within a 256-element buffer and batch.
random.shuffle(filenames)
text_ds = tf.data.TextLineDataset(filenames)
text_ds = text_ds.shuffle(buffer_size=256)
text_ds = text_ds.batch(batch_size)


def custom_standardization(input_string):
    """Lowercase the text, remove html line-break tags and split off punctuation.

    Every character of `string.punctuation` gets a space inserted in front of
    it so that it becomes a separate token after whitespace splitting.

    Arguments:
        input_string: String tensor holding the raw text.

    Returns:
        String tensor with the standardized text.
    """
    lowercased = tf.strings.lower(input_string)
    stripped_html = tf.strings.regex_replace(lowercased, "<br />", " ")
    # Backslash-escape each punctuation character before placing it in the
    # character class. Interpolated raw, the sequence `\]` inside
    # string.punctuation is parsed as an escaped `]`, so a literal backslash
    # was never matched.
    punctuation_class = "".join("\\" + char for char in string.punctuation)
    return tf.strings.regex_replace(
        stripped_html, f"([{punctuation_class}])", r" \1"
    )


# Create a vectorization layer and adapt it to the text
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size - 1,
    output_mode="int",
    # One token more than the model input length: prepare_lm_inputs_labels
    # slices off the last / first token to form inputs / targets.
    output_sequence_length=maxlen + 1,
)
# Build the vocabulary from the batched text dataset
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()  # To get words back from token indices

## Function to create target column
def prepare_lm_inputs_labels(text):
    """
    Turn a batch of raw text into (input, target) token sequences.

    The text is tokenized with `vectorize_layer`; the target is the same
    sequence shifted left by one, so the target at position (i) is the word
    at position (i+1) and the model predicts the next word from all words up
    to position (i).
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    inputs = token_ids[:, :-1]
    targets = token_ids[:, 1:]
    return inputs, targets


# Tokenize every batch into (input, target) pairs and let tf.data choose the
# prefetch buffer size.
text_ds = text_ds.map(prepare_lm_inputs_labels)
text_ds = text_ds.prefetch(tf.data.AUTOTUNE)

50000 files


In [7]:
# Comments from the scoring file
comments_to_score = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
data1 = comments_to_score['text']

# Read the validation file once and take both of its comment columns from the
# same frame instead of parsing the CSV twice.
validation_data = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')
data2 = validation_data['more_toxic']
data3 = validation_data['less_toxic']

# Concatenate columns into a single column
text_column = pd.concat([data1, data2, data3], axis=0, ignore_index=True)
# Number of distinct comments in the combined column
len(text_column.unique())

14251

In [8]:
# data_1 = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
# data_1 = data_1['text']

# data = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')
# text_column = data['more_toxic']

# print(len(text_column.unique())

# Create a dataset from the pandas column.
# From here on `text_ds`, `vectorize_layer` and `vocab` are rebound, replacing
# the objects built from the IMDB text files.
text_ds = tf.data.Dataset.from_tensor_slices(text_column)

# Shuffle and batch the dataset
text_ds = text_ds.shuffle(buffer_size=256)
text_ds = text_ds.batch(batch_size)

# Create a vectorization layer and adapt it to the text
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size - 1,
    output_mode="int",
    # One token more than the model input length: prepare_lm_inputs_labels
    # slices off the last / first token to form inputs / targets.
    output_sequence_length=maxlen + 1,
)

vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()  # To get words back from token indices

# Tokenize every batch into (input, target) pairs
text_ds = text_ds.map(prepare_lm_inputs_labels)
text_ds = text_ds.prefetch(tf.data.AUTOTUNE)

In [9]:
# Preview examples from the text_ds dataset
sample = text_ds.take(5)  # Take five batches from the dataset

for x, y in sample:
    # Convert token indices of the first example in each batch back to words
    input_words  = [vocab[i] for i in x[0].numpy()]
    target_words = [vocab[i] for i in y[0].numpy()]

    # Print the input and target sequences
    print("Input Sequence:")
    print(" ".join(input_words))
    print("\nTarget Sequence:")
    print(" ".join(target_words))

Input Sequence:
1st off , im not banned , 2nd of , i will just push on your belly and a few will com out your butt ! anyway , im going to bring over a [UNK] battery , i think your ready for it .                                    

Target Sequence:
off , im not banned , 2nd of , i will just push on your belly and a few will com out your butt ! anyway , im going to bring over a [UNK] battery , i think your ready for it .                                     
Input Sequence:
" it is not npov because any thing negative is " [UNK] " " and anything [UNK] , is . it is pov 'ed because nothing negative is allowed to be posted without [UNK] it . glosses over the fact that he lied to the american people , started needless wars , killed many americans , glosses over his drug scandal , glosses over his violations of the first amendment , glosses over the fact that he lets his religion

Target Sequence:
it is not npov because any thing negative is " [UNK] " " and anything [UNK] , is . it is pov 'ed 

## Implement a Keras callback for generating text

In [10]:
# class TextGenerator(keras.callbacks.Callback):
#     """A callback to generate text from a trained model.
#     1. Feed some starting prompt to the model
#     2. Predict probabilities for the next token
#     3. Sample the next token and add it to the next input

#     Arguments:
#         max_tokens: Integer, the number of tokens to be generated after prompt.
#         start_tokens: List of integers, the token indices for the starting prompt.
#         index_to_word: List of strings, obtained from the TextVectorization layer.
#         top_k: Integer, sample from the `top_k` token predictions.
#         print_every: Integer, print after this many epochs.
#     """

#     def __init__(
#         self, max_tokens, start_tokens, index_to_word, top_k=10, print_every=1
#     ):
#         self.max_tokens = max_tokens
#         self.start_tokens = start_tokens
#         self.index_to_word = index_to_word
#         self.print_every = print_every
#         self.k = top_k

#     def sample_from(self, logits):
#         logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
#         indices = np.asarray(indices).astype("int32")
#         preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
#         preds = np.asarray(preds).astype("float32")
#         return np.random.choice(indices, p=preds)

#     def detokenize(self, number):
#         return self.index_to_word[number]

#     def on_epoch_end(self, epoch, logs=None):
#         start_tokens = [_ for _ in self.start_tokens]
#         if (epoch + 1) % self.print_every != 0:
#             return
#         num_tokens_generated = 0
#         tokens_generated = []
#         while num_tokens_generated <= self.max_tokens:
#             pad_len = maxlen - len(start_tokens)
#             sample_index = len(start_tokens) - 1
#             if pad_len < 0:
#                 x = start_tokens[:maxlen]
#                 sample_index = maxlen - 1
#             elif pad_len > 0:
#                 x = start_tokens + [0] * pad_len
#             else:
#                 x = start_tokens
#             x = np.array([x])
#             y, _ = self.model.predict(x)
#             sample_token = self.sample_from(y[0][sample_index])
#             tokens_generated.append(sample_token)
#             start_tokens.append(sample_token)
#             num_tokens_generated = len(tokens_generated)
#         txt = " ".join(
#             [self.detokenize(_) for _ in self.start_tokens + tokens_generated]
#         )
#         print(f"generated text:\n{txt}\n")
        
        

# # Tokenize starting prompt
# word_to_index = {}
# for index, word in enumerate(vocab):
#     word_to_index[word] = index


In [11]:
class TextGenerator(keras.callbacks.Callback):
    """A callback to generate text from a trained model.
    1. Feed some starting prompt to the model
    2. Predict probabilities for the next token
    3. Sample the next token and add it to the next input

    The model input is always `maxlen` tokens wide (module-level `maxlen`):
    shorter contexts are right-padded with 0, longer ones are cut down to
    their most recent `maxlen` tokens.

    Arguments:
        max_tokens: Integer, the number of tokens to be generated after prompt.
        start_tokens: List of integers, the token indices for the starting prompt.
        index_to_word: List of strings, obtained from the TextVectorization layer.
        top_k: Integer, sample from the `top_k` token predictions.
        print_every: Integer, print after this many epochs.
    """

    def __init__(
        self, max_tokens, start_tokens, index_to_word, top_k=10, print_every=1
    ):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.index_to_word = index_to_word
        self.print_every = print_every
        self.k = top_k

    def sample_from(self, logits):
        """Sample one token index among the `top_k` largest logits.

        The kept logits are turned into probabilities with a softmax and the
        returned index is drawn according to those probabilities.
        """
        logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def detokenize(self, number):
        """Return the word stored at index `number` of `index_to_word`."""
        return self.index_to_word[number]

    def on_epoch_end(self, epoch, logs=None):
        """Print the epoch loss (when available) and a freshly generated text.

        Arguments:
            epoch: Zero-based epoch index; output is produced only when
                `epoch + 1` is a multiple of `print_every`.
            logs: Optional dict of metrics. May be None or lack "loss", e.g.
                when this method is called by hand outside of `fit`.
        """
        if (epoch + 1) % self.print_every != 0:
            return
        # `logs` is None for manual calls; only report the loss if it exists.
        loss = (logs or {}).get("loss")
        if loss is not None:
            print(f"Epoch {epoch+1}: loss = {loss:.4f}")

        prompt_tokens = list(self.start_tokens)
        context = list(prompt_tokens)
        tokens_generated = []
        # Strictly fewer than max_tokens so exactly max_tokens are produced.
        while len(tokens_generated) < self.max_tokens:
            if len(context) >= maxlen:
                # Keep the most recent maxlen tokens so that newly generated
                # tokens stay inside the model's window.
                window = context[-maxlen:]
                sample_index = maxlen - 1
            else:
                # Right-pad with 0 and read the prediction at the last real token.
                sample_index = len(context) - 1
                window = context + [0] * (maxlen - len(context))
            # verbose=0: do not print a predict progress bar for every token.
            y, _ = self.model.predict(np.array([window]), verbose=0)
            sample_token = self.sample_from(y[0][sample_index])
            tokens_generated.append(sample_token)
            context.append(sample_token)
        txt = " ".join(
            [self.detokenize(_) for _ in prompt_tokens + tokens_generated]
        )
        print(f"Generated text:\n{txt}\n")

In [13]:
# Tokenize starting prompt: look-up table from vocabulary word to token index
word_to_index = {word: index for index, word in enumerate(vocab)}

In [14]:
# Prompt every generated text starts from. (A previous "this movie is"
# assignment was overwritten immediately and has been dropped.)
start_prompt = "what is the"


# Words missing from the vocabulary fall back to index 1
start_tokens = [word_to_index.get(word, 1) for word in start_prompt.split()]
num_tokens_generated = 40
text_gen_callback = TextGenerator(num_tokens_generated, start_tokens, vocab)

## Train the model

In [None]:
from tqdm import tqdm

# Create a wrapper function for the training loop
def train_with_progress_bar(model, dataset, epochs, callbacks):
    """Train `model` one epoch at a time while tracking progress with tqdm.

    Arguments:
        model: Compiled Keras model to train.
        dataset: Training data passed to `model.fit`.
        epochs: Total number of epochs to run.
        callbacks: List of Keras callbacks handed to every `fit` call.
    """
    # Disable the progress bar by setting `disable=True`
    with tqdm(total=epochs, disable=True) as pbar:
        for epoch in range(epochs):
            # Perform one epoch of training. `initial_epoch` / `epochs` make
            # Keras report the true epoch index to the callbacks instead of
            # restarting from 0 on every call.
            model.fit(
                dataset,
                verbose=0,
                initial_epoch=epoch,
                epochs=epoch + 1,
                callbacks=callbacks,
            )

            # Update the progress bar manually
            pbar.set_postfix({'Epoch': epoch + 1})
            pbar.update(1)

# Train the model using the wrapper function
# NOTE(review): in this notebook `model` and `N_EPOCHS` are first assigned in
# the following cell, so that cell has to be run before this call.
train_with_progress_bar(model, text_ds, N_EPOCHS, callbacks=[text_gen_callback])

In [16]:
model = create_model()

N_EPOCHS = 100
# Keras verbosity: 0 = silent, 1 = progress bar, 2 = one line per epoch
verbose = 0
model.fit(text_ds, verbose=verbose, epochs=N_EPOCHS, callbacks=[text_gen_callback])

Epoch 1: loss = 3.0517
Generated text:
what is the hell ? ? ? ? ? how to do it is ? ? ? ? ? ? ? ? ? ! ? ? ! ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?

Epoch 2: loss = 2.2536
Generated text:
what is the hell ! you 've got a life ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !



KeyboardInterrupt: 

Now we can generate text continuing from a new prompt.

In [None]:
new_start_prompt = "start something"
# Words missing from the vocabulary fall back to index 1
new_start_tokens = [word_to_index.get(word, 1) for word in new_start_prompt.split()]

# Reuse the existing callback with the new prompt and trigger one generation
# pass by calling its epoch-end hook directly.
# NOTE(review): no `logs` argument is passed, so on_epoch_end receives
# logs=None; TextGenerator.on_epoch_end must not index `logs` unconditionally.
text_gen_callback.start_tokens = new_start_tokens
text_gen_callback.on_epoch_end(0)