### Text Generation using GPT2

In [1]:
# For Colab
import os

# Project root on the mounted Google Drive.
COLAB_PROJECT_DIR = '/content/drive/Othercomputers/AKATSUKI-PC/PycharmProjects/chatopotamus'

# Only switch directories when the Colab path exists, so that running every
# cell on a non-Colab machine skips this step instead of raising
# FileNotFoundError.
if os.path.isdir(COLAB_PROJECT_DIR):
    os.chdir(COLAB_PROJECT_DIR)
else:
    print(f'Colab project directory not found, staying in {os.getcwd()}')

In [1]:
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
from tensorflow import keras
from all_legacy_code.src.preprocess import text_prep

In [5]:
# For Local Interpreter
import os

# Project root on the local Windows machine.
LOCAL_PROJECT_DIR = 'C:\\Users\\thory\\PycharmProjects\\chatopotamus'

# Only switch directories when the local path exists, so that running every
# cell on Colab (or any other machine) skips this step instead of raising
# FileNotFoundError.
# NOTE(review): this cell sits after the cell that imports
# `all_legacy_code...text_prep`; presumably the working directory has to be the
# project root before that import can succeed - confirm and reorder if so.
if os.path.isdir(LOCAL_PROJECT_DIR):
    os.chdir(LOCAL_PROJECT_DIR)
else:
    print(f'Local project directory not found, staying in {os.getcwd()}')

**Getting Data**

In [6]:
# Folder holding the raw dialogue transcripts, relative to the project root.
DATA_PATH = 'resources/gpt/data'

# One transcript file per intent, all located directly under DATA_PATH.
ORDERS_PATH, ENQUIRY_PATH, COMPLAINS_PATH = (
    os.path.join(DATA_PATH, filename)
    for filename in ('orders.txt', 'enquiry.txt', 'complains.txt')
)

In [7]:
# Load each intent's transcript through the project helper.
# The starred target collects every returned element except the last into a
# list; the last element is bound to the matching intent_* name.
# NOTE(review): append_dialogues further down unpacks each starred list as
# (customer_dialogues, system_dialogues), so with split_speaker=True the helper
# presumably returns exactly those two sequences followed by the intent
# labels - confirm in text_prep.file_to_sequences.
*final_sequence_order, intent_order  = text_prep.file_to_sequences(ORDERS_PATH, intent='order', split_speaker=True)
*final_sequence_enquiry, intent_enquiry = text_prep.file_to_sequences(ENQUIRY_PATH, intent='enquiry', split_speaker=True)
*final_sequence_complain, intent_complain = text_prep.file_to_sequences(COMPLAINS_PATH, intent='complain', split_speaker=True)

In [8]:
# Concatenate the three starred lists from the loading cell (complain, enquiry, order).
# NOTE(review): each operand is the list produced by star-unpacking, so with
# split_speaker=True this presumably places the per-file customer/system
# sequences side by side rather than producing one flat list of dialogues -
# confirm this is what the tokenizer and preprocessing cells below expect.
final_sequences = final_sequence_complain+final_sequence_enquiry+final_sequence_order

In [9]:
# ONLY FOR SPLIT_SPEAKER
# Flat accumulators shared by all intent files; reset every time the cell runs.
all_customer_dialogues, all_system_dialogues = [], []


def append_dialogues(customer_dialogues, system_dialogues):
    """Extend the module-level accumulators with one file's customer and system utterances."""
    for accumulator, incoming in (
        (all_customer_dialogues, customer_dialogues),
        (all_system_dialogues, system_dialogues),
    ):
        accumulator.extend(incoming)


# Feed the files in a fixed order: order, enquiry, complain.
append_dialogues(*final_sequence_order)
append_dialogues(*final_sequence_enquiry)
append_dialogues(*final_sequence_complain)

# Pair of lists: (all customer utterances, all system utterances).
final_sequences_pair = (all_customer_dialogues, all_system_dialogues)

In [11]:
# Display the (customer, system) tuple built in the previous cell.
# NOTE(review): the output saved with this cell is `70`, which is not the repr
# of a tuple of two lists - the recorded output looks stale; re-run the cell or
# replace it with an explicit len(...) check.
final_sequences_pair

70

**Tokenizing**

In [None]:
# Load the pretrained 'gpt2' tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Pad on the left so the real tokens sit at the end of each fixed-length row.
tokenizer.padding_side = 'left'
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Sanity check on the first entry only: pad/truncate to 40 tokens and return TensorFlow tensors.
tokenized_inputs = tokenizer(final_sequences[0], max_length=40, padding='max_length', truncation=True, return_tensors='tf')

In [9]:
# Inspect the encoded sample (input_ids and attention_mask).
tokenized_inputs

{'input_ids': <tf.Tensor: shape=(1, 40), dtype=int32, numpy=
array([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        23144,   263,    25, 23105,   616,  6891,   373,  1165, 12922,
         1080,    25,  7926,   284,  3285,   326,     0,   356,  4031,
          329, 20187,   407, 35987,   644,   460,   356,   466,   284,
          787,   340,   826,    30]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 40), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
      dtype=int32)>}

**Preprocessing** (this step encapsulates tokenization)

In [10]:
# Build the training dataset with the project helper, then display its element spec.
# NOTE(review): the recorded spec shows a sequence length of 39 for both inputs
# and labels; presumably one_shift=True derives next-token labels by shifting a
# 40-token encoding by one position - confirm in text_prep.preprocess_for_gpt2.
prepped_data = text_prep.preprocess_for_gpt2(final_sequences, tokenizer, train=True, one_shift=True)
prepped_data

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 39), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 39), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 39), dtype=tf.int32, name=None))>

In [11]:
# Print the model inputs and the labels of the first dataset element.
for first_batch in prepped_data.take(1):
    model_inputs, labels = first_batch[0], first_batch[1]
    print(model_inputs, labels)

{'input_ids': <tf.Tensor: shape=(16, 39), dtype=int32, numpy=
array([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 23144,   263,    25,   262,  8887,  1312,
         6149,   373,   300,  4649, 31975,  1080,    25,   356, 16521,
          329,   326,   561,   345,   588,   257,  3024,  9014,   393,
          257,  1180,  4144],
       [23144,   263,    25,   326,   338,   477,   329,   783,  5875,
          345,     0,  1080,    25,   345,   821,  7062,     0,   534,
         1502,   329,   257,  1588,  3223, 32595,  6891, 34240,  9891,
         3425,  5362,   290, 11311, 11594, 27563,   259,   318,  4999,
          534,  2472,   318],
       [50256, 50256, 23144,   263,    25,  1312,  1549,  9144,   326,
         5875,   345,  1080,    25,   345,   821,  7062,     0,   356,
          447,   247,   260,   319,   340,   534,  1844,   698,    83,
          481,   307,  3492,   287,   257,  7644, 32920,   329,   262,
         5022,    12,   92

**Train-Test Split**

In [12]:
# First 200 dataset elements go to training, everything after them to evaluation.
# NOTE(review): this split is only stable if prepped_data yields its elements in
# the same order on every pass; if preprocess_for_gpt2 reshuffles per iteration,
# rows can move between the two sets between epochs - confirm.
train_dataset, test_dataset = prepped_data.take(200), prepped_data.skip(200)

**Model Training**

In [None]:
# Start from the pretrained 'gpt2' checkpoint (TensorFlow language-model head variant).
gpt2_oneshift = TFGPT2LMHeadModel.from_pretrained('gpt2')

In [5]:
# Resume from previously saved fine-tuned weights.
# NOTE(review): this file is written by the save_weights call in the training
# cell further down, so on a first-ever run it presumably does not exist yet -
# skip this cell or guard it when training from scratch.
gpt2_oneshift.load_weights('resources/gpt/saved/gpt2_oneshift_weights.h5')

In [14]:
# The loss is configured with from_logits=True, i.e. it is fed unnormalised scores.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Adam with the initial fine-tuning learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
gpt2_oneshift.compile(loss=loss, optimizer=optimizer)

In [16]:
import time

# TensorBoard logs go to a folder named after the current hour and minute.
# NOTE(review): the run id carries no date, so runs started at the same
# hour:minute on different days share a folder.
run_id = time.strftime('run_%H-%M')
tb_callback = keras.callbacks.TensorBoard(log_dir=f'tb_logs/gen_training/{run_id}')

# Stop after 6 epochs without improvement and roll back to the best weights.
es_callback = keras.callbacks.EarlyStopping(restore_best_weights=True, patience=6)

# Project-defined one-cycle learning-rate schedule.
# NOTE(review): `30 * 200 / 16` evaluates to the float 375.0. train_dataset is
# 200 dataset elements and the recorded batch has 16 rows per element, so 30
# epochs would presumably be 30 * 200 steps - confirm what OneCycleLRSchedule
# expects for total_steps.
# NOTE(review): none of the fit calls in this notebook pass tb_callback or
# onecycle_callback; only es_callback is used.
onecycle_callback = text_prep.OneCycleLRSchedule(lr_start=5e-5, max_lr=5e-3, lr_end=5e-5, total_steps=30 * 200 / 16)

In [43]:
# Train for up to 30 epochs, validating on test_dataset; only early stopping is attached.
gpt2_oneshift.fit(train_dataset, epochs=30, validation_data=test_dataset, callbacks=[es_callback])
# Persist the resulting weights; the load_weights cells in this notebook read this same path.
gpt2_oneshift.save_weights('resources/gpt/saved/gpt2_oneshift_weights.h5')

In [18]:
# Additional training pass of up to 10 epochs with the same early-stopping callback.
# NOTE(review): this cell does not save the weights afterwards.
gpt2_oneshift.fit(train_dataset, epochs=10, validation_data=test_dataset, callbacks=[es_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b3644642290>

In [39]:
# Re-compile with a much smaller step size (5e-7) for a final low-rate pass.
# Compiling again replaces the optimizer object, so its internal state starts fresh.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-7)
gpt2_oneshift.compile(loss=loss, optimizer=optimizer)

In [40]:
# Low-learning-rate pass of up to 10 epochs; the recorded log ends at epoch 8 of 10.
# NOTE(review): this cell does not save the weights afterwards.
gpt2_oneshift.fit(train_dataset, epochs=10, validation_data=test_dataset, callbacks=[es_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


<keras.src.callbacks.History at 0x7b353d468dc0>

**Round 2**

In [6]:
# Round 2: reload the three transcripts exactly as in the "Getting Data" section.
# The starred target collects every returned element except the last into a
# list; the last element is bound to the matching intent_* name.
# NOTE(review): presumably the helper returns (customer, system, intent) when
# split_speaker=True - confirm in text_prep.file_to_sequences.
*final_sequence_order, intent_order  = text_prep.file_to_sequences(ORDERS_PATH, intent='order', split_speaker=True)
*final_sequence_enquiry, intent_enquiry = text_prep.file_to_sequences(ENQUIRY_PATH, intent='enquiry', split_speaker=True)
*final_sequence_complain, intent_complain = text_prep.file_to_sequences(COMPLAINS_PATH, intent='complain', split_speaker=True)

In [7]:
# ONLY FOR SPLIT_SPEAKER
# Start from empty accumulators so that re-running the cell does not duplicate data.
all_customer_dialogues, all_system_dialogues = [], []


def append_dialogues(customer_dialogues, system_dialogues):
    """Add one file's customer and system utterances to the module-level accumulators."""
    for accumulator, incoming in (
        (all_customer_dialogues, customer_dialogues),
        (all_system_dialogues, system_dialogues),
    ):
        accumulator.extend(incoming)


# Files are merged in this order: order, enquiry, complain.
append_dialogues(*final_sequence_order)
append_dialogues(*final_sequence_enquiry)
append_dialogues(*final_sequence_complain)

# Pair of lists: (all customer utterances, all system utterances).
final_sequences_pair = (all_customer_dialogues, all_system_dialogues)

In [8]:
# Spot-check one customer utterance (index 10 of the first list in the pair).
final_sequences_pair[0][10]

'customer: just a medium hot chocolate to finish'

In [11]:
# Preprocess the (customer, system) pair with one_shift disabled, then display the element spec.
# NOTE(review): the recorded spec shows a sequence length of 20 for inputs and
# labels; presumably customer turns become inputs and system turns become
# labels - confirm in text_prep.preprocess_for_gpt2.
prepped_data_pair = text_prep.preprocess_for_gpt2(final_sequences_pair, tokenizer, train=True, one_shift=False)
prepped_data_pair

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 20), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 20), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 20), dtype=tf.int32, name=None))>

In [12]:
# First 200 dataset elements for training, the remainder for evaluation.
# NOTE(review): this split is only stable if prepped_data_pair yields its
# elements in the same order on every pass - confirm that
# preprocess_for_gpt2 does not reshuffle per iteration.
train_dataset_pair, test_dataset_pair = prepped_data_pair.take(200), prepped_data_pair.skip(200)

In [13]:
# Create a second model from gpt2_oneshift via Keras' clone utility.
gpt2_combined = keras.models.clone_model(gpt2_oneshift)
# Build the clone so its weight variables exist before they are assigned.
gpt2_combined.build()
# Copy the current weights of gpt2_oneshift into the clone explicitly;
# presumably so that further training leaves gpt2_oneshift untouched.
gpt2_combined.set_weights(gpt2_oneshift.get_weights())

In [48]:
# Print the first element of the pair training set as a sanity check.
for first_batch in train_dataset_pair.take(1):
    print(first_batch)

({'input_ids': <tf.Tensor: shape=(16, 20), dtype=int32, numpy=
array([[23144,   263,    25,  3763,  1312,  1549,   588,   257, 11311,
        19550,   836,   315,   290,   257,  1402, 48718, 16858,   351,
          281,  3131],
       [50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 23144,   263,    25,   290,   257,  1402, 48247,
         4274,  4274],
       [50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 23144,   263,    25,  1342,
         6029,  3387],
       [50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 23144,   263,    25,
         3763,  3387],
       [50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 23144,   263,
           25,  1402],
       [50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 23144,
          263,    25,   466,   345,  2897

In [36]:
# Loss and optimizer for the pair-trained model; kept under *_c names so the
# objects used for gpt2_oneshift are not overwritten.
loss_c = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer_c = tf.keras.optimizers.Adam(learning_rate=0.00001)
gpt2_combined.compile(loss=loss_c, optimizer=optimizer_c)

In [39]:
# The training call below is disabled; as written, this cell only writes
# gpt2_combined's current weights to disk.
# NOTE(review): the disabled call fits on the full prepped_data_pair while
# validating on test_dataset_pair, which is a slice (skip(200)) of that same
# dataset - if it is re-enabled, train on train_dataset_pair instead.
#history_c1 = gpt2_combined.fit(prepped_data_pair, epochs=30, callbacks=[es_callback], validation_data=test_dataset_pair)
gpt2_combined.save_weights('resources/gpt/saved/gpt2_combined_weights.h5')

**Augmentation**

In [28]:
# Phrase substitutions used for text augmentation. augment_dialogue_pairs
# applies the entries one after another in insertion order, as plain substring
# replacements.
# NOTE(review): "i will" is rewritten to "i can" before the longer
# "i will take", "i will take a" and "i will have a" entries are tried, so
# those three can no longer match anything. Reordering would change the
# generated data, so the order is left as is - confirm the intent.
# NOTE(review): "Actually" (and "Also" below) are capitalised, while the sample
# utterances shown in this notebook are lower-case; these keys may never match.
replacements_2 = {
    "order ": "drink ",
    "i will": "i can",
    "i will take": "can i get",
    "i will take a": "i want the",
    "that is all": "that is everything yes",
    "i will have a": "i wanna get a",
    "get me a": "how about a",
    "Actually": "You know what",
}

# First-stage substitutions; several of their outputs are keys of replacements_2.
# NOTE(review): "i'll" is rewritten to "i will" before "i'll start with" is
# tried, so that entry can no longer match.
replacements_1 = {
    "place an order": "order",
    "i'd like to": "i will",
    "i'll": "i will",
    "i'll start with": "i will take",
    "i'd like a": "i will take a",
    "that'll be it": "that is all",
    "Also": "As well as",
}

def augment_dialogue_pairs(dialogue_pair, replacements):
    """Create augmented copies of dialogue pairs by substituting phrases.

    Each customer utterance and its system response are rewritten
    independently. Working on the utterances directly (instead of joining them
    with a newline and splitting again) means an utterance that contains a
    newline no longer raises ValueError, and text that happens to look like a
    "Customer: " / "System: " marker is left alone.

    Args:
        dialogue_pair: Two parallel sequences of strings,
            (customer_dialogues, system_responses). If their lengths differ,
            the extra items of the longer one are ignored.
        replacements: Mapping of substring -> replacement. Entries are applied
            one after another in the mapping's iteration order, so an earlier
            key can pre-empt a later key that contains it.

    Returns:
        A tuple (updated_customer_dialogues, updated_system_responses) holding
        only the pairs in which at least one side was changed. The input
        sequences are not modified.
    """
    def apply_replacements(text):
        # Sequential plain-substring substitution, in mapping order.
        for original, replacement in replacements.items():
            text = text.replace(original, replacement)
        return text

    # Unpack the customer and system dialogues
    customer_dialogues, system_responses = dialogue_pair

    updated_customer_dialogues = []
    updated_system_responses = []
    for customer, system in zip(customer_dialogues, system_responses):
        new_customer = apply_replacements(customer)
        new_system = apply_replacements(system)
        # Keep the pair only if changes were made on either side
        if new_customer != customer or new_system != system:
            updated_customer_dialogues.append(new_customer)
            updated_system_responses.append(new_system)

    return (updated_customer_dialogues, updated_system_responses)



In [35]:
# Augment the merged (customer, system) pairs with the second replacement table;
# only pairs that were actually changed are returned.
# NOTE(review): no cell in this notebook applies replacements_1, although
# several replacements_2 keys are replacements_1 outputs - presumably a
# first-stage augmentation cell was deleted; confirm.
new_pairs_2 = augment_dialogue_pairs(final_sequences_pair, replacements_2)

In [37]:
# Number of augmented pairs, i.e. pairs that replacements_2 changed.
len(new_pairs_2[0])

793

In [38]:
# ONLY FOR SPLIT_SPEAKER
# NOTE(review): `final_sequences_2` was not defined by any cell in this
# notebook, so this cell raised NameError on a fresh kernel. It is rebuilt here
# as the original pairs plus their `replacements_1` augmentations, which is what
# the naming (new_pairs_2 / final_sequences_3) and the otherwise unused
# `replacements_1` table suggest - confirm against the intended dataset.
new_pairs_1 = augment_dialogue_pairs(final_sequences_pair, replacements_1)
final_sequences_2 = (
    list(final_sequences_pair[0]) + list(new_pairs_1[0]),
    list(final_sequences_pair[1]) + list(new_pairs_1[1]),
)

# Fresh accumulators, so re-running the cell does not duplicate data.
all_customer_dialogues = []
all_system_dialogues = []

# Function to append dialogues to the respective lists
def append_dialogues(customer_dialogues, system_dialogues):
    """Extend the module-level accumulators with one set of customer/system utterances."""
    all_customer_dialogues.extend(customer_dialogues)
    all_system_dialogues.extend(system_dialogues)

# Augmented pairs first, then the base data
append_dialogues(*new_pairs_2)
append_dialogues(*final_sequences_2)

# Now all_customer_dialogues and all_system_dialogues contain all dialogues
final_sequences_3 = (all_customer_dialogues, all_system_dialogues)

In [44]:
# Spot-check a system utterance deep in the merged list; index 6000 only exists
# when the merged data holds more than 6000 pairs.
final_sequences_3[1][6000]

"system: sure we'll prepare a new hot chocolate with less sweetness for you anything else"

In [45]:
# Preprocess the augmented pair data the same way as prepped_data_pair
# (one_shift disabled), then display the element spec.
reprepped_data_pairs = text_prep.preprocess_for_gpt2(final_sequences_3, tokenizer, train=True, one_shift=False)
reprepped_data_pairs

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 20), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 20), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 20), dtype=tf.int32, name=None))>

In [47]:
# First 315 dataset elements for training, the remainder for evaluation.
# NOTE(review): augmented variants of a dialogue can presumably land on the
# other side of this split from their original, which would make the validation
# loss optimistic - confirm whether that matters here.
retrain_set_pairs, retest_set_pairs = reprepped_data_pairs.take(315), reprepped_data_pairs.skip(315)

In [None]:
# Recreate gpt2_combined from the pretrained checkpoint, replacing the earlier clone.
gpt2_combined = TFGPT2LMHeadModel.from_pretrained('gpt2')
# Initialise it from the saved one-shift weights (not from gpt2_combined_weights.h5).
gpt2_combined.load_weights('resources/gpt/saved/gpt2_oneshift_weights.h5')
# NOTE(review): the prediction cell calls build() before load_weights(), here it
# comes after - confirm the order is irrelevant for this model.
gpt2_combined.build()

In [62]:
# A new EarlyStopping instance replaces the one created for the first training run.
es_callback = keras.callbacks.EarlyStopping(restore_best_weights=True, patience=6)

# NOTE(review): 5e-8 is three orders of magnitude below the 5e-5 used for the
# first training run - confirm it is intentional and not a typo.
loss_c = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer_c = tf.keras.optimizers.Nadam(learning_rate=5e-8)
gpt2_combined.compile(loss=loss_c, optimizer=optimizer_c)

In [63]:
# Fine-tune on the augmented pair data for up to 20 epochs; the recorded log
# ends at epoch 9. The returned History object is kept in history_c3.
history_c3 = gpt2_combined.fit(retrain_set_pairs, epochs=20, validation_data=retest_set_pairs, callbacks=[es_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


In [64]:
# Persist the pair-trained weights (overwrites any earlier gpt2_combined_weights.h5).
gpt2_combined.save_weights('resources/gpt/saved/gpt2_combined_weights.h5')

In [54]:
# Same training call as the history_c3 cell; the result is stored separately in history_c4.
# NOTE(review): this cell's execution count (54) is lower than the history_c3
# cell's (63), so it was run earlier than its position suggests.
history_c4 = gpt2_combined.fit(retrain_set_pairs, epochs=20, validation_data=retest_set_pairs, callbacks=[es_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


In [55]:
# Persist the weights again to the same path used after the history_c3 run.
gpt2_combined.save_weights('resources/gpt/saved/gpt2_combined_weights.h5')

**Prediction**

In [70]:

# NOTE(review): this project import sits mid-notebook; consider moving it to the
# import cell at the top.
from all_legacy_code.src.training_and_prediction import predict

# Recreate a pretrained GPT-2 and load the fine-tuned one-shift weights from disk.
gpt2_oneshift = TFGPT2LMHeadModel.from_pretrained('gpt2')
gpt2_oneshift.build()
gpt2_oneshift.load_weights('resources/gpt/saved/gpt2_oneshift_weights.h5')
# Generate a continuation for a sample customer turn.
# NOTE(review): this evaluates the one-shift weights, not the
# gpt2_combined_weights.h5 saved in the section above - confirm which model is
# meant to be demonstrated.
predict.generate_text('customer: hi ill have a small mocha', tokenizer, gpt2_oneshift, max_length=50)

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'customer: hi ill have a small mocha but it’s a detective mystery system: smallmochas with a plot as rich and complex as its chocolate what else can we uncover for'

In [66]:
# Persist the preprocessed, augmented dataset to disk.
# NOTE(review): this cell's execution count (66) is lower than the prediction
# cell's above (70), so it was run before that cell despite being placed last.
reprepped_data_pairs.save('resources/gpt/data/reprepped_data_gen')