### Training GPT-2 for Text Generation and Intent Recognition using Multi-Model Learning

**Imports and Initial Setup**

In [1]:
# If Colab
import os
os.chdir('/content/drive/Othercomputers/AKATSUKI-PC/PycharmProjects/chatopotamus')

In [1]:
from transformers import DistilBertTokenizer, TFDistilBertModel, DistilBertConfig
import tensorflow as tf
from tensorflow import keras
import os
import joblib
import re
import pickle
import numpy as np
from src.training import text_prep, ner_prep

In [3]:
# If Local Interpreter
os.chdir('C:\\Users\\thory\\PycharmProjects\\chatopotamus')

**Path Constants**

In [3]:
DATA_PATH = 'resources/gpt/data'
ORDERS_PATH = os.path.join(DATA_PATH, 'orders.txt')
ENQUIRY_PATH = os.path.join(DATA_PATH, 'enquiry.txt')
COMPLAINS_PATH = os.path.join(DATA_PATH, 'complains.txt')

**Convert into Sequences**

In [4]:
final_sequence_order, intent_order  = text_prep.file_to_sequences(ORDERS_PATH, intent='order')
final_sequence_enquiry, intent_enquiry = text_prep.file_to_sequences(ENQUIRY_PATH, intent='enquiry')
final_sequence_complain, intent_complain = text_prep.file_to_sequences(COMPLAINS_PATH, intent='complain')

In [5]:
final_sequences = final_sequence_complain + final_sequence_enquiry + final_sequence_order
final_intents = intent_complain+intent_enquiry+intent_order

In [6]:
final_sequences[3800], final_intents[3800]

("customer: throw in a small french vanilla but it's the first love system: small french vanilla sweet and a little bit dreamy the kind of sip you never forget that’ll be <price> how do you want to reminisce cash or card?",
 'order')

**Tokenize**

In [7]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [8]:
tokenized_inputs = tokenizer(final_sequences, max_length=40, truncation=True, padding=True, return_tensors="tf")

In [9]:
tokenized_inputs

{'input_ids': <tf.Tensor: shape=(4320, 40), dtype=int32, numpy=
array([[  101,  8013,  1024, ...,     0,     0,     0],
       [  101,  8013,  1024, ...,  2005,  2017,   102],
       [  101,  8013,  1024, ...,     0,     0,     0],
       ...,
       [  101,  8013,  1024, ..., 14744,  1000,   102],
       [  101,  8013,  1024, ...,  1037, 25751,   102],
       [  101,  8013,  1024, ...,  3424,  6895,   102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(4320, 40), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int32)>}

**Preprocessing**

In [10]:
prepped_data = text_prep.preprocess_for_intent(tokenized_inputs, final_intents)
prepped_data

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 40), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 40), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 3), dtype=tf.float32, name=None))>

In [11]:
for i in prepped_data.take(1):
    print(i)

({'input_ids': <tf.Tensor: shape=(16, 40), dtype=int32, numpy=
array([[  101,  8013,  1024,  1045,  1005,  1049,  2036, 26369,  1037,
         4977, 11611,  2252,  6090,  5498, 15174,  2098,  1998,  3531,
         5587,  4469, 11611,  2291,  1024,  2469,  1037,  4977, 11611,
         2252,  6090,  5498, 15174,  2098,  2007,  4469, 11611,  2505,
         2842,  2000,  3143,   102],
       [  101,  8013,  1024,  2026,  2980,  7967,  2001,  2205,  2980,
         1045,  2481,  1005,  1056,  4392,  2009,  2291,  1024,  2008,
         1005,  1055,  2205,  2980,  2000,  5047,  2292,  1005,  1055,
         4658,  2009,  2091,  2000,  1996,  3819, 24747,  4860,  2005,
         2017,   102,     0,     0],
       [  101,  8013,  1024,  2008,  1005,  2222,  2022,  2009,  4067,
         2017,   999,  2291,  1024,  2017,  1005,  2128,  6160,   999,
         2115,  2344,  2005,  1037,  5396,  9587,  7507,  2630,  9766,
        14163, 15379,  5396, 28248,  5572,  1998,  4524,  2884,  2007,
         69

**Train-Test Split**

In [12]:
total_size = 0
for _ in prepped_data:
    total_size += 1

In [13]:
total_size

270

In [15]:
train_set = prepped_data.take(200)

val_set = prepped_data.skip(200).take(30)

test_set = prepped_data.skip(230)

**Modeling and Training**

**1. Intent Classification**

In [None]:
config = DistilBertConfig.from_pretrained('distilbert-base-uncased')
model = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

In [28]:
input_ids = tf.keras.layers.Input(shape=(40,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(shape=(40,), dtype=tf.int32, name='attention_mask')

# Use the DistilBERT model
distilbert_output = model(input_ids, attention_mask=attention_mask)[0]

# Get the output for the [CLS] token (first token)
pooled_output = distilbert_output[:, 0]

# Additional dropout layer for regularization
dropout = tf.keras.layers.Dropout(0.3)(pooled_output)

# Classifier layer for your 3 classes
classifier = tf.keras.layers.Dense(3, activation='softmax')(dropout)

# Final model
final_model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=classifier)

In [29]:
final_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 40)]                 0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 40)]                 0         []                            
 )                                                                                                
                                                                                                  
 tf_distil_bert_model (TFDi  TFBaseModelOutput(last_hid   6636288   ['input_ids[0][0]',           
 stilBertModel)              den_state=(None, 40, 768),   0          'attention_mask[0][0]']      
                              hidden_states=None, atten                                     

In [30]:
root_log_dir = os.path.join(os.curdir, 'tb_logs', 'ir_training')
def get_run_logdir():
    import time
    run_id = time.strftime('run_%H-%M')
    run_logdir = os.path.join(root_log_dir, run_id)
    return run_logdir

In [31]:
max_lr = 0.0001  # Peak learning rate
num_epochs = 50
batch_size = 16
num_samples = 270

# Initialize the scheduler
onecycle_lr_scheduler = text_prep.OneCycleLRSchedule(max_lr, total_steps=num_epochs * (num_samples // batch_size))

In [32]:
optimizer = keras.optimizers.Adam(learning_rate=0.00001)
final_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall(), ner_prep.F1Score()])

In [33]:
run_logdir = get_run_logdir()
tb_callback = keras.callbacks.TensorBoard(log_dir=run_logdir)
es_callback = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

In [34]:
history = final_model.fit(train_set, epochs=7, validation_data=val_set, callbacks=[tb_callback, es_callback, onecycle_lr_scheduler], use_multiprocessing=True)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


![Epoch Loss](C:\Users\thory\PycharmProjects\chatopotamus\tb_logs\images\epoch_loss.png)

**Evaluation**

In [36]:
loss, accuracy, precision, recall, f1_score = final_model.evaluate(test_set)
print(f'Loss : {loss}\nAccuracy : {accuracy}\nPrecision : {precision}\nRecall : {recall}\nF1 Score : {f1_score}')

Loss : 0.07061126083135605
Accuracy : 0.9828125238418579
Precision : 0.9843505620956421
Recall : 0.9828125238418579
F1 Score : 0.983580470085144


**Saving/Loading**

In [39]:
model.save_weights('resources/bert/ir_init_model.h5')
final_model.save_weights('resources/bert/ir_final_weights.h5')

In [23]:
text_prep.save_file('resources/gpt/data/tokenized_inputs.pkl', tokenized_inputs)
prepped_data.save('resources/gpt/prepped_data_gen')

In [8]:
with open('resources/gpt/data/tokenized_inputs.pkl', 'rb') as f:
    tokenized_inputs = joblib.load(f)

In [4]:
final_model = keras.models.load_weights('resources/bert/ir_final_weights.h5')

TypeError: load_weights() missing 1 required positional argument: 'filepath'