### Training GPT-2 for Text Generation and Intent Recognition using Multi-Model Learning

**Imports and Initial Setup**

In [1]:
# If Colab
import os

# Drive-mounted project root used when the notebook runs on Colab.
COLAB_PROJECT_ROOT = '/content/drive/Othercomputers/AKATSUKI-PC/PycharmProjects/chatopotamus'

# Only switch directories when the Colab path actually exists, so that
# "Restart & Run All" on another machine skips this cell instead of raising
# FileNotFoundError.
if os.path.isdir(COLAB_PROJECT_ROOT):
    os.chdir(COLAB_PROJECT_ROOT)

In [2]:
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf
from tensorflow import keras
import os
import joblib
import re
import pickle
import numpy as np
from src.training import text_prep, ner_prep

In [2]:
# If Local Interpreter
# Project root on the author's Windows machine.
LOCAL_PROJECT_ROOT = 'C:\\Users\\thory\\PycharmProjects\\chatopotamus'

# Only switch directories when the local path actually exists, so that
# "Restart & Run All" on Colab (or any other machine) skips this cell instead
# of raising FileNotFoundError.
if os.path.isdir(LOCAL_PROJECT_ROOT):
    os.chdir(LOCAL_PROJECT_ROOT)

**Path Constants**

In [3]:
# Directory (relative to the project root) that holds the raw text files.
DATA_PATH = 'resources/gpt/data'

# One text file per intent, each resolved relative to DATA_PATH.
ORDERS_PATH, ENQUIRY_PATH, COMPLAINS_PATH = (
    os.path.join(DATA_PATH, filename)
    for filename in ('orders.txt', 'enquiry.txt', 'complains.txt')
)

**Convert into Sequences**

In [4]:
# Turn each intent's text file into a pair (sequences, intent labels).
# The `intent` keyword is the label passed for every sequence of that file.
final_sequence_order, intent_order = text_prep.file_to_sequences(
    ORDERS_PATH, intent='order'
)
final_sequence_enquiry, intent_enquiry = text_prep.file_to_sequences(
    ENQUIRY_PATH, intent='enquiry'
)
final_sequence_complain, intent_complain = text_prep.file_to_sequences(
    COMPLAINS_PATH, intent='complain'
)

In [5]:
# Concatenate the three intents in the same order (complain, enquiry, order)
# for both sequences and labels so that index i of one matches index i of the other.
final_sequences = (
    final_sequence_complain
    + final_sequence_enquiry
    + final_sequence_order
)
final_intents = (
    intent_complain
    + intent_enquiry
    + intent_order
)

In [6]:
# Spot-check one example: the sequence text alongside its intent label.
final_sequences[89], final_intents[89]

("customer: my hot chocolate was too hot to drink system: that's not good would you like us to remake it at a cooler temperature or something else? customer: a cooler remake would be great system: you got it! we'll make sure your hot chocolate is at a comfortable temperature anything else? customer: no thank you system: perfect we're on it your hot chocolate will be ready shortly thanks for letting us correct it! customer: the grilled cheese i ordered was too dry system: we apologize for that would you like a new one maybe with extra butter or a different sandwich?",
 'complain')

**Tokenize**

In [7]:
# Load the pretrained uncased DistilBERT tokenizer (files are downloaded on first use).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [8]:
# Tokenize every sequence to exactly 150 tokens. The model's Input layers are
# declared with shape (150,), so pad to `max_length` rather than to the longest
# sequence in the batch (`padding=True`), which would yield a narrower tensor
# whenever no sequence reaches 150 tokens.
tokenized_inputs = tokenizer(
    final_sequences,
    max_length=150,
    truncation=True,
    padding='max_length',
    return_tensors="tf",
)

In [9]:
# Inspect the tokenizer output: the `input_ids` and `attention_mask` tensors.
tokenized_inputs

{'input_ids': <tf.Tensor: shape=(1083, 150), dtype=int32, numpy=
array([[  101,  8013,  1024, ...,     0,     0,     0],
       [  101,  8013,  1024, ...,  1055,  2025,   102],
       [  101,  8013,  1024, ...,     0,     0,     0],
       ...,
       [  101,  8013,  1024, ...,  1045, 12533,   102],
       [  101,  8013,  1024, ...,  2208,  2003,   102],
       [  101,  8013,  1024, ...,     0,     0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1083, 150), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

**Preprocessing**

In [25]:
# Build the dataset from the tokenized inputs and their intent labels.
# The displayed element_spec shows (features dict, label tensor of shape (None, 3)) pairs.
prepped_data = text_prep.preprocess_for_intent(tokenized_inputs, final_intents)
prepped_data

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 150), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 150), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 3), dtype=tf.float32, name=None))>

**Train/Validation/Test Split**

In [26]:
# Count the dataset's elements by iterating over it once.
# NOTE(review): the element_spec has a leading `None` dimension, so each element
# looks like a batch and these sizes would count batches, not samples - confirm.
total_size = sum(1 for _ in prepped_data)

# 60% train / 20% validation; the test set takes whatever is left after
# rounding down, so the three sizes always add up to total_size.
train_size = int(0.6 * total_size)
val_size = int(0.2 * total_size)
test_size = total_size - train_size - val_size

In [27]:
# Positional split: first `train_size` elements -> train, next `val_size`
# elements -> validation, everything after that -> test.
# NOTE(review): final_sequences is concatenated intent by intent; unless
# preprocess_for_intent shuffles deterministically, a positional split yields
# class-skewed subsets - confirm.
train_set = prepped_data.take(train_size)
val_set = prepped_data.skip(train_size).take(val_size)
test_set = prepped_data.skip(train_size + val_size)

**Modeling and Training**

**1. Intent Classification**

In [None]:
# Load pretrained DistilBERT configured for three output labels.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
)

In [28]:
# Functional-API classifier: DistilBERT encoder -> first-token vector -> dropout -> softmax.

# Two integer inputs of fixed length 150, named to match the dataset's feature keys.
input_ids = tf.keras.Input(shape=(150,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.Input(shape=(150,), dtype=tf.int32, name='attention_mask')

# Run only the encoder of the pretrained model; element 0 of its output is the
# last hidden state.
distilbert_output = model.distilbert(input_ids, attention_mask=attention_mask)[0]

# Keep the hidden state at sequence position 0 as the sequence representation.
pooled_output = distilbert_output[:, 0]

# Extra dropout for regularization before the classification head.
dropout = tf.keras.layers.Dropout(0.3)(pooled_output)

# Three-way softmax head producing one probability per intent.
classifier = tf.keras.layers.Dense(3, activation='softmax')(dropout)

# Assemble the end-to-end model.
final_model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=classifier)

In [29]:
# Print the layer-by-layer architecture and parameter counts.
final_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 150)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 150)]                0         []                            
 )                                                                                                
                                                                                                  
 distilbert (TFDistilBertMa  TFBaseModelOutput(last_hid   6636288   ['input_ids[0][0]',           
 inLayer)                    den_state=(None, 150, 768)   0          'attention_mask[0][0]']      
                             , hidden_states=None, atte                                     

In [None]:
# Parent directory for all TensorBoard runs of the intent-recognition training.
root_log_dir = os.path.join(os.curdir, 'tb_logs', 'ir_training')


def get_run_logdir():
    """Return a TensorBoard log directory path for a new training run.

    The run id carries the full local date and time down to the second, so
    runs started on different days or within the same minute do not share a
    directory.

    Returns:
        str: ``root_log_dir`` joined with ``run_<YYYY_MM_DD-HH_MM_SS>``.
    """
    import time
    run_id = time.strftime('run_%Y_%m_%d-%H_%M_%S')
    return os.path.join(root_log_dir, run_id)

In [30]:
# Adam with a small learning rate for fine-tuning.
optimizer = keras.optimizers.Adam(learning_rate=0.00008)

# Pass the configured optimizer instance. The string 'adam' would make Keras
# build a new Adam with its default learning rate and ignore the one above.
final_model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=[
        'accuracy',
        keras.metrics.Precision(),
        keras.metrics.Recall(),
        ner_prep.F1Score(),
    ],
)

In [31]:
# Fresh TensorBoard directory for this run.
run_logdir = get_run_logdir()
tb_callback = keras.callbacks.TensorBoard(log_dir=run_logdir)

# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
es_callback = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

In [32]:
# Train with early stopping and TensorBoard logging.
# `use_multiprocessing` is dropped: it applies only to generator /
# keras.utils.Sequence inputs, not to a tf.data.Dataset, and newer Keras
# versions reject the argument.
history = final_model.fit(
    train_set,
    epochs=100,
    validation_data=val_set,
    callbacks=[tb_callback, es_callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100

KeyboardInterrupt: 

**Saving/Loading**

In [23]:
# Persist the tokenizer output so tokenization can be skipped on later runs.
text_prep.save_file('resources/gpt/data/tokenized_inputs.pkl', tokenized_inputs)
# Persist the preprocessed dataset to its own directory.
prepped_data.save('resources/gpt/prepped_data_gen')

In [8]:
# Reload the tokenizer output saved earlier instead of re-tokenizing.
# NOTE(review): the serialization format used by text_prep.save_file is not
# visible here - confirm it is compatible with joblib.load.
# joblib/pickle loading can execute arbitrary code; only load files you created.
with open('resources/gpt/data/tokenized_inputs.pkl', 'rb') as f:
    tokenized_inputs = joblib.load(f)