### Training GPT-2 for Text Generation and Intent Recognition using Multi-Model Learning

**Imports and Initial Setup**

In [2]:
# If Colab
import os

# Google Drive location of the project when the notebook runs on Colab.
COLAB_PROJECT_DIR = '/content/drive/Othercomputers/AKATSUKI-PC/PycharmProjects/chatopotamus'

# Only change directory when the Drive folder is actually present. On any other
# machine this cell becomes a no-op instead of raising FileNotFoundError, so the
# notebook can still be run top to bottom.
if os.path.isdir(COLAB_PROJECT_DIR):
    os.chdir(COLAB_PROJECT_DIR)
else:
    print('Colab project directory not found; staying in', os.getcwd())

In [1]:
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
import tensorflow as tf
import os
import joblib
import re
import pickle
import numpy as np
from src.training import gpt_trainer

In [None]:
# Load the language model and its tokenizer from the same pretrained 'gpt2'
# checkpoint name, so both refer to the same published weights/vocabulary.
pretrained_name = 'gpt2'
model = TFGPT2LMHeadModel.from_pretrained(pretrained_name)
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_name)

In [3]:
# If Local Interpreter
# Project checkout used when the notebook runs on the local Windows interpreter.
LOCAL_PROJECT_DIR = 'C:\\Users\\thory\\PycharmProjects\\chatopotamus'

# Only change directory when that folder exists. Elsewhere (e.g. on Colab) this
# cell becomes a no-op instead of raising FileNotFoundError, so the notebook
# can still be run top to bottom.
if os.path.isdir(LOCAL_PROJECT_DIR):
    os.chdir(LOCAL_PROJECT_DIR)
else:
    print('Local project directory not found; staying in', os.getcwd())

**Path Constants**

In [4]:
# Folder (relative to the project root) that holds the raw text files.
DATA_PATH = 'resources/gpt/data'

# One text file per intent; each path is built from the shared data folder.
ORDERS_PATH, ENQUIRY_PATH, COMPLAINS_PATH = (
    os.path.join(DATA_PATH, file_name)
    for file_name in ('orders.txt', 'enquiry.txt', 'complains.txt')
)

**Convert into Sequences**

In [5]:
# Read each text file together with the intent label passed for it. Every call
# is unpacked into a (sequences, intents) pair, processed in the order listed:
# orders, enquiries, complaints.
(
    (final_sequence_order, intent_order),
    (final_sequence_enquiry, intent_enquiry),
    (final_sequence_complain, intent_complain),
) = (
    gpt_trainer.file_to_sequences(source_path, intent=intent_label)
    for source_path, intent_label in (
        (ORDERS_PATH, 'order'),
        (ENQUIRY_PATH, 'enquiry'),
        (COMPLAINS_PATH, 'complain'),
    )
)

In [6]:
# Merge the per-intent results into one corpus. Both concatenations use the
# same complain -> enquiry -> order ordering, so the two collections stay
# index-aligned (assuming each file's sequences and intents are aligned).
final_sequences = (
    final_sequence_complain
    + final_sequence_enquiry
    + final_sequence_order
)
final_intents = (
    intent_complain
    + intent_enquiry
    + intent_order
)

In [7]:
# Spot-check one example: the text and the intent stored at the same index.
sample_index = 89
final_sequences[sample_index], final_intents[sample_index]

("customer: my hot chocolate was too hot to drink system: that's not good would you like us to remake it at a cooler temperature or something else? customer: a cooler remake would be great system: you got it! we'll make sure your hot chocolate is at a comfortable temperature anything else? customer: no thank you system: perfect we're on it your hot chocolate will be ready shortly thanks for letting us correct it! customer: the grilled cheese i ordered was too dry system: we apologize for that would you like a new one maybe with extra butter or a different sandwich?",
 'complain')

**Tokenize**

In [8]:
# Use the end-of-sequence token as the padding token; the padded call below
# needs a pad token to be set on the tokenizer.
tokenizer.pad_token = tokenizer.eos_token

# Encode the whole corpus in one call: sequences longer than 150 tokens are
# truncated, shorter ones are padded to a common length, and the result is
# returned as TensorFlow tensors.
tokenized_inputs = tokenizer(
    final_sequences,
    return_tensors="tf",
    padding=True,
    truncation=True,
    max_length=150,
)

In [9]:
# Inspect the tokenizer output: a mapping holding the 'input_ids' and
# 'attention_mask' tensors for the whole corpus.
tokenized_inputs

{'input_ids': <tf.Tensor: shape=(1083, 150), dtype=int32, numpy=
array([[23144,   263,    25, ..., 50256, 50256, 50256],
       [23144,   263,    25, ...,   318,   262,   691],
       [23144,   263,    25, ..., 50256, 50256, 50256],
       ...,
       [23144,   263,    25, ...,    30,  1080,    25],
       [23144,   263,    25, ...,   257,   467,     0],
       [23144,   263,    25, ..., 50256, 50256, 50256]])>, 'attention_mask': <tf.Tensor: shape=(1083, 150), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0]])>}

**Preprocessing**

In [10]:
# Build the dataset for the text-generation task from the tokenized corpus.
# The recorded output shows elements that are (inputs, targets) pairs of
# input_ids/attention_mask dicts with sequence length 140 (the tokenizer output
# is 150 wide). How the helper derives them lives in gpt_trainer — presumably
# shifted windows for next-token prediction, TODO confirm.
prepped_data_gen = gpt_trainer.preprocess_for_generation(tokenized_inputs)
prepped_data_gen

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 140), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 140), dtype=tf.int32, name=None)}, {'input_ids': TensorSpec(shape=(None, 140), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 140), dtype=tf.int32, name=None)})>

In [11]:
# Build the dataset for the intent-recognition task from the tokenized corpus
# and the intent labels. The recorded output shows float32 label tensors of
# width 3 — presumably a one-hot encoding over the three intents, TODO confirm
# in gpt_trainer.
prepped_data_intent = gpt_trainer.preprocessing_for_intent(tokenized_inputs, final_intents)
prepped_data_intent

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 150), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 150), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 3), dtype=tf.float32, name=None))>

**Train/Validation/Test Split**

In [13]:
# Split each prepared dataset 60/20/20 into [train, validation, test] and store
# the three pieces in the list that belongs to the same task.
gen_splits = []
intent_splits = []
for prepped_data, splits in zip(
    [prepped_data_intent, prepped_data_gen],
    # Must be listed in the same order as the datasets above; pairing them the
    # other way round puts the intent data into gen_splits and vice versa.
    [intent_splits, gen_splits],
):
    # Count the elements by iterating once. Judging by the leading None
    # dimension in the recorded element_spec these elements are batches, so the
    # sizes below are numbers of batches — TODO confirm in gpt_trainer.
    total_size = sum(1 for _ in prepped_data)

    train_size = int(0.6 * total_size)
    val_size = int(0.2 * total_size)
    # Whatever is left after the two rounded-down shares forms the test split.
    test_size = total_size - train_size - val_size

    # NOTE(review): take/skip only yields disjoint splits if the dataset
    # produces its elements in the same order on every iteration (i.e. it is
    # not reshuffled each epoch) — confirm in gpt_trainer.
    splits.extend([
        prepped_data.take(train_size),
        prepped_data.skip(train_size).take(val_size),
        prepped_data.skip(train_size + val_size),
    ])

**Saving/Loading**

In [None]:
# Persist the tokenizer output through the project's save helper.
gpt_trainer.save_file('resources/gpt/data/tokenized_inputs.pkl', tokenized_inputs)

# Persist both prepared datasets, each under its own path.
for dataset, save_path in (
    (prepped_data_gen, 'resources/gpt/prepped_data_gen'),
    (prepped_data_intent, 'resources/gpt/prepped_data_intent'),
):
    dataset.save(save_path)

In [None]:
# Restore the tokenizer output from disk.
# NOTE(review): this file is written with gpt_trainer.save_file but read back
# with joblib — confirm both sides use a compatible format. Only load files
# produced by this project; unpickling untrusted data can execute code.
tokenized_inputs_path = 'resources/gpt/data/tokenized_inputs.pkl'
with open(tokenized_inputs_path, 'rb') as pickle_file:
    tokenized_inputs = joblib.load(pickle_file)