### Training DistilBERT for Intent Recognition

**Imports and Initial Setup**

In [None]:
# If Colab
import os

import pandas as pd

os.chdir('/content/drive/Othercomputers/AKATSUKI-PC/PycharmProjects/chatopotamus')

In [3]:
from transformers import DistilBertTokenizer, TFDistilBertModel, DistilBertConfig
import tensorflow as tf
from tensorflow import keras
import os
import joblib
import numpy as np
from all_legacy_code.src.preprocess import text_prep
from all_legacy_code.src import models

In [4]:
# If Local Interpreter
os.chdir('C:\\Users\\thory\\PycharmProjects\\chatopotamus')

**Path Constants**

In [19]:
DATA_PATH = 'resources/gpt/data'
ORDERS_PATH = os.path.join(DATA_PATH, 'orders.txt')
ENQUIRY_PATH = os.path.join(DATA_PATH, 'enquiry.txt')
COMPLAINS_PATH = os.path.join(DATA_PATH, 'complains.txt')

**Convert into Sequences**

In [20]:
final_sequence_order, intent_order  = text_prep.file_to_sequences(ORDERS_PATH, intent='order', length=2)
final_sequence_enquiry, intent_enquiry = text_prep.file_to_sequences(ENQUIRY_PATH, intent='enquiry', length=2)
final_sequence_complain, intent_complain = text_prep.file_to_sequences(COMPLAINS_PATH, intent='complain', length=2)

In [21]:
final_sequences = final_sequence_complain + final_sequence_enquiry + final_sequence_order
final_intents = intent_complain+intent_enquiry+intent_order

In [62]:
final_sequences[2002], final_intents[2002]

("customer: what kinds of teas are available? system: we offer a range of teas including black green herbal and flavored varieties any particular flavor you're interested in?",
 'enquiry')

**Tokenize**

In [5]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [24]:
tokenized_inputs = tokenizer(final_sequences, max_length=40, truncation=True, padding='max_length', return_tensors="tf")

In [25]:
tokenized_inputs

{'input_ids': <tf.Tensor: shape=(4320, 40), dtype=int32, numpy=
array([[  101,  8013,  1024, ...,     0,     0,     0],
       [  101,  8013,  1024, ...,  2005,  2017,   102],
       [  101,  8013,  1024, ...,     0,     0,     0],
       ...,
       [  101,  8013,  1024, ..., 14744,  1000,   102],
       [  101,  8013,  1024, ...,  1037, 25751,   102],
       [  101,  8013,  1024, ...,  3424,  6895,   102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(4320, 40), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int32)>}

**Preprocessing**

In [26]:
prepped_data = text_prep.preprocess_for_intent(tokenized_inputs, final_intents)
prepped_data

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 40), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 40), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 3), dtype=tf.float32, name=None))>

In [27]:
for i in prepped_data.take(1):
    print(i)

({'input_ids': <tf.Tensor: shape=(16, 40), dtype=int32, numpy=
array([[  101,  8013,  1024,  2748,  1045,  1005,  1040,  2066,  1037,
         7967, 16510,  2123,  4904,  1998,  1037,  2235,  2413, 21161,
         2007,  2019,  4469,  2915,  2291,  1024,  6581,  9804,   999,
         1037,  7967, 16510,  2123,  4904,  1998,  1037,  2235,  2413,
        21161,  2007,  2019,   102],
       [  101,  8013,  1024,  3398,  1037,  2312,  2137,  2080,  4469,
         2915,  2291,  1024,  2312,  2137,  2080,  2007,  2019,  4469,
        24689,  7959,  3170,  8595,  2008,  1005,  2222,  2022,  1026,
         3976,  1028,  2129,  2052,  2017,  2066,  2000,  3477,  1029,
          102,     0,     0,     0],
       [  101,  8013,  1024,  2024,  2045,  5699,  1011,  2489,  7047,
         2005,  2123, 16446,  1029,  2291,  1024,  2747,  2057,  2123,
         1005,  1056,  2031,  5699,  1011,  2489,  2123, 16446,  2021,
         2057,  2031,  2060,  5699,  1011,  2489, 19782,  7047,  4699,
         19

**Train-Test Split**

In [28]:
total_size = 0
for _ in prepped_data:
    total_size += 1

In [29]:
total_size

270

In [30]:
train_set = prepped_data.take(200)

val_set = prepped_data.skip(200).take(30)

test_set = prepped_data.skip(230)

**Modeling and Training**

**1. Intent Classification**

In [None]:
final_model = models.create_intent_classifier(compile=False)

In [32]:
final_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 40)]                 0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 40)]                 0         []                            
 )                                                                                                
                                                                                                  
 tf_distil_bert_model_3 (TF  TFBaseModelOutput(last_hid   6636288   ['input_ids[0][0]',           
 DistilBertModel)            den_state=(None, 40, 768),   0          'attention_mask[0][0]']      
                              hidden_states=None, atten                                     

In [None]:
root_log_dir = os.path.join(os.curdir, 'tb_logs', 'ir_training')
def get_run_logdir():
    import time
    run_id = time.strftime('run_%H-%M')
    run_logdir = os.path.join(root_log_dir, run_id)
    return run_logdir

In [34]:
run_logdir = get_run_logdir()
tb_callback = keras.callbacks.TensorBoard(log_dir=run_logdir)
es_callback = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

In [35]:
max_lr = 0.0001  # Peak learning rate
num_epochs = 50
batch_size = 16
num_samples = 270

# Initialize the scheduler
onecycle_lr_scheduler = text_prep.OneCycleLRSchedule(max_lr, total_steps=num_epochs * (num_samples // batch_size))

In [36]:
history = final_model.fit(train_set, epochs=100, validation_data=val_set, callbacks=[es_callback, onecycle_lr_scheduler], use_multiprocessing=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [43]:
final_weights = final_model.get_weights()

**Evaluation**

In [36]:
loss, accuracy, precision, recall, f1_score = final_model.evaluate(test_set)
print(f'Loss : {loss}\nAccuracy : {accuracy}\nPrecision : {precision}\nRecall : {recall}\nF1 Score : {f1_score}')

Loss : 0.07061126083135605
Accuracy : 0.9828125238418579
Precision : 0.9843505620956421
Recall : 0.9828125238418579
F1 Score : 0.983580470085144


In [11]:
print(np.argmax(final_model.predict(
    text_prep.preprocess_for_intent('get me a medium dark roast', intents=None, tokenizer=tokenizer, train=False))))

0


**Saving/Loading**

In [65]:
# model.save_weights('resources/bert/ir_init_model.h5')
final_model.save_weights('resources/bert/ir2_final_weights.h5')

In [23]:
text_prep.save_file('resources/gpt/data/tokenized_inputs.pkl', tokenized_inputs)
prepped_data.save('resources/gpt/prepped_data_gen')

In [8]:
with open('resources/gpt/data/tokenized_inputs.pkl', 'rb') as f:
    tokenized_inputs = joblib.load(f)

In [5]:
final_model.load_weights('resources/bert/ir_final_weights.h5')

In [46]:
joblib.dump(final_weights, 'intent_weights.pkl')

['intent_weights.pkl']

In [7]:
with open('resources/bert/saved/intent_tokenizer.pkl', 'wb') as f:
    joblib.dump(tokenizer, f)