This notebook, implemented by Mostafa El Katerji, is an experiment in applying transfer learning to intent detection. It was developed for the CSI5180 course on Virtual Assistants at uOttawa and builds on the previous work referenced below.

References:
- Multi-Domain Joint Semantic Frame Parsing using Bi-directional RNN-LSTM
https://www.microsoft.com/en-us/research/wp-content/uploads/2016/06/IS16_MultiJoint.pdf 
- ATIS Dataset
https://www.kaggle.com/hassanamin/atis-airlinetravelinformationsystem 
- An Evaluation Dataset for Intent Classification and Out-of-Scope Prediction
https://github.com/clinc/oos-eval 
- A Deep Multi-task Model for Dialogue Act Classification, Intent Detection and Slot Filling
https://link.springer.com/article/10.1007/s12559-020-09718-4 
- A survey of joint intent detection and slot-filling models in natural language understanding
https://arxiv.org/pdf/2101.08091.pdf 
- tf.keras.preprocessing.text.Tokenizer
https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer 
- sklearn.preprocessing.LabelEncoder
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html 
- Chatbots: Intent Recognition Dataset
https://www.kaggle.com/elvinagammed/chatbots-intent-recognition-dataset 


In [None]:
# Mount Google Drive in the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!mkdir results
!mkdir results/OOS
!mkdir results/Atis
!mkdir results/CB

In [None]:
import json
import nltk.corpus
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow.keras.models as models
import tensorflow.keras.layers as layers
import matplotlib.pyplot as plt
from keras.utils.vis_utils import plot_model
from keras.callbacks import EarlyStopping

# Download the NLTK stop-word corpus, then load the English stop-word list and
# create the Porter stemmer; both are used by clean_dataset_text().
nltk.download('stopwords')
all_english_stop_words = stopwords.words('english')
porter_stemmer = PorterStemmer()

# Path to the folder that contains the dataset files named "atis_intents.csv",
# "chatbots-intent-recognition-dataset.json", and "oos-eval-full.json".
# Dataset sources:
# https://www.kaggle.com/hassanamin/atis-airlinetravelinformationsystem/version/1
# https://www.kaggle.com/elvinagammed/chatbots-intent-recognition-dataset
# https://github.com/clinc/oos-eval
VA_PROJECT_DATA =  "./datasets"

# Path to the folder where the results are saved. It must already contain three
# sub-folders named "OOS", "Atis", and "CB".
VA_PROJECT_RES = "./results"

# Parse the datasets
class Dataset:
    """Holds the utterance texts and the intent labels of one parsed dataset.

    Both arguments are stored as given (no copy is made). The parsers in this
    notebook fill the two lists in lockstep, one label per text.
    """

    def __init__(self, intent_texts, intent_labels):
        self.intent_texts, self.intent_labels = intent_texts, intent_labels

def parse_oos_dataset():
    """Load "oos-eval-full.json" from VA_PROJECT_DATA into a Dataset.

    Every top-level key of the JSON document is walked; each entry is read as
    ``[text, label, ...]``. Entries whose label contains the substring 'oos'
    are dropped.

    Returns:
        Dataset with the kept texts and their labels, in file order.
    """
    with open(f'{VA_PROJECT_DATA}/oos-eval-full.json') as f:
        ds = json.load(f)

    texts, labels = [], []

    for split_entries in ds.values():
        for entry in split_entries:
            text = entry[0]
            label = entry[1]

            # skip the out-of-scope samples
            if 'oos' in label:
                continue

            texts.append(text)
            labels.append(label)

    return Dataset(texts, labels)

def parse_atis_dataset():
    """Load "atis_intents.csv" from VA_PROJECT_DATA into a Dataset.

    Each line is read as ``label,text``. Lines without a comma (e.g. the empty
    line at the end of the file) are skipped. Surrounding whitespace is
    stripped from both fields.

    Returns:
        Dataset with the texts and their labels, in file order.
    """
    intents = []
    intent_labels = []

    with open(f'{VA_PROJECT_DATA}/atis_intents.csv') as f:
        for line in f:
            # Split on the FIRST comma only: everything after it is the
            # utterance, so a comma inside the text no longer truncates it.
            label, separator, text = line.partition(",")
            if separator:
                intents.append(text.strip())
                intent_labels.append(label.strip())

    return Dataset(intents, intent_labels)

def parse_cb_dataset():
    """Load "chatbots-intent-recognition-dataset.json" from VA_PROJECT_DATA into a Dataset.

    The document's 'intents' list is walked; every item of an entry's 'text'
    collection becomes one sample labelled with that entry's 'intent'.

    Returns:
        Dataset with the texts and their labels, in file order.
    """
    with open(f'{VA_PROJECT_DATA}/chatbots-intent-recognition-dataset.json') as f:
        ds = json.load(f)

    texts, labels = [], []

    for entry in ds['intents']:
        label = entry['intent']
        samples = entry['text']
        texts.extend(samples)
        # one copy of the label per sample text
        labels.extend(label for _ in samples)

    return Dataset(texts, labels)

# clean the text
def clean_dataset_text(ds):
    """Normalise every text of `ds` in place.

    For each entry of ``ds.intent_texts``:
      1. every character that is neither a word character nor whitespace is
         removed;
      2. the text is split on single spaces, words found in
         `all_english_stop_words` are dropped (the comparison is done on the
         word exactly as written) and the rest are stemmed with
         `porter_stemmer`;
      3. the stems are joined with single spaces and the result is stripped.

    Args:
        ds: object with a mutable ``intent_texts`` list of strings.

    Returns:
        None. ``ds.intent_texts`` is overwritten element by element.
    """
    # Build the lookup once: set membership is O(1), whereas testing against
    # the stop-word list would scan it for every single word.
    stop_words = set(all_english_stop_words)

    for i, text in enumerate(ds.intent_texts):
        # remove punctuation
        curr_text = re.sub(r'[^\w\s]', '', text)

        # remove stop words and stem the text
        stems = [
            porter_stemmer.stem(word)
            for word in curr_text.split(" ")
            if word not in stop_words
        ]
        ds.intent_texts[i] = " ".join(stems).strip()

# tokenize the dataset
def create_tokenizer(texts):
    """Return a Keras `Tokenizer` fitted on `texts`.

    The tokenizer is created with an empty `filters` string and '<unk>' as its
    out-of-vocabulary token.
    """
    tokenizer_options = {'filters': '', 'oov_token': '<unk>'}
    fitted_tokenizer = tf.keras.preprocessing.text.Tokenizer(**tokenizer_options)

    # learn the vocabulary from all the texts
    fitted_tokenizer.fit_on_texts(texts)
    return fitted_tokenizer

def tokenize(tokenizer, texts):
    """Turn `texts` into padded integer sequences.

    `tokenizer.texts_to_sequences` converts the texts, then Keras
    `pad_sequences` pads them with ``padding='pre'``.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    return pad_sequences(sequences, padding='pre')

# categorize labels
def create_category_encoder(labels):
    """Return a scikit-learn `LabelEncoder` fitted on `labels`."""
    # LabelEncoder.fit returns the fitted encoder itself
    return LabelEncoder().fit(labels)

def categorize_labels(label_encoder, labels):
    """One-hot encode `labels` using a fitted label encoder.

    Args:
        label_encoder: fitted encoder providing `transform` and `classes_`.
        labels: labels to encode; each must be known to `label_encoder`.

    Returns:
        The result of `tf.keras.utils.to_categorical`, with one column per
        class known to `label_encoder`.
    """
    vec = label_encoder.transform(labels)
    # Size the one-hot vectors from the encoder instead of a hard-coded 194, so
    # the function keeps working when the set of labels changes.
    return tf.keras.utils.to_categorical(vec, num_classes=len(label_encoder.classes_))

# build the models
def build_baseline_model(word_count, number_of_intents, freeze_top = False, weights = None):
    """Build and compile the baseline classifier.

    Layer stack: Embedding(word_count, 356) -> Bidirectional(GRU(4)) ->
    two Dense(128, relu) layers, each followed by Dropout(0.5) ->
    Dense(number_of_intents, softmax).

    Args:
        word_count: first argument of the Embedding layer.
        number_of_intents: width of the softmax output layer.
        freeze_top: when true, every layer except the last is marked
            non-trainable and the last layer is removed and replaced by a
            newly created softmax layer of the same width.
        weights: optional weight list applied with `set_weights` right after
            construction, i.e. before any freezing or head replacement.

    Returns:
        The model, compiled with the 'adam' optimizer,
        'categorical_crossentropy' loss and the 'accuracy' metric.
    """
    gru_layer = layers.GRU(4)
    layer_stack = [
        layers.Embedding(word_count, 356),
        layers.Bidirectional(gru_layer),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(number_of_intents, activation='softmax'),
    ]
    model = models.Sequential(layer_stack)

    if weights is not None:
        model.set_weights(weights)

    if freeze_top:
        # freeze everything below the output layer
        for lower_layer in model.layers[:-1]:
            lower_layer.trainable = False
        gru_layer.trainable = False

        # swap the output layer for a freshly initialised one
        model.pop()
        model.add(layers.Dense(number_of_intents, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def build_final_model(word_count, number_of_intents, freeze_top = False, weights = None):
    """Build and compile the final (convolutional) classifier.

    Layer stack: Embedding(word_count, 356) -> BatchNormalization ->
    Conv1D(128, 5, relu) -> GlobalMaxPooling1D -> Flatten ->
    Dense(128, relu) -> Dropout(0.5) -> Dense(number_of_intents, softmax).

    Args:
        word_count: first argument of the Embedding layer.
        number_of_intents: width of the softmax output layer.
        freeze_top: when true, every layer except the last is marked
            non-trainable and the last layer is removed and replaced by a
            newly created softmax layer of the same width.
        weights: optional weight list applied with `set_weights` right after
            construction, i.e. before any freezing or head replacement.

    Returns:
        The model, compiled with the 'adam' optimizer,
        'categorical_crossentropy' loss and the 'accuracy' metric.
    """
    layer_stack = [
        layers.Embedding(word_count, 356),
        layers.BatchNormalization(),
        layers.Conv1D(128, 5, activation='relu'),
        layers.GlobalMaxPooling1D(),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(number_of_intents, activation='softmax'),
    ]
    model = models.Sequential(layer_stack)

    if weights is not None:
        model.set_weights(weights)

    if freeze_top:
        # freeze everything below the output layer
        for lower_layer in model.layers[:-1]:
            lower_layer.trainable = False

        # swap the output layer for a freshly initialised one
        model.pop()
        model.add(layers.Dense(number_of_intents, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# fit the model
def fit_model(name, sub_folder, model, input_sequences, categorized_labels, epochs, batch_size):
    """Train `model`, evaluate it on a held-out test split and save the results.

    20% of the samples are held out as a test set with `train_test_split`
    (no `random_state` is passed, so the split differs between runs). Keras
    then reserves a further 20% of the remaining training data for validation
    (`validation_split=0.2`).

    Files written under ``{VA_PROJECT_RES}/{sub_folder}/``:
      * ``{name}-history.json``  - per-epoch training and validation accuracy
      * ``{name}-evaluate.json`` - the value returned by `model.evaluate` on
        the test split
      * ``{name}-acc.png``       - training/validation accuracy curves

    Args:
        name: run name; printed, used in the file names and in the plot title.
        sub_folder: folder below VA_PROJECT_RES that receives the files.
        model: compiled Keras model; it is trained in place.
        input_sequences: model inputs.
        categorized_labels: one-hot targets aligned with `input_sequences`.
        epochs: maximum number of training epochs.
        batch_size: batch size for both training and evaluation.

    Returns:
        None.
    """
    print(name)
    X_train, X_test, y_train, y_test = train_test_split(input_sequences, categorized_labels, test_size=0.2)

    # NOTE(review): patience=300 is larger than every epoch budget used in this
    # notebook (60 and 250), so this callback never cuts training short.
    callback = EarlyStopping(monitor='val_accuracy', patience=300, restore_best_weights = True)

    history = model.fit(X_train, y_train, epochs=epochs, validation_split=0.2, batch_size=batch_size, callbacks=[callback])

    train_acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    training_metrics = {
        "accuracy": train_acc,
        "val_accuracy": val_acc
    }

    # These numbers come from the held-out TEST split, not the validation
    # split, so label them accordingly.
    results = model.evaluate(X_test, y_test, batch_size=batch_size)
    print("test loss, test acc:", results)

    out_prefix = '{}/{}/{}'.format(VA_PROJECT_RES, sub_folder, name)

    with open(f'{out_prefix}-history.json', 'w', encoding='utf-8') as f:
        json.dump(training_metrics, f, ensure_ascii=False, indent=4)
    with open(f'{out_prefix}-evaluate.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    plt.plot(train_acc)
    plt.plot(val_acc)
    # [3:] drops the first three characters of the title; the callers in this
    # notebook prefix every run name with a marker such as "1. " or "A. ".
    plt.title(f'{name} train:{round(train_acc[-1] * 100, 2)} val:{round(val_acc[-1] * 100, 2)} test:{round(results[1] * 100, 2)}'[3:])
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='lower left')
    plt.ylim(0,1)
    plt.savefig(f'{out_prefix}-acc.png')
    # clear the current figure so the next run starts from an empty plot
    plt.clf()


In [None]:
# parse the three datasets (OOS, chatbot, ATIS)
oos_dataset, cb_dataset, atis_dataset = parse_oos_dataset(), parse_cb_dataset(), parse_atis_dataset()

# clean the text of every dataset in place
for _dataset in (oos_dataset, cb_dataset, atis_dataset):
    clean_dataset_text(_dataset)

# concatenate texts and labels of all three datasets (same order in both lists)
combined_texts = oos_dataset.intent_texts + cb_dataset.intent_texts + atis_dataset.intent_texts
combined_labels_comb = oos_dataset.intent_labels + cb_dataset.intent_labels + atis_dataset.intent_labels

# one tokenizer and one category encoder shared by all datasets
tokenizer = create_tokenizer(combined_texts)
category_encoder = create_category_encoder(list(set(combined_labels_comb)))

# tokenize the text of each dataset separately (oos, cb, atis) ...
input_sequences_oos, input_sequences_cb, input_sequences_atis = (
    tokenize(tokenizer, _texts)
    for _texts in (oos_dataset.intent_texts, cb_dataset.intent_texts, atis_dataset.intent_texts)
)

# ... and of all three datasets combined
input_sequences_comb = tokenize(tokenizer, combined_texts)

# number of distinct intent labels across all three datasets
print(len(set(combined_labels_comb)))

# categorize the labels of each dataset separately (oos, cb, atis) ...
categorized_labels_oos, categorized_labels_cb, categorized_labels_atis = (
    categorize_labels(category_encoder, _labels)
    for _labels in (oos_dataset.intent_labels, cb_dataset.intent_labels, atis_dataset.intent_labels)
)

# ... and of all three datasets combined
categorized_labels_comb = categorize_labels(category_encoder, combined_labels_comb)

In [None]:
# Render an architecture diagram of each model to a PNG file.
_vocab_size = len(tokenizer.word_index) + 1
_class_count = categorized_labels_oos.shape[1]
_plot_options = dict(show_shapes=True, show_layer_names=True)

plot_model(build_baseline_model(_vocab_size, _class_count), to_file='baseline.png', **_plot_options)
# kept as the cell's last bare expression so the notebook still shows its value
plot_model(build_final_model(_vocab_size, _class_count), to_file='final.png', **_plot_options)

In [None]:
# Build the models.
# Vocabulary size passed to the Embedding layers (size of the tokenizer's
# word index plus one) and the number of columns in the one-hot labels.
nbr_of_words = len(tokenizer.word_index) + 1
nbr_of_cat = categorized_labels_comb.shape[1]

# Per-dataset experiment configuration. The keys ("OOS", "CB", "Atis") are also
# used as the result sub-folder names. For every dataset:
#   "is"        - padded input sequences
#   "cl"        - one-hot encoded labels
#   "modelBl"   - untrained baseline model
#   "modelFl"   - untrained final model
#   "epochs"    - number of training epochs
#   "batchSize" - batch size for training and evaluation
dataset_config = {
    "OOS": {
        "is": input_sequences_oos,
        "cl": categorized_labels_oos,
        "modelBl": build_baseline_model(nbr_of_words, nbr_of_cat),
        "modelFl": build_final_model(nbr_of_words, nbr_of_cat),
        "epochs": 60,
        "batchSize": 128,
    },
    "CB": {
        "is": input_sequences_cb,
        "cl": categorized_labels_cb,
        "modelBl": build_baseline_model(nbr_of_words, nbr_of_cat),
        "modelFl": build_final_model(nbr_of_words, nbr_of_cat),
        "epochs": 250,
        "batchSize": 16,
    },
    "Atis": {
        "is": input_sequences_atis,
        "cl": categorized_labels_atis,
        "modelBl": build_baseline_model(nbr_of_words, nbr_of_cat),
        "modelFl": build_final_model(nbr_of_words, nbr_of_cat),
        "epochs": 60,
        "batchSize": 128,
    }
}

# Run the experiments. For every source dataset `el`:
#   1/A - train the baseline and the final model from scratch on `el`;
#   2/B - for every other dataset, rebuild both models from the weights learned
#         on `el` with all layers but a new output layer frozen, and train;
#   3/C - rebuild both models unfrozen from the weights of step 2/B and train
#         again on that other dataset.
for el in dataset_config:
  model_bl = dataset_config[el]["modelBl"]
  model_fnl = dataset_config[el]["modelFl"]
  inp_seq = dataset_config[el]["is"]
  cl = dataset_config[el]["cl"]
  epochs = dataset_config[el]["epochs"]
  batch_size = dataset_config[el]["batchSize"]

  model_bl.summary()
  model_fnl.summary()

  # Step 1/A: train on the source dataset from scratch
  fit_model(f"1. {el} from scratch baseline", el, model_bl, inp_seq, cl, epochs, batch_size)
  fit_model(f"A. {el} from scratch final", el, model_fnl, inp_seq, cl, epochs, batch_size)

  # Weights learned on the source dataset; reused for every target dataset below
  old_weights_bl = model_bl.get_weights()
  old_weights_fnl = model_fnl.get_weights()

  # Every dataset except the source one is a transfer-learning target
  tl_datasets = list(dataset_config.keys())
  tl_datasets.remove(el)

  for other_ds in tl_datasets:
    ods_is = dataset_config[other_ds]["is"]
    ods_cl = dataset_config[other_ds]["cl"]
    ods_epochs = dataset_config[other_ds]["epochs"]
    ods_batch_size = dataset_config[other_ds]["batchSize"]

    # Step 2/B: transfer learning on frozen top weights
    ods_model_bl = build_baseline_model(nbr_of_words, nbr_of_cat, True, old_weights_bl)
    ods_model_fnl = build_final_model(nbr_of_words, nbr_of_cat, True, old_weights_fnl)

    fit_model(f"2. {other_ds} using {el} weights baseline", other_ds, ods_model_bl, ods_is, ods_cl, ods_epochs, ods_batch_size)
    fit_model(f"B. {other_ds} using {el} weights final", other_ds, ods_model_fnl, ods_is, ods_cl, ods_epochs, ods_batch_size)

    # Step 3/C: transfer learning unfrozen - new trainable models that start
    # from the weights of the frozen models trained in step 2/B
    ods_model_bl = build_baseline_model(nbr_of_words, nbr_of_cat, False, ods_model_bl.get_weights())
    ods_model_fnl = build_final_model(nbr_of_words, nbr_of_cat, False, ods_model_fnl.get_weights())

    fit_model(f"3. {other_ds} using {el} weights baseline", other_ds, ods_model_bl, ods_is, ods_cl, ods_epochs, ods_batch_size)
    fit_model(f"C. {other_ds} using {el} weights final", other_ds, ods_model_fnl, ods_is, ods_cl, ods_epochs, ods_batch_size)