### Install Python Libraries

In [55]:
!pip install kaggle
!pip install tensorflow
!pip install transformers
!pip install evaluate
!pip install scikit-learn
!pip install datasets
!pip install seaborn
!pip install matplotlib
!pip install typing
!pip install pandas



### Apply Kaggle Configuration

### Import Libraries

In [58]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import math


import kaggle
import tensorflow as tf
from datasets import Dataset, concatenate_datasets
from typing import Union, Optional

from transformers import AutoTokenizer, AdamWeightDecay, BlenderbotTokenizer, TFBlenderbotForConditionalGeneration, DataCollatorForLanguageModeling,DataCollatorForSeq2Seq,DataCollatorWithPadding, create_optimizer, TFAutoModelForCausalLM,TFAutoModelForSeq2SeqLM, pipeline
# Used for model metric call when applying the metrics during training and evaluation
from transformers.keras_callbacks import KerasMetricCallback
from tensorflow.keras.optimizers import Adam, Adagrad, SGD

#from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy

import evaluate

# Import sklearn metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

### Import Dataset
- huggingface ubuntu dialog corpus
- https://huggingface.co/datasets/ubuntu_dialogs_corpus
- authenticate with Kaggle
- download the files from Kaggle
- add them to dataset module

In [59]:
kaggle.api.authenticate()



In [61]:
# Define the directory where the files are located
directory = "./Ubuntu-dialogue-corpus"

In [62]:
# Define the file names and corresponding dataframe names
files_dataframes = {
    "dialogueText_301.csv": "train_ubuntu_dialogue_df",
    "dialogueText_196.csv": "validation_ubuntu_dialogue_df",
    "dialogueText.csv": "test_ubuntu_dialogue_df"
}

In [63]:
# Iterate through the files in the directory
for filename in os.listdir(directory):
    # If the file is one of the files we're interested in
    if filename in files_dataframes:
        # Construct the full file path
        file_path = os.path.join(directory, filename)
        # Load the file into a dataframe and assign it to a variable with the corresponding name
        globals()[files_dataframes[filename]] = pd.read_csv(file_path)

In [64]:
train_ubuntu_dialogue_df.head()

Unnamed: 0,folder,dialogueID,date,from,to,text
0,301,1.tsv,2004-11-23T11:49:00.000Z,stuNNed,,any ideas why java plugin takes so long to load?
1,301,1.tsv,2004-11-23T11:49:00.000Z,crimsun,stuNNed,java 1.4?
2,301,1.tsv,2004-11-23T11:49:00.000Z,stuNNed,crimsun,yes
3,301,1.tsv,2004-11-23T11:49:00.000Z,crimsun,stuNNed,java 1.5 loads _much_ faster
4,301,1.tsv,2004-11-23T11:50:00.000Z,stuNNed,crimsun,noneus: how can i get 1.5 is there a .deb some...


In [65]:
validation_ubuntu_dialogue_df.head()

Unnamed: 0,folder,dialogueID,date,from,to,text
0,301,1.tsv,2004-11-23T11:49:00.000Z,stuNNed,,any ideas why java plugin takes so long to load?
1,301,1.tsv,2004-11-23T11:49:00.000Z,crimsun,stuNNed,java 1.4?
2,301,1.tsv,2004-11-23T11:49:00.000Z,stuNNed,crimsun,yes
3,301,1.tsv,2004-11-23T11:49:00.000Z,crimsun,stuNNed,java 1.5 loads _much_ faster
4,301,1.tsv,2004-11-23T11:50:00.000Z,stuNNed,crimsun,noneus: how can i get 1.5 is there a .deb some...


In [66]:
# Review the downloaded dataset
test_ubuntu_dialogue_df.head()

Unnamed: 0,folder,dialogueID,date,from,to,text
0,3,126125.tsv,2008-04-23T14:55:00.000Z,bad_image,,"Hello folks, please help me a bit with the fol..."
1,3,126125.tsv,2008-04-23T14:56:00.000Z,bad_image,,Did I choose a bad channel? I ask because you ...
2,3,126125.tsv,2008-04-23T14:57:00.000Z,lordleemo,bad_image,the second sentence is better english and we...
3,3,64545.tsv,2009-08-01T06:22:00.000Z,mechtech,,Sock Puppe?t
4,3,64545.tsv,2009-08-01T06:22:00.000Z,mechtech,,WTF?


### Exploratory Data Analysis
- get dataset info
- get dataset description

In [67]:
train_ubuntu_dialogue_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16587830 entries, 0 to 16587829
Data columns (total 6 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   folder      int64 
 1   dialogueID  object
 2   date        object
 3   from        object
 4   to          object
 5   text        object
dtypes: int64(1), object(5)
memory usage: 759.3+ MB


In [68]:
validation_ubuntu_dialogue_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9212877 entries, 0 to 9212876
Data columns (total 6 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   folder      int64 
 1   dialogueID  object
 2   date        object
 3   from        object
 4   to          object
 5   text        object
dtypes: int64(1), object(5)
memory usage: 421.7+ MB


In [69]:
test_ubuntu_dialogue_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1038324 entries, 0 to 1038323
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   folder      1038324 non-null  int64 
 1   dialogueID  1038324 non-null  object
 2   date        1038324 non-null  object
 3   from        1038311 non-null  object
 4   to          566035 non-null   object
 5   text        1038237 non-null  object
dtypes: int64(1), object(5)
memory usage: 47.5+ MB


In [70]:
train_ubuntu_dialogue_df['text'].describe()

count     16586581
unique    12520971
top            yes
freq         83685
Name: text, dtype: object

In [71]:
validation_ubuntu_dialogue_df['text'].describe()

count     9212063
unique    7232452
top           yes
freq        47000
Name: text, dtype: object

In [72]:
test_ubuntu_dialogue_df['text'].describe()

count     1038237
unique     863907
top        thanks
freq         6256
Name: text, dtype: object

### Preprocess the datasets
- convert the dataframes to datasets
- initialize the tokenizer by selecting a pretrained model
- initialize the model by selecting a pretrained model
- initialize the datacollator
- apply tokenizer to datacollator
- tokenize the dataset with the tokenizer function
- build a tokenized dataset
- build function to split concatenated sequences into shorter chunks defined by block size and use that to build the label

In [73]:
train_ubuntu_dialogue_ds = Dataset.from_pandas(train_ubuntu_dialogue_df[0:10])
validation_ubuntu_dialogue_ds = Dataset.from_pandas(validation_ubuntu_dialogue_df[0:10])
test_ubuntu_dialogue_ds = Dataset.from_pandas(test_ubuntu_dialogue_df[0:10])

In [74]:
train_ubuntu_dialogue_ds

Dataset({
    features: ['folder', 'dialogueID', 'date', 'from', 'to', 'text'],
    num_rows: 10
})

In [75]:
tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill') #AutoTokenizer.from_pretrained('facebook/blenderbot-400M-distill') #,truncation=True,padding="longest", max_length=512, return_overflowing_tokens=True,return_offsets_mapping=True) #,truncation=True

Downloading (…)olve/main/vocab.json: 100%|██████████| 127k/127k [00:00<00:00, 3.60MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 62.9k/62.9k [00:00<00:00, 75.5MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 1.15k/1.15k [00:00<00:00, 865kB/s]
Downloading (…)in/added_tokens.json: 100%|██████████| 16.0/16.0 [00:00<00:00, 57.1kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 772/772 [00:00<00:00, 5.11MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.57k/1.57k [00:00<00:00, 4.03MB/s]


In [76]:
tokenizer.pad_token = tokenizer.eos_token

In [77]:
# used for causal inference
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, return_tensors='tf')

In [78]:
tokenizer

BlenderbotTokenizer(name_or_path='facebook/blenderbot-400M-distill', vocab_size=8008, model_max_length=128, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '</s>', 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)

In [79]:
def preprocess_function(examples):
    """
    This function preprocesses the input examples for the model training. It tokenizes the input text, removes examples where input_ids is None, and creates decoder_input_ids and labels.

    Args:
    examples (dict): A dictionary containing the input text.

    Returns:
    dict: A dictionary containing the tokenized input text, decoder_input_ids, and labels.
    """
    # Tokenize the examples
    tokenized_examples = tokenizer([" ".join(x) for x in examples['text']], truncation=True, padding="longest", max_length=512)

    # Remove examples where input_ids is None
    tokenized_examples = {k: v for k, v in tokenized_examples.items() if v is not None}

    # Create decoder_input_ids and labels
    tokenized_examples["decoder_input_ids"] = [[tokenizer.bos_token_id] + ids + [tokenizer.eos_token_id] for ids in tokenized_examples["input_ids"]]
    tokenized_examples["labels"] = [[tokenizer.bos_token_id] + ids + [tokenizer.eos_token_id] for ids in tokenized_examples["input_ids"]]

    return tokenized_examples


In [80]:
tokenized_train_ubuntu_dialogue_ds = train_ubuntu_dialogue_ds.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=train_ubuntu_dialogue_ds.column_names,
)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

                                                           

In [81]:
tokenized_validation_ubuntu_dialogue_ds = validation_ubuntu_dialogue_ds.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=validation_ubuntu_dialogue_ds.column_names,
)

                                                  

In [82]:
tokenized_test_ubuntu_dialogue_ds = test_ubuntu_dialogue_ds.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=test_ubuntu_dialogue_ds.column_names,
)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

                                                  

In [83]:
tokenized_train_ubuntu_dialogue_ds

Dataset({
    features: ['input_ids', 'attention_mask', 'decoder_input_ids', 'labels'],
    num_rows: 10
})

### Model Build
- build the hyperparameters
- build and compute metrics for model evaluation
- build the pretrained model
- build the tensorflow prepared dataset (includes tensorflow tensor conversion, batch setting for model runs, shuffling the data, adding the data collator for collation)
- build metric callbacks
- apply them to the model
- compile the model

In [85]:
model = TFBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')

Downloading tf_model.h5: 100%|██████████| 1.46G/1.46G [00:28<00:00, 51.8MB/s]
All model checkpoint layers were used when initializing TFBlenderbotForConditionalGeneration.

Some layers of TFBlenderbotForConditionalGeneration were not initialized from the model checkpoint at facebook/blenderbot-400M-distill and are newly initialized: ['final_logits_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading (…)neration_config.json: 100%|██████████| 347/347 [00:00<00:00, 4.58MB/s]


In [86]:
# Preparing TensorFlow dataset for training:
# The `prepare_tf_dataset` method is called on the model object to convert the tokenized training dataset into a TensorFlow dataset.
# The dataset is shuffled to ensure randomness, which is beneficial for training the model.
# Batch size is set to 1, meaning that the dataset will be divided into batches, each containing 1 example.
# The `collate_fn` function is passed to handle the batching and padding of sequences.
tf_train_set = model.prepare_tf_dataset(
    tokenized_train_ubuntu_dialogue_ds,
    shuffle=True,
    batch_size=1,
    collate_fn=data_collator,
)

# Preparing TensorFlow dataset for validation:
# Similar to the training set preparation, but the dataset is not shuffled in this case.
# This dataset will be used for validating the model's performance during training.
tf_validation_set = model.prepare_tf_dataset(
    tokenized_validation_ubuntu_dialogue_ds,
    shuffle=False,
    batch_size=1,
    collate_fn=data_collator,
)

# Preparing TensorFlow dataset for testing:
# Similar to the validation set preparation, this dataset will be used for evaluating the model's performance after training.
tf_test_set = model.prepare_tf_dataset(
    tokenized_test_ubuntu_dialogue_ds,
    shuffle=False,
    batch_size=1,
    collate_fn=data_collator,
)


In [89]:
# Define a list of optimizers
optimizers = ['adam','sgd', 'adagrad']

In [90]:
# Parameters for capturing the best optimizer used in the model
best_eval_loss = float('inf')
best_model = None
best_optimizer = None

### Model Compile, Train and Evaluate
- Iterate over the optimizers to determine the best one to be used
- compile the model
- train the model on the train and validation datasets
- evaluate the model based on loss leader of optimizer

In [93]:
# Iterate over the optimizers
for optimizer in optimizers:
    if optimizer == 'adam':
        opt = tf.keras.optimizers.Adam(learning_rate=2e-5, weight_decay=0.01) # decay = weight_decay
    elif optimizer == 'adafactor':
        opt = tf.keras.optimizers.Adafactor(learning_rate=2e-5, weight_decay=0.01)
    elif optimizer == 'sgd':
        opt = tf.keras.optimizers.SGD(learning_rate=2e-5, weight_decay=0.01)
    elif optimizer == 'adagrad':
        opt = tf.keras.optimizers.Adagrad(learning_rate=2e-5, weight_decay=0.01)

    model.compile(optimizer=opt) # can also add metrics here

    # Train the model
    model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3) # callbacks=callbacks

    # Evaluate the model
    eval_loss = model.evaluate(tf_validation_set)  # callbacks=callbacks

    # If this model has a lower evaluation loss than the current best, update the best model and optimizer
    if eval_loss < best_eval_loss:
        best_eval_loss = eval_loss
        best_model = model
        best_optimizer = optimizer



Epoch 1/3


2023-10-15 23:05:50.762160: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/3
Epoch 3/3




Epoch 1/3
Epoch 2/3
Epoch 3/3




Epoch 1/3
Epoch 2/3
Epoch 3/3


In [94]:
print('Best Model to use based on the optimizer: ', best_model)
print('Best Optimizer to use based loss leader comparison: ', best_optimizer)

Best Model to use based on the optimizer:  <transformers.models.blenderbot.modeling_tf_blenderbot.TFBlenderbotForConditionalGeneration object at 0x5990eb0d0>
Best Optimizer to use based loss leader comparison:  adagrad


### Model Evaluate
- evaluate the model's performance using the model evaluate and model predict

In [95]:
eval_results = best_model.evaluate(tf_validation_set)
print(f'Perplexity: {math.exp(eval_results):.2f}')

Perplexity: 1.04


In [134]:
acc = evaluate.load('accuracy')
f1_score = evaluate.load('f1')
precis = evaluate.load('precision')
recalll = evaluate.load('recall')

In [97]:
# Round the predictions to turn them into "0" or "1" labels
test_preds = model.predict(tf_validation_set)



In [116]:
predicted_ids = tf.argmax(test_preds.logits, axis=-1)
predicted_ids_tensor = tf.constant(predicted_ids, dtype=tf.int64)
references = tf.constant(tokenized_validation_ubuntu_dialogue_ds['labels'], dtype=tf.int64)

predicted_ids_numpy = predicted_ids_tensor.numpy().flatten()
references_numpy = references.numpy().flatten()

In [113]:
accuracy_score = acc.compute(predictions=predicted_ids_numpy, references=references_numpy)

print('Accuracy Score: ', accuracy_score)

Accuracy Score:  {'accuracy': 0.9907692307692307}


In [135]:
# Calculating precision score:
# The `compute` method is called on the precision object (`precis`) to calculate the precision score.
# Predictions and references are converted to numpy arrays and flattened before being passed to the `compute` method.
# The 'weighted' average is used to calculate a single precision score that takes class imbalance into account.
precision_score = precis.compute(predictions=predicted_ids_tensor.numpy().flatten(), references=references.numpy().flatten(), average='weighted')

# Calculating recall score:
# Similarly, the `compute` method is called on the recall object (`recalll`) to calculate the recall score.
# The 'weighted' average is used to calculate a single recall score that takes class imbalance into account.
recall_score = recalll.compute(predictions=predicted_ids_tensor.numpy().flatten(), references=references.numpy().flatten(), average='weighted')

# Calculating F1 score:
# The `compute` method is called on the F1 score object (`f1_score`) to calculate the F1 score.
# The 'weighted' average is used to calculate a single F1 score that takes class imbalance into account.
f1_score = f1_score.compute(predictions=predicted_ids_tensor.numpy().flatten(), references=references.numpy().flatten(), average='weighted')

# Printing the calculated scores:
print('Precision Score: ', precision_score)
print('Recall Score: ', recall_score)
print('F1 Score: ', f1_score)



precision Score:  {'precision': 0.9865765004226542}
recall Score:  {'recall': 0.9907692307692307}
F1 Score:  {'f1': 0.9878009561060409}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Model Save

In [136]:
# Define the directory where the files are located
save_model_directory = "./bestModel"

In [137]:
### Save the weights of the model
best_model.save_pretrained(save_model_directory)

### Perform Inference
- recall the saved model
- check the model's architecture
- test the model to confirm it can perform causal inference calculations properly

In [5]:
from transformers import BlenderbotTokenizer, TFBlenderbotForConditionalGeneration
import tensorflow as tf

# Load the model and tokenizer from the directory where you saved your model
save_model_directory = "./bestModel"
tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
model= TFBlenderbotForConditionalGeneration.from_pretrained(save_model_directory)

def get_response(user_input):
    # Tokenize the user input
    inputs = tokenizer(user_input, return_tensors='tf').input_ids
    # Generate a response
    outputs = model.generate(input_ids=inputs, max_new_tokens=100, do_sample=True, top_k=30, top_p=0.85)
    # Decode the response
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return response


def print_welcome_message():
    print("========================================")
    print("   Welcome to the Chatbot Interface!   ")
    print(" Type 'quit', 'exit', or 'bye' to exit ")
    print("========================================")

print_welcome_message()

# Interactive loop
while True:
    user_input = input("You: ")
    if user_input.lower() in ['quit', 'exit', 'bye']:
        print("Bot: Bye! Have a great day.")
        break
    print(f"You: {user_input}")  
    response = get_response(user_input)
    print(f"Bot: {response}")




All model checkpoint layers were used when initializing TFBlenderbotForConditionalGeneration.

All the layers of TFBlenderbotForConditionalGeneration were initialized from the model checkpoint at ./bestModel.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBlenderbotForConditionalGeneration for predictions without further training.


   Welcome to the Chatbot Interface!   
 Type 'quit', 'exit', or 'bye' to exit 
You: Hello
Bot:  Hi, how are you doing today? I'm just relaxing before work, how about you?
You: where do you work
Bot:  I work at a grocery store. It's not the best but it pays the bills.
You: Well thats good. Straight to the point, I need help with my linux system
Bot:  I know how you feel.  I need to get a new one as well.
You: 
Bot:  I'm not sure what you mean by that, but I hope you have a great day!
You: Can you help me troubleshoot an issue
Bot:  Sure, what is the issue? I can help you if you want to talk about it.
Bot: Bye! Have a great day.
