In [None]:
#imports
from datasets import Dataset
import pandas as pd
import tensorflow as tf
from transformers import create_optimizer, AutoTokenizer, DataCollatorWithPadding, TFAutoModelForSequenceClassification
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
import numpy as np
import evaluate
from sklearn.model_selection import train_test_split

In [None]:
## Read the raw intent-classification examples from disk
df = pd.read_csv('intent_classification_task_data.csv')

## Pull the two columns of interest out as plain Python lists
task_list = list(df["text"])
label_list = list(df["label"])

## Bundle both lists into one column-oriented dict used by the following cells
data = {"label": label_list, "text": task_list}

In [None]:
## Label vocabulary: integer class id -> intent name
id2label = dict(enumerate(["Event", "Reminder", "Todo"]))
## Inverse lookup, derived from id2label so both maps always agree
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
## Encode every string label as its integer class id
labels = [label2id[name] for name in data["label"]]

In [None]:
## Hold out 20% of the examples for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    labels,
    test_size=0.2,
    random_state=42,
)

## Materialise each side of the split as plain lists
train_text, train_label = list(X_train), list(y_train)
test_text, test_label = list(X_test), list(y_test)

## Column-oriented dicts, one per split
train_dict = {"label": train_label, "text": train_text}
test_dict = {"label": test_label, "text": test_text}

## Wrap both splits as Hugging Face Datasets
train_data = Dataset.from_dict(train_dict)
test_data = Dataset.from_dict(test_dict)


In [None]:
## DistilBERT tokenizer used to preprocess the text column.
## The checkpoint name is the same one the classification model is loaded from
## further down, so tokenizer and model share a vocabulary.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [None]:
## Tokenization step, written so it can be mapped over a whole dataset
def preprocess_function(data):
    """Tokenize the ``"text"`` entry of ``data`` with the module-level ``tokenizer``.

    Only truncation is requested here; padding is left to the data collator
    that is built in a later cell.

    Args:
        data: mapping with a ``"text"`` entry (a single example or a batch).

    Returns:
        Whatever ``tokenizer`` returns for the given text.
    """
    texts = data["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
## Tokenize both splits; batched=True hands preprocess_function many rows per call
tokenized_train_data, tokenized_test_data = (
    split.map(preprocess_function, batched=True)
    for split in (train_data, test_data)
)

In [None]:
## Dynamic padding: padding is applied when batches are collated, not at
## tokenization time (the tokenization step above only truncates).
## return_tensors="tf" asks the collator for TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [None]:
## Load the "accuracy" metric from the `evaluate` library; compute_metrics calls it
accuracy = evaluate.load("accuracy")

In [None]:
## Metric hook: turns raw model outputs into an accuracy score
def compute_metrics(eval_preds):
    """Compute accuracy for a ``(predictions, labels)`` pair.

    The arg-max of ``predictions`` along axis 1 is taken as the predicted
    class id and compared against ``labels``.

    Args:
        eval_preds: two-item sequence ``(predictions, labels)``.

    Returns:
        The result of ``accuracy.compute`` for the predicted ids and labels.
    """
    scores, references = eval_preds
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
## Training hyper-parameters
batch_size = 16
num_epochs = 5

## Length of the learning-rate schedule, in optimizer steps
batches_per_epoch = len(train_data) // batch_size
total_train_steps = batches_per_epoch * num_epochs

## Optimizer together with its learning-rate schedule (no warm-up steps)
# try 3e-5
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [None]:
## DistilBERT with a sequence-classification head.
## num_labels is taken from the label map instead of being hard-coded, so the
## size of the classification head always matches id2label / label2id.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)


In [None]:
## Convert the tokenized splits into tf.data datasets.
## batch_size comes from the hyper-parameter cell rather than a repeated literal:
## the learning-rate schedule length is computed from that same variable, so the
## two must not be allowed to diverge.
tf_train_set = model.prepare_tf_dataset(
    tokenized_train_data,
    shuffle=True,  # reshuffle the training examples
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_test_set = model.prepare_tf_dataset(
    tokenized_test_data,
    shuffle=False,  # keep evaluation order fixed
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [None]:
## Compile with the optimizer created above. No `loss` argument is passed on purpose.
model.compile(optimizer=optimizer) # Transformers models fall back to a default, task-relevant loss function


In [None]:
## Keras callback that feeds predictions on tf_test_set to compute_metrics
## (per the Transformers docs this runs at the end of each epoch).
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)

In [None]:
## Keras callback that saves the model and tokenizer and pushes them to the Hugging Face Hub.
## NOTE(review): pushing needs Hub credentials; no login step is visible in this
## notebook, so confirm it is handled elsewhere.
push_to_hub_callback = PushToHubCallback(
    output_dir="assistant",  # local output directory; presumably also the Hub repo name, TODO confirm
    tokenizer=tokenizer,  # pass the tokenizer so it is handled together with the model
)



In [None]:
## Callbacks handed to model.fit: metric computation first, then the Hub push
callbacks = [metric_callback, push_to_hub_callback]

In [None]:
## Fine-tune the model.
## epochs must equal num_epochs: the learning-rate schedule was built for
## batches_per_epoch * num_epochs steps, so training for a different number of
## epochs leaves the decay schedule out of step with the actual run.
model.fit(
    x=tf_train_set,
    validation_data=tf_test_set,
    epochs=num_epochs,
    callbacks=callbacks,
)