#### This notebook fine-tunes a DistilBERT model for IT service ticket classification
##### The pretrained model (`distilbert-base-uncased`) comes from Hugging Face and the dataset comes from Kaggle (https://www.kaggle.com/datasets/adisongoh/it-service-ticket-classification-dataset?resource=download). The dataset has 8 classes (Access, Administrative rights, HR Support, Hardware, Internal Project, Miscellaneous, Purchase, Storage), stored in the `Topic_group` column, and one feature column, `Document`, which holds the IT ticket description. See the EDA notebook for more details about the data.

##### **Final results:**
|   Metric  |   Value  |
|:---------:|:--------:|
|     f1    |  0.85765 |
|  roc_auc  | 0.918657 |
|  accuracy |  0.85765 |
| precision |  0.85765 |
|   recall  |  0.85765 |

In [None]:
import pandas as pd
import mlflow
from transformers import AutoTokenizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from datasets import Dataset
import numpy as np

In [None]:
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer
# and the sequence-classification model, so the two always match.
BERT_MODEL = "distilbert/distilbert-base-uncased"

In [None]:
# Load the ticket dataset. The cells below read its "Document" column as the
# input text and its "Topic_group" column as the class label.
data = pd.read_csv("../data/all_tickets_processed_improved_v3.csv")

##### Holdout

In [None]:
# Hold out 10% of the tickets for testing; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified by "Topic_group" -- consider
# passing `stratify=` if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(data["Document"], data["Topic_group"], test_size=0.1, random_state=42)

##### Label encoding

In [None]:
# Learn the class -> one-hot column mapping from the training labels only,
# then apply that same mapping to both splits so their columns line up.
label_binarizer = LabelBinarizer().fit(y_train)
y_train_encoded, y_test_encoded = (
    label_binarizer.transform(labels) for labels in (y_train, y_test)
)

##### Train and test tokenization

In [None]:
# Tokenizer matching the BERT_MODEL checkpoint; used by `preprocess_function`,
# the padding collator, the Trainer and the logged MLflow model.
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)

In [None]:
def preprocess_function(data):
    """Tokenize a batch of ticket texts with the module-level ``tokenizer``.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        data: Batch mapping with a ``"text"`` entry holding the ticket texts.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per text and left unpadded.
    """
    # Only truncate here. Padding is deliberately left to the
    # DataCollatorWithPadding handed to the Trainer, which pads each training
    # batch to its own longest sequence. Padding inside a batched `map` would
    # instead pad every row to the longest text of the whole map batch and
    # waste compute on pad tokens.
    return tokenizer(data["text"], truncation=True, max_length=512)

In [None]:

def _build_dataset(texts, one_hot_labels):
    """Pair ticket texts with their one-hot labels in a `datasets.Dataset`.

    Labels are stored as lists of float32 values.
    NOTE(review): with float label vectors the Hugging Face classification
    head presumably trains with a multi-label (BCE) loss rather than
    cross-entropy -- confirm this is intended for a single-label task.
    """
    frame = pd.DataFrame(
        {
            "text": texts,
            "label": one_hot_labels.astype(np.float32).tolist(),
        }
    )
    return Dataset.from_pandas(frame)


# Despite the `_df` suffix these are `datasets.Dataset` objects.
train_df = _build_dataset(X_train, y_train_encoded)
test_df = _build_dataset(X_test, y_test_encoded)

# Tokenize both splits in batches.
tokenized_train_data = train_df.map(preprocess_function, batched=True)
tokenized_test_data = test_df.map(preprocess_function, batched=True)

# Collator that pads each batch at training/evaluation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

##### Load BERT model

In [None]:
# Load the pretrained checkpoint with a classification head sized to the
# number of distinct classes seen in the training labels.
model = AutoModelForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=y_train.nunique())

##### Training and evaluation

In [None]:
def compute_metrics(true_label, pred_label):
    """Compute micro-averaged classification metrics.

    Args:
        true_label: Ground-truth labels (the notebook passes one-hot rows).
        pred_label: Predicted labels in the same layout as ``true_label``.

    Returns:
        Dict with the keys ``"f1"``, ``"roc_auc"``, ``"accuracy"``,
        ``"precision"`` and ``"recall"``.

    Note:
        ``roc_auc`` is computed from ``pred_label`` exactly as given, so when
        hard 0/1 predictions are passed it reflects those decisions rather
        than a ranking of class scores.
    """
    micro = {"average": "micro"}
    return {
        "f1": f1_score(true_label, pred_label, **micro),
        "roc_auc": roc_auc_score(true_label, pred_label, **micro),
        "accuracy": accuracy_score(true_label, pred_label),
        "precision": precision_score(true_label, pred_label, **micro),
        "recall": recall_score(true_label, pred_label, **micro),
    }

def from_probabilities_to_binarizer(probs):
    """Convert one row of class scores into a one-hot list for the top class.

    Args:
        probs: 1-D sequence (list or NumPy array) of per-class scores. Only
            the position of the maximum matters, so probabilities and raw
            scores give the same result.

    Returns:
        A list of ints with the same length as ``probs``: 1 at the index of
        the highest score and 0 everywhere else. An empty input yields ``[]``.
    """
    n_classes = len(probs)
    # Explicit length check: truthiness is ambiguous for NumPy arrays.
    if n_classes == 0:
        return []

    # Mark only the argmax position. Comparing every score against the maximum
    # would set several 1s when scores tie, which is not a valid one-hot row;
    # np.argmax resolves ties by picking the first maximum.
    one_hot = [0] * n_classes
    one_hot[int(np.argmax(probs))] = 1
    return one_hot
        

In [None]:
# Hyperparameters for this fine-tuning run; they are also recorded in MLflow.
learning_rate = 2e-5
num_train_epochs = 15
weight_decay = 0.01

# Everything inside the run context is tracked as a single MLflow run.
with mlflow.start_run():
    mlflow.log_params(
        {
            "learning_rate": learning_rate,
            "num_train_epochs": num_train_epochs,
            "weight_decay": weight_decay,
        }
    )

    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training finishes.
    training_args = TrainingArguments(
        output_dir="./ticket_classification",
        learning_rate=learning_rate,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    )

    # NOTE(review): the hold-out test split is also the per-epoch evaluation
    # set, so best-checkpoint selection and the final metrics use the same
    # data. Consider a separate validation split.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_data,
        eval_dataset=tokenized_test_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    trainer.train()

    # Score the hold-out split and turn each row of class scores into a
    # one-hot prediction comparable with `y_test_encoded`.
    preds = trainer.predict(tokenized_test_data)
    y_pred_probs = list(map(list, preds.predictions))
    y_pred = np.array(
        [from_probabilities_to_binarizer(row) for row in y_pred_probs]
    )

    metrics = compute_metrics(y_test_encoded, y_pred)
    print(metrics)
    mlflow.log_metrics(metrics)

    # Log the fine-tuned model together with its tokenizer.
    components = {"model": model, "tokenizer": tokenizer}
    mlflow.transformers.log_model(
        transformers_model=components,
        artifact_path="model",
    )

