In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt  # we only need pyplot
sns.set()  # set the default Seaborn style for graphics


In [8]:
import os 
from transformers import AutoTokenizer

def train_model(tokenized_train_dataset, tokenized_test_dataset, label2id, id2label, model_path='microsoft/deberta-v3-small'):
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Load the model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
        problem_type="multi_label_classification"
    )

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Define metrics computation
    def compute_metrics(eval_pred):
        labels = eval_pred.label_ids
        predictions = eval_pred.predictions.argmax(-1)
        accuracy = accuracy_score(labels, predictions)
        f1 = f1_score(labels, predictions, average='micro')
        precision = precision_score(labels, predictions, average='micro')
        recall = recall_score(labels, predictions, average='micro')
        return {"accuracy": accuracy, "f1": f1, "precision": precision, "recall": recall}

    # Training arguments with reduced verbosity
    training_args = TrainingArguments(
        output_dir="model",
        learning_rate=2e-5,
        per_device_train_batch_size=3,
        per_device_eval_batch_size=3,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        logging_dir='logs',  # Reduced verbosity in logging
        logging_strategy='steps',  # Reduced verbosity in logging
    )

    # Initialize the optimizer
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # Trainer setup
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_test_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        optimizers=(optimizer, None)
    )

    # Train and save the model
    trainer.train()
    trainer.save_model()

    return model


In [4]:
def preprocess_function(train_dataset, test_dataset, department2id):
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    def process_dataset(dataset, department2id):
        texts = dataset["Description"]
        departments = [
            col for col in dataset.column_names if col != 'Description']
        labels_list = []

        for example in dataset:
            labels = [0. for _ in range(len(departments))]
            for department in departments:
                if example[department] == 1:
                    label_id = department2id[department]
                    labels[label_id] = 1.
            labels_list.append(labels)

        encoded_texts = tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=512
        )

        encoded_dict = {key: val for key, val in encoded_texts.items()}
        encoded_dict['labels'] = labels_list

        return Dataset.from_dict(encoded_dict)

    tokenized_train_dataset = process_dataset(train_dataset, department2id)
    tokenized_test_dataset = process_dataset(test_dataset, department2id)

    return tokenized_train_dataset, tokenized_test_dataset


In [6]:
from transformers import AutoTokenizer
import os
import json
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
from torch.optim import AdamW
from transformers.optimization import AdamW
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
import logging
from typing import Dict, Tuple
from datasets import Dataset
from datasets import load_from_disk

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import max_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split


def split_data(df: pd.DataFrame, parameters: Dict):
    train_df, test_df = train_test_split(
        df, test_size=parameters["test_size"], random_state=parameters["random_state"])
    return train_df, test_df

def split_subcategory_data(encoded_dir: str, parameters: Dict) -> Dict[str, pd.DataFrame]:
    # Load parameters
    test_size = parameters["test_size"]
    random_state = parameters["random_state"]

    # Create a dictionary to store the partitioned data
    partitioned_data = {}

    # Iterate through each department's data in the encoded_dir
    for department in parameters["departments"]:
        input_filepath = os.path.join(encoded_dir, f"{department}.csv")
        df = pd.read_csv(input_filepath)

        # Split the data
        train_df, test_df = train_test_split(
            df, test_size=test_size, random_state=random_state)

        # Store the train and test data in the partitioned_data dictionary
        partitioned_data[f"{department}_train"] = train_df
        partitioned_data[f"{department}_test"] = test_df

    return partitioned_data


def dataframe_to_dataset(train, test):
    train_dataset = Dataset.from_pandas(train)
    test_dataset = Dataset.from_pandas(test)
    return train_dataset, test_dataset


def department_label_encoding(train_dataset) -> Tuple[Dict[str, int], Dict[int, str]]:
    output_dir = "data/04_feature/department_label_encoded_dir"
    os.makedirs(output_dir, exist_ok=True)
    labels = [col for col in train_dataset.column_names if col != 'Description']

    label2id = {label: idx for idx, label in enumerate(labels)}
    id2label = {idx: label for label, idx in label2id.items()}

    return label2id, id2label


2024-06-21 08:14:11.431338: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-21 08:14:11.471226: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
file = '../data/02_intermediate/category_encoded_dir/Technology Services.csv'
df = pd.read_csv(file, encoding='ISO-8859-1', skipinitialspace=True)
df.head()


Unnamed: 0,Description,Biometric,Document,Asset Management,Data Security Incident,Off Boarding,CCTV,Network,Network and Infrastructure Design,Circuit,...,New Hire Shadowing,Web/Application Security,On Boarding,BCP Activation,Ticketing Support,Email Security,TPMO,Food Allowance,Procurement,Compliance
0,request reactiv user s biometr access,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,request upload access anoth biometr devic,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,request re assign badg card differ user,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,request badg card it alreadi present need,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,request disabl biometr access,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
train_data = df


In [15]:
train_df, test_df = train_test_split(train_data)

In [16]:
train_dataset, test_dataset = dataframe_to_dataset(train_df, test_df)

In [17]:
label2id, id2label = department_label_encoding(train_dataset)

In [18]:
label2id, id2label

({'Biometric': 0,
  'Document': 1,
  'Asset Management': 2,
  'Data Security Incident': 3,
  'Off Boarding': 4,
  'CCTV': 5,
  'Network': 6,
  'Network and Infrastructure Design': 7,
  'Circuit': 8,
  'Outage': 9,
  'Network Security': 10,
  'Notification': 11,
  'System Security': 12,
  'Server Management': 13,
  'Server Projects': 14,
  'Services Management': 15,
  'User Account': 16,
  'Checklist': 17,
  'Deployment / Movement': 18,
  'CASA': 19,
  'Email': 20,
  'General Assistance': 21,
  'Hardware': 22,
  'Printer': 23,
  'Software': 24,
  'Telephony': 25,
  'TV Advertisement': 26,
  'Development': 27,
  'New Hire Shadowing': 28,
  'Web/Application Security': 29,
  'On Boarding': 30,
  'BCP Activation': 31,
  'Ticketing Support': 32,
  'Email Security': 33,
  'TPMO': 34,
  'Food Allowance': 35,
  'Procurement': 36,
  'Compliance': 37,
  '__index_level_0__': 38},
 {0: 'Biometric',
  1: 'Document',
  2: 'Asset Management',
  3: 'Data Security Incident',
  4: 'Off Boarding',
  5: 'C

In [20]:
model_path = 'microsoft/deberta-v3-small'
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)


In [21]:
tokenized_train_dataset, tokenized_test_dataset = preprocess_function(train_dataset,test_dataset,label2id)

In [22]:
tokenized_train_dataset


Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 159
})

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    model_path, num_labels=len(id2label),
    id2label=id2label, label2id=label2id,
    problem_type="multi_label_classification")


In [None]:
training_args = TrainingArguments(

    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)


: 

In [29]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = sigmoid(predictions)
    predictions = (predictions > 0.5).astype(int)

    # Compute metrics
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='micro')
    precision = precision_score(labels, predictions, average='micro')
    recall = recall_score(labels, predictions, average='micro')

    return {"accuracy": accuracy, "f1": f1, "precision": precision, "recall": recall}


In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [31]:
trainer.train()


RestException: INVALID_PARAMETER_VALUE: Response: {'Error': {'Code': 'ValidationError', 'Severity': None, 'Message': 'No more than 500 characters per params Value. Request contains 2 of greater length.', 'MessageFormat': None, 'MessageParameters': None, 'ReferenceCode': None, 'DetailsUri': None, 'Target': None, 'Details': [], 'InnerError': None, 'DebugInfo': None, 'AdditionalInfo': None}, 'Correlation': {'operation': '246941b1fdbaeab4cba38efa17aae71b', 'request': 'af2ed1b63ff8baca'}, 'Environment': 'southeastasia', 'Location': 'southeastasia', 'Time': '2024-06-21T08:31:41.9784407+00:00', 'ComponentName': 'mlflow', 'statusCode': 400, 'error_code': 'INVALID_PARAMETER_VALUE'}

In [32]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments


def train_model(tokenized_train_dataset, tokenized_test_dataset, label2id, id2label, model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Truncate sequences to a maximum length of 500 tokens
    def truncate_sequences(batch):
        max_length = 500
        batch['input_ids'] = [seq[:max_length] for seq in batch['input_ids']]
        batch['attention_mask'] = [mask[:max_length]
                                   for mask in batch['attention_mask']]
        if 'token_type_ids' in batch:
            batch['token_type_ids'] = [type_ids[:max_length]
                                       for type_ids in batch['token_type_ids']]
        return batch

    tokenized_train_dataset = tokenized_train_dataset.map(
        truncate_sequences, batched=True)
    tokenized_test_dataset = tokenized_test_dataset.map(
        truncate_sequences, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        num_labels=len(label2id),
        label2id=label2id,
        id2label=id2label
    )

    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir='./logs'
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_test_dataset
    )

    trainer.train()
    trainer.save_model()

    return model


In [34]:
train_model(tokenized_train_dataset, tokenized_test_dataset,
            label2id, id2label, model_path="microsoft/deberta-v3-small")


Map:   0%|          | 0/159 [00:00<?, ? examples/s]

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RestException: INVALID_PARAMETER_VALUE: Response: {'Error': {'Code': 'ValidationError', 'Severity': None, 'Message': 'No more than 500 characters per params Value. Request contains 2 of greater length.', 'MessageFormat': None, 'MessageParameters': None, 'ReferenceCode': None, 'DetailsUri': None, 'Target': None, 'Details': [], 'InnerError': None, 'DebugInfo': None, 'AdditionalInfo': None}, 'Correlation': {'operation': '8d7b7ae85271da20d2d8083609efb7fa', 'request': 'a049252e302d74cc'}, 'Environment': 'southeastasia', 'Location': 'southeastasia', 'Time': '2024-06-21T08:36:13.6327411+00:00', 'ComponentName': 'mlflow', 'statusCode': 400, 'error_code': 'INVALID_PARAMETER_VALUE'}