## Install Library and Package

In [3]:
import pandas as pd
import sklearn
import numpy as np
import joblib
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load the dataset
file_path = '../data/Ticket Categories Description (Updated)(Updated Dataset).csv'
df = pd.read_csv(file_path, encoding='ISO-8859-1')
data = df[['Category', 'Sub-Category', 'Description']]
# Display the first few rows of the dataset
data.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tangminhanh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tangminhanh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/tangminhanh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,Category,Sub-Category,Description
0,Web/Application Security,High Severity Alert,
1,Web/Application Security,Low Severity Alert,
2,Web/Application Security,Vulnerability Assessment,Any tickets associated with the assessment and...
3,Web/Application Security,Vulnerability Remediation,Any tickets associated with concerns or findin...
4,Vendor Payment,Invoice Posting,Any ticket related to Posting of invoices tail...


In [44]:
!pip3 install transformers torch torchvision scikit-learn pandas filelock accelerate



In [4]:
categories = data['Category'].unique()
sub_categories = data['Sub-Category'].unique()

print(f"Categories: {categories}")
print(f"Sub-Categories: {sub_categories}")

Categories: ['Web/Application Security' 'Vendor Payment' 'Vendor Accreditation'
 'User Account' 'TV Advertisement' 'TREASURY' 'TPMO' 'Ticketing Support'
 'Telephony' 'Talent as a Service' 'System Security' 'Software'
 'Services Management' 'Server Projects' 'Server Management'
 'Project Quality Summary' 'Procurement' 'PRO Vendor Payment'
 'PRO Vendor Accreditation' 'PRO Purchase Order Processing'
 'PRO Delivery Confirmation' 'PRO Canvassing' 'Printer'
 'PQS - Vendor\xa0Assessment?' 'PQS - Project Estimate?'
 'PQS - Pre-Project Phase' 'PQS - Post Project Phase'
 'PQS - Ongoing Project Phase' 'PQS - Contractor Awarding?'
 'PQE - Site Inspection' 'PQE - Plan Review Request' 'Power'
 'PO Processing' 'PM Testfit request' 'PM Request' 'PM OSM Request'
 'PM Office Info Request' 'PM Design Request' 'PM Design Management'
 'PM Costing Request' 'PM Compliance Request' 'People Experience'
 'P&C Verification' 'P&C SSS | Pag-Ibig | Philhealth | BIR'
 'P&C Resignation/Offboarding' 'P&C Reports' 'P&C

In [5]:
category_subcategory = df.groupby('Category')['Sub-Category'].unique()
print(category_subcategory)

Category
ACCOUNTS PAYABLE              [Below 20k invoices (No PO), Client Cash advan...
Asset Management              [Accountability Form, Disposal, Gatepass Proce...
BCP Activation                                  [BCP Testing, Client, Internal]
BTS CLIENT MEETING REQUEST    [Client Presentation, Project Scoping / Client...
BTS DESIGN REQUEST            [Furniture / Joinery Details, 3d Modeling + Re...
                                                    ...                        
Ticketing Support             [Recurring Ticket Request/Removal, Report Gene...
User Account                  [Email Account Creation, Email Account Deletio...
Vendor Accreditation          [Vendor verification, Vendor sourcing, Vendor ...
Vendor Payment                              [Invoice Posting, Invoice Revision]
Web/Application Security      [High Severity Alert, Low Severity Alert, Vuln...
Name: Sub-Category, Length: 158, dtype: object


In [47]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

train_df['category_encoded'] = le.fit_transform(df['Category'])
train_df['sub-category_encoded'] = le.fit_transform(df['Sub-Category'])

print(train_df.head())

                   Category               Sub-Category  \
0  Web/Application Security        High Severity Alert   
1  Web/Application Security         Low Severity Alert   
2  Web/Application Security   Vulnerability Assessment   
3  Web/Application Security  Vulnerability Remediation   
4            Vendor Payment            Invoice Posting   

                                         Description  category_encoded  \
0                                                                  157   
1                                                                  157   
2  ticket associ assess identif vulner web applic...               157   
3  ticket associ concern find web applic requir r...               157   
4  ticket relat post invoic tailor zoom client re...               156   

   sub-category_encoded  
0                   304  
1                   370  
2                   788  
3                   789  
4                   345  


In [48]:
data = data.replace(np.nan,"")
train_df = data

In [49]:
stop = set(stopwords.words("english"))
def clean(text):
            text = text.lower()
            obj = re.compile(r"<.*?>")
            text = obj.sub(r" ", text)
            obj = re.compile(r"https://\S+|http://\S+")
            text = obj.sub(r" ", text)
            obj = re.compile(r"[^\w\s]")
            text = obj.sub(r" ", text)
            obj = re.compile(r"\d{1,}")
            text = obj.sub(r" ", text)
            obj = re.compile(r"_+")
            text = obj.sub(r" ", text)
            obj = re.compile(r"\s\w\s")
            text = obj.sub(r" ", text)
            obj = re.compile(r"\s{2,}")
            text = obj.sub(r" ", text)

            stemmer = SnowballStemmer("english")
            text = [stemmer.stem(word)
                    for word in text.split() if word not in stop]

            porter_stemmer = PorterStemmer()
            text = [porter_stemmer.stem(word) for word in text]
            wordnet_lemmatizer = WordNetLemmatizer()
            lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
            return " ".join(text)

In [50]:
train_df['Description'] = train_df['Description'].apply(clean)

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X = train_df['Description'].values
y_category = LabelEncoder().fit_transform(train_df['Category'].values)
# y_subcategory = LabelEncoder().fit_transform(train_df['Sub-Category'].values)

# Split the data into training and validation sets for both category and sub-category
X_train, X_val, y_train_category, y_val_category = train_test_split(X, y_category, test_size=0.2, random_state=42)
# _, _, y_train_subcategory, y_val_subcategory = train_test_split(X, y_subcategory, test_size=0.2, random_state=42)

In [52]:
# Load model directly
from transformers import AutoTokenizer, DistilBertForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(set(y_category)))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
# Tokenize the input (takes some time)
train_encodings = tokenizer(list(X_train), truncation=True, padding=True)
val_encodings = tokenizer(list(X_val), truncation=True, padding=True)

In [54]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [55]:
# Convert to PyTorch DataLoaders
import torch
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(train_encodings, y_train_category)
val_dataset = Dataset(val_encodings, y_val_category)

In [56]:
train_dataset[100]

{'input_ids': tensor([  101,  3154, 18856,  8649, 27885,  3367,  6820,  6593,  2624,  6590,
          3089,  2240,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor(62)}

In [68]:
def compute_metrics(p):
  pred, labels = p.predictions, p.label_ids
  pred = np.argmax(pred, axis = 1)

  accuracy = accuracy_score(y_true=labels, y_pred=pred)
  recall = recall_score(y_true = labels, y_pred=pred, average='macro')
  precision = precision_score(y_true = labels, y_pred = pred, average='macro')
  f1 = f1_score(y_true = labels, y_pred = pred, average='macro')
  return {"accuracy": accuracy, "recall": recall, "precision": precision, "f1": f1}


In [69]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [70]:
# Fine-tune the model
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='output',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    # per_device_eval_batch_size=8,   # batch size for evaluation
    #warmup_steps=500,                # number of warmup steps for learning rate scheduler
    #weight_decay=0.01,               # strength of weight decay
    #logging_dir='./logs',            # directory for storing logs
    #logging_steps=10,
)


In [71]:
trainer = Trainer(
    model=model,                         # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset ,            # evaluation dataset
    compute_metrics=compute_metrics,
)


In [72]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=100, training_loss=4.517292785644531, metrics={'train_runtime': 403.8076, 'train_samples_per_second': 1.981, 'train_steps_per_second': 0.248, 'total_flos': 14736747844800.0, 'train_loss': 4.517292785644531, 'epoch': 1.0})

**Evaluate Fine-Tune Model**

In [73]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 4.381199359893799, 'eval_accuracy': 0.115, 'eval_recall': 0.011235955056179775, 'eval_precision': 0.0013051866984451254, 'eval_f1': 0.002338705577304388, 'eval_runtime': 37.4199, 'eval_samples_per_second': 5.345, 'eval_steps_per_second': 0.668, 'epoch': 1.0}


  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
text = "Any ticket related to Accreditation process for vendors exclusive to Zoom clients."
input = tokenizer(text, padding = True, truncation=True, return_tensors='pt')
output = model(**input)
print(output)
pred = torch.nn.functional.softmax(output.logits, dim = 1)
print(pred)

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.3735,  0.0765, -0.0069, -0.6914, -0.6249,  0.4454, -1.0513, -1.1338,
         -0.6104, -0.9787, -0.0850,  0.1991, -0.5781, -0.0338, -0.3416, -0.3112,
         -0.3927, -1.3495, -0.9651, -0.5681, -1.1739, -0.9182,  0.0308, -0.7326,
         -0.4356, -0.4897, -1.2099, -0.8220, -0.3908, -0.2166, -0.9548, -0.7899,
         -0.2954, -0.9739, -0.6031, -0.4792,  1.0946, -0.9363, -0.4206, -0.6582,
         -0.3878, -0.6317, -0.3592, -0.4276, -0.8160, -0.5542, -0.5718, -0.6678,
         -1.1690, -0.5443,  0.0130, -0.8932,  2.2457, -0.1250, -0.5028,  0.3295,
         -0.0701, -0.8655,  0.1351, -0.8063, -0.8013,  0.0771, -0.4234, -0.2246,
         -1.3062, -1.1546, -1.1298, -0.8776, -0.5027, -0.6403, -0.0757, -0.9083,
         -0.7505, -0.4007, -0.5943, -0.1385, -0.0684,  0.4667,  0.1464,  0.3692,
         -0.9351, -0.5094, -0.9115, -0.7061, -0.4841, -0.9007,  0.0589,  0.1452,
         -0.4910, -0.4956,  0.4417,  0.0540,  0.5255, -0.6376, -0.

In [None]:
text = "I'm happy I can finally train a model for multi-label classification"

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

In [None]:
from transformers import pipeline

def predict_category(text):
    # Preprocess the text
    text = clean(text)

    # Load the fine-tuned model
    model = DistilBertForSequenceClassification.from_pretrained("./my_model")

    # Create a pipeline
    classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)

    # Make a prediction
    result = classifier(text)

    # Convert the prediction to the corresponding category
    predicted_label = result[0]['label']
    predicted_score = result[0]['score']

    return predicted_label, predicted_score

# Test the function
text = "Your test input here"
print(predict_category(text))
