In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt  # we only need pyplot
sns.set()  # set the default Seaborn style for graphics

In [2]:
file = '../data/AI Auto Ticket Categorization (Original)(Reference).csv'
df = pd.read_csv(file, encoding='ISO-8859-1', skipinitialspace=True)
df.head()

Unnamed: 0,Department,Tech Group,Sub-Category,Priority,Description,assistant (For Getlogs only)
0,Technology Services,Access Control and Security,Access Reactivation,P3 - Planned,Request to reactivate a users biometric access.,
1,Technology Services,Access Control and Security,Access Upload,P3 - Planned,Request to upload an access to another biometr...,
2,Technology Services,Access Control and Security,Badge Card Re-assignment,P3 - Planned,Request to re-assign a badge card to a differe...,
3,Technology Services,Access Control and Security,Badge Card Request,P3 - Planned,Request for badge card (ITS is already present...,
4,Technology Services,Access Control and Security,Bio Disable,P3 - Planned,Request to disable a biometric access.,


In [3]:
df = df[['Description', 'Department', 'Tech Group']]
df

Unnamed: 0,Description,Department,Tech Group
0,Request to reactivate a users biometric access.,Technology Services,Access Control and Security
1,Request to upload an access to another biometr...,Technology Services,Access Control and Security
2,Request to re-assign a badge card to a differe...,Technology Services,Access Control and Security
3,Request for badge card (ITS is already present...,Technology Services,Access Control and Security
4,Request to disable a biometric access.,Technology Services,Access Control and Security
...,...,...,...
926,Any tickets related to variation order quotati...,Operations Team,Service Delivery Asset Management
927,Any tickets related to variation order quotati...,Operations Team,Service Delivery Asset Management
928,Any tickets related to review of additive-dedu...,Operations Team,Service Delivery Asset Management
929,Any tickets related to inpuuting analysis for ...,Operations Team,Service Delivery Asset Management


In [4]:
# Create a dictionary of dataframes, one for each department
dfs = {dept: df[df['Department'] == dept]
       for dept in df['Department'].unique()}

# Now dfs is a dictionary where the key is the department name and the value is the corresponding dataframe

# To access the dataframe for a specific department, you can do:
# df_dept = dfs['your_department_name']

In [6]:
df_0 = dfs['Technology Services']
df_0

Unnamed: 0,Description,Department,Tech Group
0,Request to reactivate a users biometric access.,Technology Services,Access Control and Security
1,Request to upload an access to another biometr...,Technology Services,Access Control and Security
2,Request to re-assign a badge card to a differe...,Technology Services,Access Control and Security
3,Request for badge card (ITS is already present...,Technology Services,Access Control and Security
4,Request to disable a biometric access.,Technology Services,Access Control and Security
...,...,...,...
823,Ticket for IP phone-related issues.,Technology Services,Voice and Email
824,"Ticket for telephony configuration assistance,...",Technology Services,Voice and Email
825,Any voice-related activity/inquiry that are no...,Technology Services,Voice and Email
826,Ticket for 3CX softphone issues.,Technology Services,Voice and Email


In [10]:
techgroups_0 = df_0['Tech Group'].unique()
techgroups_0

array(['Access Control and Security', 'Asset Management', 'CCTV Admin',
       'Network Design Team', 'NOC', 'NOC Systems', 'On-Site Support',
       'Product Development', 'Service Desk', 'SOC',
       'Technical Project Management', 'TS Managers', 'TS Procurement',
       'Voice and Email'], dtype=object)

In [9]:
descriptions = df_0['Description']

In [11]:
rows = []
for description in descriptions:
    row = {'Description': description}
    for techgroup_0 in techgroups_0:
        row[techgroup_0] = 1 if techgroup_0 in df_0[df_0['Description']==description]['Tech Group'].values else 0
    rows.append(row)

In [12]:
new_df_0 = pd.DataFrame(rows, columns=['Description']+list(techgroups_0))
new_df_0

Unnamed: 0,Description,Access Control and Security,Asset Management,CCTV Admin,Network Design Team,NOC,NOC Systems,On-Site Support,Product Development,Service Desk,SOC,Technical Project Management,TS Managers,TS Procurement,Voice and Email
0,Request to reactivate a users biometric access.,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Request to upload an access to another biometr...,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Request to re-assign a badge card to a differe...,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Request for badge card (ITS is already present...,1,0,0,0,0,0,1,0,0,0,0,0,0,0
4,Request to disable a biometric access.,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354,Ticket for IP phone-related issues.,0,0,0,0,0,0,1,0,0,0,0,0,0,1
355,"Ticket for telephony configuration assistance,...",0,0,0,0,0,0,0,0,0,0,0,0,0,1
356,Any voice-related activity/inquiry that are no...,0,0,0,0,0,0,0,0,0,0,0,0,0,1
357,Ticket for 3CX softphone issues.,0,0,0,0,0,0,1,0,0,0,0,0,0,1


In [13]:
train_data = new_df_0

In [15]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tangminhanh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tangminhanh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/tangminhanh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [20]:
# Remove Stopwords
stop_words = set(stopwords.words('english'))

# function to remove stopwords


def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)

# Clean Text


def clean_text(text):
    text = text.lower()
    text = re.sub("[^a-zA-Z]", " ", text)
    text = ' '.join(text.split())
    return text


# stemming
stemmer = SnowballStemmer("english")


def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence


train_data['Text'] = train_data['Description'].apply(
    lambda x: remove_stopwords(x))
train_data['Text'] = train_data['Description'].apply(lambda x: clean_text(x))
train_data['Text'] = train_data['Description'].apply(stemming)


In [21]:
train_data['Description'] = train_data['Text']
train_data.drop('Text', inplace=True, axis=1)

In [22]:
train_data

Unnamed: 0,Description,Access Control and Security,Asset Management,CCTV Admin,Network Design Team,NOC,NOC Systems,On-Site Support,Product Development,Service Desk,SOC,Technical Project Management,TS Managers,TS Procurement,Voice and Email
0,request to reactiv a user biometr access.,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,request to upload an access to anoth biometr d...,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,request to re-assign a badg card to a differ u...,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,request for badg card (it is alreadi present o...,1,0,0,0,0,0,1,0,0,0,0,0,0,0
4,request to disabl a biometr access.,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354,ticket for ip phone-rel issues.,0,0,0,0,0,0,1,0,0,0,0,0,0,1
355,"ticket for telephoni configur assistance, requ...",0,0,0,0,0,0,0,0,0,0,0,0,0,1
356,ani voice-rel activity/inquiri that are not on...,0,0,0,0,0,0,0,0,0,0,0,0,0,1
357,ticket for 3cx softphon issues.,0,0,0,0,0,0,1,0,0,0,0,0,0,1


In [23]:
!pip3 install datasets transformers evaluate sentencepiece accelerate



In [24]:
!pip install datasets



In [25]:
from sklearn.model_selection import train_test_split
train_dataset, test_dataset = train_test_split(
    train_data, test_size=0.2, random_state=42)

In [26]:
from datasets import Dataset
train_dataset = Dataset.from_pandas(train_dataset)
test_dataset = Dataset.from_pandas(test_dataset)

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
train_dataset

Dataset({
    features: ['Description', 'Access Control and Security', 'Asset Management', 'CCTV Admin', 'Network Design Team', 'NOC', 'NOC Systems', 'On-Site Support', 'Product Development', 'Service Desk', 'SOC', 'Technical Project Management', 'TS Managers', 'TS Procurement', 'Voice and Email', '__index_level_0__'],
    num_rows: 287
})

In [28]:
techgroups_0 = [
    col for col in train_dataset.column_names if col != 'Description']


In [29]:
techgroups_0 = techgroups_0[:-1]
print(techgroups_0)

['Access Control and Security', 'Asset Management', 'CCTV Admin', 'Network Design Team', 'NOC', 'NOC Systems', 'On-Site Support', 'Product Development', 'Service Desk', 'SOC', 'Technical Project Management', 'TS Managers', 'TS Procurement', 'Voice and Email']


In [30]:
techgroup_02id = {techgroup_0: id for id, techgroup_0 in enumerate(techgroups_0)}
id2techgroup_0 = {id:techgroup_0 for techgroup_0, id in techgroup_02id.items()}

In [31]:
techgroup_02id,id2techgroup_0

({'Access Control and Security': 0,
  'Asset Management': 1,
  'CCTV Admin': 2,
  'Network Design Team': 3,
  'NOC': 4,
  'NOC Systems': 5,
  'On-Site Support': 6,
  'Product Development': 7,
  'Service Desk': 8,
  'SOC': 9,
  'Technical Project Management': 10,
  'TS Managers': 11,
  'TS Procurement': 12,
  'Voice and Email': 13},
 {0: 'Access Control and Security',
  1: 'Asset Management',
  2: 'CCTV Admin',
  3: 'Network Design Team',
  4: 'NOC',
  5: 'NOC Systems',
  6: 'On-Site Support',
  7: 'Product Development',
  8: 'Service Desk',
  9: 'SOC',
  10: 'Technical Project Management',
  11: 'TS Managers',
  12: 'TS Procurement',
  13: 'Voice and Email'})

In [32]:
!pip install protobuf



In [40]:
from transformers import AutoTokenizer
model_path = 'microsoft/deberta-v3-small'
tokenizer = AutoTokenizer.from_pretrained(
    model_path)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [41]:
def preprocess_function(example):
  text = example["Description"]
  labels = [0. for _ in range(len(techgroups_0))]
  for techgroup_0 in techgroups_0:
      if example[techgroup_0] == 1:
          label_id = techgroup_02id[techgroup_0]
          labels[label_id] = 1.

  example = tokenizer(text, truncation=True,
                      padding='max_length', max_length=512)
  example['labels'] = labels
  return example


In [42]:
tokenized_train_dataset = train_dataset.map(preprocess_function)

Map: 100%|██████████| 287/287 [00:00<00:00, 307.70 examples/s]


In [44]:
tokenized_test_dataset = test_dataset.map(preprocess_function)

Map: 100%|██████████| 72/72 [00:00<00:00, 199.83 examples/s]


In [45]:
from sklearn.model_selection import train_test_split
train_dataset, test_dataset = train_test_split(
    train_data, test_size=0.2, random_state=42)