### some tools and framework 
- Python: the primary program language
- spaCy: Using to preprocesing data
- scikit-learn: to evalued the model
- Pytorch: Using to building deep learning model


In [120]:
import torch
import re
import torch.nn as nn
import torch.optim as optim
import spacy
import sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from langdetect import detect


In [121]:
nlp = spacy.load("en_core_web_sm")

In [122]:
df = pd.read_excel("dataset/Dataset_ticket.xlsx")


In [123]:
df.head()

Unnamed: 0,Category,Subcategory,Text
0,Inquiry/Help,Antivirus,Ho bisogno di assistenza per installare il mio...
1,Inquiry/Help,Antivirus,Il mio antivirus non si aggiorna correttamente...
2,Inquiry/Help,Antivirus,Ho ricevuto una notifica di minaccia dal mio a...
3,Inquiry/Help,Antivirus,Vorrei configurare la scansione automatica del...
4,Inquiry/Help,Antivirus,Il mio antivirus continua a segnalare falsi po...


In [124]:
# lọc chỉ giữ ngôn ngữ tiếng anh
langs = df['Text'].map(detect)
df = df[langs == 'en']
df.head()


Unnamed: 0,Category,Subcategory,Text
50,Inquiry/Help,Antivirus,Need assistance installing antivirus software ...
51,Inquiry/Help,Antivirus,Looking for recommendations for the best antiv...
52,Inquiry/Help,Antivirus,"Experiencing issues with my current antivirus,..."
53,Inquiry/Help,Antivirus,"Want to upgrade my antivirus subscription, nee..."
54,Inquiry/Help,Antivirus,Seeking advice on configuring antivirus settin...


In [125]:
df['Y'] = df['Category'] + '-' +df['Subcategory']
df.head()
x = df['Text']
y = df['Y']

Tiền xử lý dữ liệu

In [126]:
# Tách từ, loại bỏ stop-word, chuyển về từ thường, đưa về nguyên thể
# loại bỏ dấu '.', ký tự xuống dòng, và một vài ký tự đặc biệt
def preprocessing_data(x):
    # lower
    x = x.map(lambda a: a.lower())
    x = x.map(lambda a: re.sub(r'\n+', ' ', a))      
    # loại bỏ các ký tự đặc biệt     
    x = x.map(lambda a: re.sub(r'[^a-zA-Z0-9\s]', ' ', a))
    # tách từ
    x = x.map(lambda a : nlp(a))
    # loại bỏ stop-word và lemmanization 
    x = x.map(lambda a : [ token.lemma_ for token in a if not token.is_stop and token.is_alpha])

    return x

In [127]:
processed_data = preprocessing_data(x)
print(processed_data[0:10])

50    [need, assistance, instal, antivirus, software...
51     [look, recommendation, good, antivirus, program]
52    [experience, issue, current, antivirus, need, ...
53    [want, upgrade, antivirus, subscription, need,...
54    [seek, advice, configure, antivirus, setting, ...
55    [require, help, resolve, compatibility, issue,...
56    [interested, learn, late, feature, antivirus, ...
57    [need, assistance, renew, antivirus, subscript...
58    [experience, slowdown, instal, antivirus, need...
59    [want, schedule, regular, antivirus, scan, nee...
Name: Text, dtype: object


Tạo vocabulary

In [128]:
tokens = [token for row in processed_data for token in row]

vocabulary = list(set(tokens))
word2idx = {w: i for i, w in enumerate(vocabulary)}
idx2word = {i:w for i, w in enumerate(vocabulary)}

vocabulary_size = len(vocabulary)

In [129]:
window_size = 4
pairs = []

for row in processed_data:
    for i, center in enumerate(row):
        context_start = i - window_size if i > window_size else 0
        context_end = i + window_size if i + window_size <= len(row) -1 else len(row) - 1
        context_words = row[context_start : i] + row[i+1 : context_end + 1]
        for word in context_words:
            pairs.append((word2idx[center], word2idx[word]))
        
print(pairs[:10])


[(26, 1172), (26, 1099), (26, 240), (26, 234), (1172, 26), (1172, 1099), (1172, 240), (1172, 234), (1172, 953), (1099, 26)]


In [130]:
class Word2Vec (nn.Module) : 
    def __init__(self, vocabulary_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.in_embed = nn.Embedding(vocabulary_size, embedding_dim= embedding_dim)
        self.out_embed = nn.Embedding(vocabulary_size, embedding_dim= embedding_dim)
    def forward(self, center_words):
        center_vectors = self.in_embed(center_words)  # (batch_size, embedding_dim)
        scores = torch.matmul(center_vectors, self.out_embed.weight.t())  # (batch_size, vocab_size)
        log_probs = torch.log_softmax(scores, dim=1)
        return log_probs


In [131]:
embedding_dim = 50
model = Word2Vec(vocabulary_size, embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.NLLLoss()  # negative log likelihood

# Tạo dữ liệu train
centers = torch.tensor([c for c, ctx in pairs], dtype=torch.long)
contexts = torch.tensor([ctx for c, ctx in pairs], dtype=torch.long)

# Training loop
for epoch in range(2000):
    optimizer.zero_grad()
    log_probs = model(centers)
    loss = loss_fn(log_probs, contexts)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 100 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Epoch 100, Loss: 6.5279
Epoch 200, Loss: 4.5850
Epoch 300, Loss: 4.1997
Epoch 400, Loss: 4.0392
Epoch 500, Loss: 3.9505
Epoch 600, Loss: 3.8939
Epoch 700, Loss: 3.8554
Epoch 800, Loss: 3.8278
Epoch 900, Loss: 3.8072
Epoch 1000, Loss: 3.7915
Epoch 1100, Loss: 3.7793
Epoch 1200, Loss: 3.7698
Epoch 1300, Loss: 3.7623
Epoch 1400, Loss: 3.7563
Epoch 1500, Loss: 3.7515
Epoch 1600, Loss: 3.7476
Epoch 1700, Loss: 3.7445
Epoch 1800, Loss: 3.7419
Epoch 1900, Loss: 3.7397
Epoch 2000, Loss: 3.7380


Tạo mô hình - huấn luyện mô hình

In [132]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)


In [133]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)
num_classes = len(le.classes_)
y_train_tensor = torch.tensor(y_train_enc, dtype=torch.long)
y_test_tensor  = torch.tensor(y_test_enc, dtype=torch.long)

In [134]:
x_train_tokens = preprocessing_data(x_train)
x_test_tokens = preprocessing_data(x_test)


In [135]:
def tokens_to_vectors(tokens, model, word2idx):
    """
    tokens: list các từ (string)
    model: mô hình Word2Vec PyTorch của bạn
    word2idx: dict ánh xạ từ -> index
    """
    # Chuyển token sang index
    indices = [word2idx[t] for t in tokens if t in word2idx]
    
    # Tạo tensor
    indices_tensor = torch.tensor(indices, dtype=torch.long)
    
    # Lấy embedding
    vectors = model.in_embed(indices_tensor)  # shape: (len(tokens), embedding_dim)
    return vectors

In [136]:

w2v_model = model
x_train_vectors = [tokens_to_vectors(s, w2v_model, word2idx) for s in x_train_tokens]
x_test_vectors = [tokens_to_vectors(s, w2v_model, word2idx) for s in x_test_tokens]

In [137]:
from torch.nn.utils.rnn import pad_sequence

def tokens_to_indices(tokens_list, word2idx):
    # Chuyển list of tokens thành list of tensors index
    return [torch.tensor([word2idx[t] for t in tokens if t in word2idx], dtype=torch.long)
            for tokens in tokens_list]

x_train_indices = tokens_to_indices(x_train_tokens, word2idx)
x_test_indices  = tokens_to_indices(x_test_tokens, word2idx)

# Padding
x_train_padded = pad_sequence(x_train_indices, batch_first=True)
x_test_padded  = pad_sequence(x_test_indices, batch_first=True)

In [138]:

class TextClassifier(nn.Module):
    def __init__(self, embedding_layer, hidden_dim, output_dim, lstm_layers=2, fc_hidden_dim=128):
        super(TextClassifier, self).__init__()
        self.embedding = embedding_layer  # embedding layer từ w2v_model
        
        # LSTM nhiều layer
        self.lstm = nn.LSTM(
            input_size=self.embedding.embedding_dim,
            hidden_size=hidden_dim,
            num_layers=lstm_layers,      # số layer LSTM
            batch_first=True,
            bidirectional=True           # nếu muốn LSTM 2 chiều
        )
        
        # Fully connected nhiều layer
        self.fc1 = nn.Linear(hidden_dim * 2 if self.lstm.bidirectional else hidden_dim, fc_hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x_indices):
        # Lấy embedding mới mỗi forward
        x_embed = self.embedding(x_indices)
        lstm_out, (h_n, c_n) = self.lstm(x_embed)
        
        # Lấy hidden state cuối cùng của LSTM
        if self.lstm.bidirectional:
            h_final = torch.cat((h_n[-2], h_n[-1]), dim=1)  # ghép 2 chiều
        else:
            h_final = h_n[-1]
        
        out = self.fc1(h_final)
        out = self.relu(out)
        out = self.fc2(out)
        return self.softmax(out)
    
embedding_dim = 100
hidden_dim = 64
model_class = TextClassifier(w2v_model.in_embed, hidden_dim, num_classes)


In [None]:
model_class = TextClassifier(w2v_model.in_embed, hidden_dim, num_classes)
optimizer = torch.optim.Adam(model_class.parameters(), lr=0.005)
loss_fn = nn.NLLLoss()
epochs = 100
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model_class(x_train_padded)  # x_train_padded là index
    loss = loss_fn(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


Epoch 1, Loss: 2.8911
Epoch 2, Loss: 2.8704
Epoch 3, Loss: 2.8375
Epoch 4, Loss: 2.7781
Epoch 5, Loss: 2.6734
Epoch 6, Loss: 2.4983
Epoch 7, Loss: 2.2435
Epoch 8, Loss: 2.0357
Epoch 9, Loss: 2.1975
Epoch 10, Loss: 1.7275
Epoch 11, Loss: 1.5245
Epoch 12, Loss: 1.4799
Epoch 13, Loss: 1.1887
Epoch 14, Loss: 1.0034
Epoch 15, Loss: 0.9135
Epoch 16, Loss: 0.7774
Epoch 17, Loss: 0.6388
Epoch 18, Loss: 0.5396
Epoch 19, Loss: 0.4761
Epoch 20, Loss: 0.4018
Epoch 21, Loss: 0.3299
Epoch 22, Loss: 0.2785
Epoch 23, Loss: 0.2194
Epoch 24, Loss: 0.1787
Epoch 25, Loss: 0.1444
Epoch 26, Loss: 0.1180
Epoch 27, Loss: 0.0953
Epoch 28, Loss: 0.0715
Epoch 29, Loss: 0.0556
Epoch 30, Loss: 0.0433
Epoch 31, Loss: 0.0334
Epoch 32, Loss: 0.0260
Epoch 33, Loss: 0.0199
Epoch 34, Loss: 0.0152
Epoch 35, Loss: 0.0117
Epoch 36, Loss: 0.0091
Epoch 37, Loss: 0.0071
Epoch 38, Loss: 0.0057
Epoch 39, Loss: 0.0046
Epoch 40, Loss: 0.0036
Epoch 41, Loss: 0.0029
Epoch 42, Loss: 0.0024
Epoch 43, Loss: 0.0020
Epoch 44, Loss: 0.00

In [140]:
with torch.no_grad():
    outputs_test = model_class(x_test_padded)
    predicted = torch.argmax(outputs_test, dim=1)

from sklearn.metrics import accuracy_score, classification_report
acc = accuracy_score(y_test_tensor, predicted)
print(f"Test Accuracy: {acc:.4f}")
print(classification_report(y_test_tensor, predicted, target_names=le.classes_))

Test Accuracy: 0.8436
                                   precision    recall  f1-score   support

                     Database-DB2       0.86      0.60      0.71        10
           Database-MS SQL Server       0.82      0.90      0.86        10
                  Database-Oracle       0.82      1.00      0.90         9
                     Hardware-CPU       0.82      0.90      0.86        10
                    Hardware-Disk       0.75      0.90      0.82        10
                Hardware-Keyboard       0.83      1.00      0.91        10
                  Hardware-Memory       0.89      0.80      0.84        10
                 Hardware-Monitor       0.77      1.00      0.87        10
                   Hardware-Mouse       0.90      0.90      0.90        10
           Inquiry/Help-Antivirus       0.90      0.90      0.90        10
Inquiry/Help-Internal Application       0.90      0.90      0.90        10
                     Network-DHCP       0.75      0.60      0.67        10
  

In [141]:
torch.save(model_class.state_dict(), 'model.pth')