In [1]:
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# my project
from module.conf import PROJECT_DIR

In [4]:
# Load the spam dataset, keep only the category and message columns, and give
# them the generic names used by the rest of the notebook.
spam_csv_path = PROJECT_DIR + "/data/basic/email/spam.csv"
data = (
    pd.read_csv(spam_csv_path, encoding="utf-8")[['Category', 'Message']]
    .rename(columns={'Category': 'label', 'Message': 'text'})
)

# Replace the string labels with integer codes. LabelEncoder numbers classes in
# sorted order -- presumably ham -> 0 and spam -> 1; verify against the CSV.
label_encoder = LabelEncoder()
data = data.assign(label=label_encoder.fit_transform(data['label']))

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [21]:
# Number of examples in the training split.
len(X_train)

4457

In [22]:
# Sum of the integer-encoded training labels; with 0/1 labels this is the number of class-1 examples.
sum(y_train)

598

In [69]:
# Limit the vocabulary to the 5000 terms with the highest term frequency across the corpus.
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the training texts only, then apply
# the same fitted transform to both splits so the test set never influences the features.
vectorizer.fit(X_train)
X_train_tfidf = vectorizer.transform(X_train).toarray()  # dense array for the Keras Dense layers
X_test_tfidf = vectorizer.transform(X_test).toarray()

In [70]:
# Number of terms in the vocabulary learned by the fitted vectorizer.
len(vectorizer.vocabulary_)

5000

In [68]:
# Shape of the dense training TF-IDF matrix.
X_train_tfidf.shape

(4457, 7701)

In [26]:
# Peek at the first five rows of the dense test TF-IDF matrix.
X_test_tfidf[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [27]:
# Seed TensorFlow's global RNG so that weight initialisation and dropout masks
# (and therefore the reported accuracy) are repeatable across runs.
tf.random.set_seed(42)

# Feed-forward classifier over the TF-IDF features: two ReLU hidden layers, each
# followed by 30% dropout, and a single sigmoid unit for the binary output.
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train_tfidf.shape[1],)),  # input width = number of TF-IDF features
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification (Spam / Ham)
])

# Binary cross-entropy pairs with the sigmoid output; accuracy is tracked for reporting.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [28]:
# Train for 10 epochs with mini-batches of 32.
# NOTE(review): the test split is passed as validation data here and is also the set
# used by the later evaluate() call, so there is no separate untouched hold-out set.
model.fit(X_train_tfidf, y_train, epochs=10, batch_size=32, validation_data=(X_test_tfidf, y_test))

Epoch 1/10


2025-02-25 22:06:45.142094: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x16221d060>

In [29]:
# Evaluate the dense model on the test split; evaluate() returns the loss followed by
# the compiled metrics (here only accuracy).
loss, accuracy = model.evaluate(X_test_tfidf, y_test)
# Report accuracy as a percentage with two decimals.
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 99.19%


LSTM

In [34]:
from tensorflow.keras.layers import Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 5000  # vocabulary cap shared by the tokenizer and the embedding's input dimension
max_len = 100     # every sequence is padded or truncated to this many token ids

# Seed TensorFlow's global RNG so that weight initialisation and dropout masks
# (and therefore the reported accuracy) are repeatable across runs.
tf.random.set_seed(42)

# Build the word index from the training texts only, so the test texts do not
# influence which words are kept.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Map each text to a list of integer word ids.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Bring every sequence to a fixed length so the batches are rectangular.
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Embedding -> spatial dropout -> single LSTM layer -> sigmoid unit for the binary output.
model_lstm = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here and is also what the
# later evaluate() call scores, so there is no separate untouched hold-out set.
model_lstm.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x169e0ffa0>

In [36]:
# Evaluate the LSTM model on the padded test sequences; evaluate() returns the loss
# followed by the compiled metrics (here only accuracy).
loss, accuracy = model_lstm.evaluate(X_test_pad, y_test)
# Report accuracy as a percentage with two decimals.
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 99.10%


1. IDF (Inverse Document Frequency): the document-level weighting term in TF-IDF.

$$IDF(t) = \log\left(\frac{N}{1 + DF(t)}\right)$$

- N: total number of documents in the corpus.
- DF(t): number of documents that contain the term t.
- 1 is added to the denominator to avoid a divide-by-zero error.

2. ITF (Inverse Term Frequency): a within-document analogue of IDF that down-weights a word the more often it appears in a single document.

$$ITF(t, d) = \log\left(\frac{|d|}{1 + TF(t, d)}\right)$$

- |d|: number of words in document d.
- TF(t, d): the number of times word t appears in document d.

In [76]:
from collections import Counter
def compute_idf(corpus):
    """Compute a smoothed inverse document frequency for every word in a corpus.

    Each document is split on whitespace, and a word is counted at most once
    per document. The score is ``log(N / (1 + df))``, where ``N`` is the number
    of documents and ``df`` the number of documents containing the word.

    Args:
        corpus: Sized collection of document strings.

    Returns:
        dict mapping each word to its IDF score. With this smoothing, the score
        is 0 for a word found in ``N - 1`` documents and negative for a word
        found in all of them.
    """
    n_docs = len(corpus)
    doc_freq = Counter()
    for text in corpus:
        # A set collapses repeats so each document contributes at most 1 per word.
        doc_freq.update(set(text.split()))
    return {term: np.log(n_docs / (df + 1)) for term, df in doc_freq.items()}

In [77]:
# Small three-document toy corpus used to sanity-check compute_idf.
documents = [
    "học máy là một nhánh của trí tuệ nhân tạo",
    "học sâu là một phần của học máy",
    "trí tuệ nhân tạo có nhiều ứng dụng trong đời sống"
]
# Score every word in the toy corpus and print the resulting word -> IDF mapping.
idf_values = compute_idf(documents)
print(idf_values)

{'tạo': 0.0, 'nhánh': 0.4054651081081644, 'nhân': 0.0, 'một': 0.0, 'là': 0.0, 'của': 0.0, 'trí': 0.0, 'tuệ': 0.0, 'máy': 0.0, 'học': 0.0, 'sâu': 0.4054651081081644, 'phần': 0.4054651081081644, 'nhiều': 0.4054651081081644, 'có': 0.4054651081081644, 'ứng': 0.4054651081081644, 'đời': 0.4054651081081644, 'dụng': 0.4054651081081644, 'sống': 0.4054651081081644, 'trong': 0.4054651081081644}


In [78]:
def compute_itf(document):
    """Compute a smoothed inverse term frequency for every word in one document.

    The document is split on whitespace. The score is
    ``log(total_words / (1 + count))``, where ``count`` is how many times the
    word occurs in the document, so more frequent words score lower.

    Args:
        document: The document text as a single string.

    Returns:
        dict mapping each distinct word to its ITF score, in order of first
        appearance; empty for a document with no words.
    """
    tokens = document.split()
    n_tokens = len(tokens)
    itf_scores = {}
    for token, occurrences in Counter(tokens).items():
        # The +1 smoothing mirrors the IDF formula used elsewhere in this notebook.
        itf_scores[token] = np.log(n_tokens / (occurrences + 1))
    return itf_scores

In [79]:
# Single sample document in which "học" and "máy" each occur twice, so their scores
# should come out lower than those of the words that occur once.
sample_doc = "học máy là một lĩnh vực của trí tuệ nhân tạo học máy rất quan trọng"
itf_values = compute_itf(sample_doc)
print(itf_values)

{'học': 1.6739764335716716, 'máy': 1.6739764335716716, 'là': 2.0794415416798357, 'một': 2.0794415416798357, 'lĩnh': 2.0794415416798357, 'vực': 2.0794415416798357, 'của': 2.0794415416798357, 'trí': 2.0794415416798357, 'tuệ': 2.0794415416798357, 'nhân': 2.0794415416798357, 'tạo': 2.0794415416798357, 'rất': 2.0794415416798357, 'quan': 2.0794415416798357, 'trọng': 2.0794415416798357}


In [100]:
# Fit a separate, uncapped TF-IDF vectorizer on the toy corpus so its output can be
# compared with the hand-written IDF/ITF helpers.
vectorizer1 = TfidfVectorizer(max_features=None)
vectorizer1.fit(documents)

# Transform with vectorizer1 (the model fitted on `documents`), not with `vectorizer`,
# which was fitted on the spam training texts and has an unrelated vocabulary.
X_train_tfidf_1 = vectorizer1.transform(documents).toarray()
X_test_tfidf_2 = vectorizer1.transform(documents).toarray()

In [101]:
# Display the dense TF-IDF matrix computed for the toy corpus (one row per document).
X_train_tfidf_1

array([[0.        , 0.30529678, 0.        , 0.30529678, 0.30529678,
        0.30529678, 0.30529678, 0.        , 0.40142857, 0.30529678,
        0.        , 0.        , 0.        , 0.        , 0.30529678,
        0.30529678, 0.30529678, 0.        , 0.        ],
       [0.        , 0.29542622, 0.        , 0.59085245, 0.29542622,
        0.29542622, 0.29542622, 0.        , 0.        , 0.        ,
        0.38844998, 0.38844998, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.32767345, 0.        , 0.32767345, 0.        , 0.        ,
        0.        , 0.        , 0.32767345, 0.        , 0.24920411,
        0.        , 0.        , 0.32767345, 0.32767345, 0.24920411,
        0.24920411, 0.24920411, 0.32767345, 0.32767345]])

In [91]:
# Display the vectorizer object's repr.
vectorizer1

AttributeError: 'TfidfVectorizer' object has no attribute 'vocabulary_'