In [6]:
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# my project
from module.conf import PROJECT_DIR

In [None]:
# Load the spam dataset, keeping only the 'Category' and 'Message' columns
# and renaming them to 'label' and 'text'.
spam_csv_path = PROJECT_DIR + "/data/basic/email/spam.csv"
data = (
    pd.read_csv(spam_csv_path, encoding="utf-8")[['Category', 'Message']]
    .rename(columns={'Category': 'label', 'Message': 'text'})
)

In [None]:
# Replace the Spam/Ham string labels with integer codes (Spam = 1, Ham = 0).
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(data['label'])
data['label'] = encoded_labels

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Number of samples in the training split.
len(X_train)

In [None]:
# Sum of the encoded training labels; assuming the labels are the 0/1 codes
# produced by the LabelEncoder, this is the number of class-1 training samples.
sum(y_train)

In [None]:
# Cap the vocabulary at 5000 terms; the vocabulary and IDF weights are learned
# from the training split only and then reused unchanged on the test split.
vectorizer = TfidfVectorizer(max_features=5000)
train_sparse = vectorizer.fit_transform(X_train)
test_sparse = vectorizer.transform(X_test)

# Densify the sparse TF-IDF matrices into NumPy arrays.
X_train_tfidf = train_sparse.toarray()
X_test_tfidf = test_sparse.toarray()

In [None]:
# Number of terms in the fitted vocabulary.
len(vectorizer.vocabulary_)

In [None]:
# Dimensions of the dense training TF-IDF array.
X_train_tfidf.shape

In [None]:
# Display the dense TF-IDF array of the test split.
X_test_tfidf

LSTM

1. IDF (Inverse Document Frequency): the document-level weight in TF-IDF.

$$IDF(t) = \log\left(\frac{N}{1 + DF(t)}\right)$$

- N: total number of documents in the corpus.
- DF(t): number of documents that contain the term t.
- The 1 in the denominator avoids a division-by-zero error.

2. TF (Term Frequency): used here in an inverse, log-scaled form (ITF) that mirrors IDF — the more often a word appears in a document, the lower its score.

$$ITF(t, d) = \log\left(\frac{|d|}{1 + TF(t, d)}\right)$$

- |d|: number of words in document d.
- TF(t, d): the number of times word t appears in document d.

In [1]:
from collections import Counter

In [2]:
# Toy corpus of three short Vietnamese sentences; it is fed both to the
# hand-written helpers and to scikit-learn's TfidfVectorizer in later cells.
documents = [
    "học máy là một nhánh của trí tuệ nhân tạo",
    "học sâu là một phần của học máy",
    "trí tuệ nhân tạo có nhiều ứng dụng trong đời sống"
]

In [3]:
def compute_idf(corpus):
    """Return a smoothed inverse document frequency for every word in ``corpus``.

    Documents are tokenized by splitting on whitespace. For a word found in
    ``df`` of the ``N`` documents the score is ``log(N / (1 + df))``; a word
    found in ``N - 1`` documents therefore scores 0 and a word found in every
    document scores below 0.

    Args:
        corpus: sequence of document strings.

    Returns:
        dict mapping each distinct word to its IDF score.
    """
    total_docs = len(corpus)
    # Deduplicate inside each document so a document counts at most once per word.
    doc_freq = Counter(token for text in corpus for token in set(text.split()))
    return {token: np.log(total_docs / (1 + df)) for token, df in doc_freq.items()}

In [26]:
def compute_tf(document):
    """Score each word of one document by a log-scaled inverse term frequency.

    Despite the function name this is not the plain ratio ``count / total``:
    every word gets ``log(total_words / (1 + count))``, so a word that occurs
    more often in the document receives a lower score.

    Args:
        document: a single document string, tokenized by splitting on whitespace.

    Returns:
        dict mapping each distinct word to its score.
    """
    tokens = document.split()
    n_tokens = len(tokens)

    scores = {}
    for token, occurrences in Counter(tokens).items():
        scores[token] = np.log(n_tokens / (1 + occurrences))
    return scores

In [24]:
def compute_tfidf(corpus):
    """Count how many times each word occurs across the whole corpus.

    NOTE: despite its name, this function does not compute TF-IDF weights
    yet; it only returns raw corpus-wide term counts.
    TODO: implement the actual TF-IDF weighting.

    Args:
        corpus: iterable of document strings, each tokenized by splitting on
            whitespace.

    Returns:
        collections.Counter mapping each word to its total number of
        occurrences over all documents.
    """
    # The unused locals and the no-op second pass over the corpus were removed;
    # the returned counts are unchanged.
    return Counter(word for doc in corpus for word in doc.split())

In [28]:
# With N = 3 documents, a word found in two of them scores log(3 / (1 + 2)) = 0.
compute_idf(documents)

{'của': 0.0,
 'tạo': 0.0,
 'trí': 0.0,
 'nhánh': 0.4054651081081644,
 'học': 0.0,
 'tuệ': 0.0,
 'là': 0.0,
 'nhân': 0.0,
 'máy': 0.0,
 'một': 0.0,
 'phần': 0.4054651081081644,
 'sâu': 0.4054651081081644,
 'ứng': 0.4054651081081644,
 'dụng': 0.4054651081081644,
 'đời': 0.4054651081081644,
 'có': 0.4054651081081644,
 'sống': 0.4054651081081644,
 'trong': 0.4054651081081644,
 'nhiều': 0.4054651081081644}

In [25]:
# NOTE: compute_tfidf currently returns raw corpus-wide word counts, not TF-IDF weights.
compute_tfidf(documents)

Counter({'học': 3,
         'máy': 2,
         'là': 2,
         'một': 2,
         'của': 2,
         'trí': 2,
         'tuệ': 2,
         'nhân': 2,
         'tạo': 2,
         'nhánh': 1,
         'sâu': 1,
         'phần': 1,
         'có': 1,
         'nhiều': 1,
         'ứng': 1,
         'dụng': 1,
         'trong': 1,
         'đời': 1,
         'sống': 1})

In [12]:
# Fit scikit-learn's TF-IDF on the toy corpus, then transform the same corpus
# into a dense array.
vectorizer1 = TfidfVectorizer().fit(documents)
X_train_tfidf_1 = vectorizer1.transform(documents).toarray()

In [13]:
# Dense TF-IDF array for the toy corpus.
X_train_tfidf_1

array([[0.        , 0.30529678, 0.        , 0.30529678, 0.30529678,
        0.30529678, 0.30529678, 0.        , 0.40142857, 0.30529678,
        0.        , 0.        , 0.        , 0.        , 0.30529678,
        0.30529678, 0.30529678, 0.        , 0.        ],
       [0.        , 0.29542622, 0.        , 0.59085245, 0.29542622,
        0.29542622, 0.29542622, 0.        , 0.        , 0.        ,
        0.38844998, 0.38844998, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.32767345, 0.        , 0.32767345, 0.        , 0.        ,
        0.        , 0.        , 0.32767345, 0.        , 0.24920411,
        0.        , 0.        , 0.32767345, 0.32767345, 0.24920411,
        0.24920411, 0.24920411, 0.32767345, 0.32767345]])

In [33]:
# Wrap the TF-IDF array in a DataFrame whose columns are the vocabulary terms.
df = pd.DataFrame(X_train_tfidf_1, columns=vectorizer1.get_feature_names_out())

In [34]:
# Show the TF-IDF table built from the toy corpus.
df

Unnamed: 0,có,của,dụng,học,là,máy,một,nhiều,nhánh,nhân,phần,sâu,sống,trong,trí,tuệ,tạo,đời,ứng
0,0.0,0.305297,0.0,0.305297,0.305297,0.305297,0.305297,0.0,0.401429,0.305297,0.0,0.0,0.0,0.0,0.305297,0.305297,0.305297,0.0,0.0
1,0.0,0.295426,0.0,0.590852,0.295426,0.295426,0.295426,0.0,0.0,0.0,0.38845,0.38845,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.327673,0.0,0.327673,0.0,0.0,0.0,0.0,0.327673,0.0,0.249204,0.0,0.0,0.327673,0.327673,0.249204,0.249204,0.249204,0.327673,0.327673


In [31]:
# Vocabulary terms in column order. The stray positional argument was dropped:
# it was passed as `input_features`, which the vectorizer does not use.
vectorizer1.get_feature_names_out()

array(['có', 'của', 'dụng', 'học', 'là', 'máy', 'một', 'nhiều', 'nhánh',
       'nhân', 'phần', 'sâu', 'sống', 'trong', 'trí', 'tuệ', 'tạo', 'đời',
       'ứng'], dtype=object)