<a href="https://colab.research.google.com/github/hwarang97/spam_classifier/blob/main/spam_classifier_ngram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.model_selection import train_test_split
import re
import pandas as pd
import numpy as np

# Hyper Parameters

In [None]:
# Size of the n-grams handed to the classifier's fit/predict calls.
n = 1
# Fraction of the dataset held out as the test split by train_test_split.
test_size = 0.2
# Preprocessing switches; unpacked as keyword arguments into preprocess_text.
SELECT = dict(
    use_lower=False,
    use_stemming=True,
    use_stopwords=True,
    use_numreplace=True,
)

# Preprocessing

In [None]:
from nltk.stem import PorterStemmer

def preprocess_text(text, use_lower=False, use_stemming=False, use_stopwords=False, use_numreplace=False):
    """Turn a raw text string into a list of word tokens.

    The text is optionally lowercased and then tokenized. The resulting token
    list is passed through every enabled step in a fixed order: stemming,
    stopword removal, number replacement.

    Args:
        text: The raw text to process.
        use_lower: Lowercase the text before tokenizing.
        use_stemming: Apply stem_words to the tokens.
        use_stopwords: Apply remove_stopwords to the tokens.
        use_numreplace: Apply replace_numbers to the tokens.

    Returns:
        The list of processed tokens.

    NOTE(review): stemming runs before stopword removal, so a stopword whose
    stem differs from the word itself (e.g. 'this' stemmed to 'thi') is not
    filtered out. Confirm whether this order is intended.
    """
    source = lowercase_text(text) if use_lower else text
    tokens = tokenize_text(source)

    # (enabled?, transformation) pairs, applied strictly in this order.
    pipeline = (
        (use_stemming, stem_words),
        (use_stopwords, remove_stopwords),
        (use_numreplace, replace_numbers),
    )
    for enabled, step in pipeline:
        if enabled:
            tokens = step(tokens)

    return tokens

def lowercase_text(text):
    """Return a lowercased copy of *text*."""
    lowered = text.lower()
    return lowered

def stem_words(words):
    """Return a new list with every token in *words* reduced to its Porter stem.

    A fresh PorterStemmer is created on each call; token order is preserved.
    """
    stem = PorterStemmer().stem
    return list(map(stem, words))

# English stopwords, built once at definition time instead of on every call.
# All entries are lowercase; remove_stopwords compares tokens against them exactly.
_STOPWORDS = frozenset({
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
    'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
    'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
    'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
    'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd',
    'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
    "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren',
    "weren't", 'won', "won't", 'wouldn', "wouldn't",
})


def remove_stopwords(words):
    """Return the tokens of *words* that are not stopwords.

    Matching is exact and case-sensitive: only tokens identical to one of the
    lowercase entries in the stopword set are dropped, so a capitalized token
    such as 'This' is kept. Token order is preserved.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list containing the non-stopword tokens.
    """
    return [word for word in words if word not in _STOPWORDS]

def replace_numbers(words):
    """Return a copy of *words* with all-digit tokens replaced by 'NUM'.

    A token is replaced when ``str.isdigit()`` is true for it; every other
    token (including the empty string) is passed through unchanged.
    """
    return [token if not token.isdigit() else 'NUM' for token in words]

def tokenize_text(text):
    """Split *text* into word tokens.

    Each token is a run of regular-expression word characters delimited by
    word boundaries, so punctuation and whitespace act as separators.
    Returns the tokens in order of appearance.
    """
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text)

def generate_ngrams(words, n=2):
    """Build the space-joined n-grams of a token list.

    Args:
        words: List of string tokens.
        n: Number of consecutive tokens per n-gram.

    Returns:
        A list with one string per sliding window of *n* tokens, in order.
        The list is empty when *words* has fewer than *n* tokens.
    """
    window_count = len(words) - n + 1
    return [' '.join(words[start:start + n]) for start in range(window_count)]

In [None]:
# Run the preprocessing pipeline on a small hand-written sentence, using the
# switches from the global SELECT dict, and print the resulting tokens.
words = preprocess_text('This this is IS 336 a 22 A 1 do doing done dont use usage play playing', **SELECT)
print(words)

['thi', 'thi', 'NUM', 'NUM', 'NUM', 'done', 'dont', 'use', 'usag', 'play', 'play']


포터 알고리즘을 사용한 이유

- 기존 방법은 어간이 같지만 형태가 다른 단어를 서로 다른 토큰으로 처리해 버린다. 스팸 메일 여부를 판단하는 데 단어의 형태는 크게 중요하지 않다고 생각하여, 어간을 기준으로 단어 빈도수를 합치기로 하였다.

기본은 소문자화만 적용한 것.
45278 -> 38320 (기본에서 포터 알고리즘 적용)

스탑워드를 사용하니까 확실히 성능이 많이 오른다.

숫자를 NUM으로 바꾼 이유
- 토큰들을 보면 의미를 알 수 없는 숫자들이 한 번만 출현한 경우가 많은데, 어차피 다시 쓰일 일이 거의 없을 것이라 생각하여 NUM으로 합쳐 버리는 것이 좋을 것이라 판단하였다.

데이터셋이 작기 때문에, 토큰을 적게 만드는 것이 좋다고 판단하였다. 따라서 숫자 토큰의 경우, 빈도수가 1인 것이 너무 많고 앞으로도 같은 숫자가 다시 나올 가능성이 매우 낮아 NUM으로 바꾸었다.

# Split Dataset

In [None]:
# Split the data into training and test sets
# The CSV is read from the Colab working directory; the 'text' column supplies
# the inputs and the 'label_num' column supplies the targets.
df = pd.read_csv('/content/spam_ham_dataset.csv')
X = df['text']
y = df['label_num']
# test_size comes from the hyperparameter cell; the fixed random_state keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

# Show the number of samples in the training and test sets
len(X_train), len(X_test)

(4136, 1035)

In [None]:
# Print one raw email (the entry of the 'text' column labelled 1) to see what
# the unprocessed input looks like.
text1 = df['text'][1]
print(text1)

Subject: hpl nom for january 9 , 2001
( see attached file : hplnol 09 . xls )
- hplnol 09 . xls


# 토큰 확인

In [None]:
from collections import Counter

# Tokenize and count the frequency of words in the training set
word_counts = Counter()
for text in X_train: # text is the content of a single email
    # Tokens are produced with the same SELECT switches the classifier uses.
    word_counts.update(preprocess_text(text, **SELECT))

# Show the 10 most common words
word_counts.most_common(10)

[('NUM', 64389),
 ('ect', 10739),
 ('subject', 6370),
 ('thi', 5724),
 ('hou', 5630),
 ('enron', 5055),
 ('com', 3126),
 ('deal', 2845),
 ('pleas', 2603),
 ('ga', 2405)]

In [None]:
# Number of distinct tokens found in the training set
len(word_counts)

33967

In [None]:
# Inspect the extracted tokens: every (token, count) pair in ascending order
# of count, with ties ordered by the token string.
sorted(word_counts.items(), key=lambda pair: (pair[1], pair[0]))

[('aaa', 1),
 ('aabda', 1),
 ('aabvmmq', 1),
 ('aac', 1),
 ('aachecar', 1),
 ('aaer', 1),
 ('aafco', 1),
 ('aaigrcrb', 1),
 ('aaihmqv', 1),
 ('aaldano', 1),
 ('aambiqu', 1),
 ('aamlrg', 1),
 ('aar', 1),
 ('aashqcsni', 1),
 ('aaxrzm', 1),
 ('abackof', 1),
 ('abarch', 1),
 ('abas', 1),
 ('abbey', 1),
 ('abbi', 1),
 ('abcdzhongguo', 1),
 ('abdomen', 1),
 ('abdominoplasti', 1),
 ('abductbath', 1),
 ('abe', 1),
 ('abeckley', 1),
 ('abelian', 1),
 ('abelmosk', 1),
 ('abelson', 1),
 ('abercrombi', 1),
 ('aberdeen', 1),
 ('abernathi', 1),
 ('abey', 1),
 ('abfan', 1),
 ('abhor', 1),
 ('abideth', 1),
 ('abilen', 1),
 ('abissno', 1),
 ('abject', 1),
 ('ablat', 1),
 ('ablish', 1),
 ('ablut', 1),
 ('abo', 1),
 ('aboardca', 1),
 ('abolish', 1),
 ('abonn', 1),
 ('aborn', 1),
 ('abortionist', 1),
 ('abound', 1),
 ('aboutyour', 1),
 ('aboveboard', 1),
 ('aboveground', 1),
 ('abpzhnpd', 1),
 ('abrad', 1),
 ('abrahm', 1),
 ('abramo', 1),
 ('abras', 1),
 ('abrbr', 1),
 ('abrbrbrbrp', 1),
 ('abrbrfont', 1)

어떤 숫자들은 형태가 일관적이지 않고 여러 가지로 나타나서 빈도수가 1인 단어가 된다. 이런 단어는 앞으로도 다시 나올 가능성이 낮고, n-gram으로 복잡한 토큰을 구성할 때도 자주 쓰이지 않을 것 같다. 따라서 제거하거나 정규화(normalizing)하는 방식을 취하는 것이 좋을 것 같다.

내용을 분류하는데 별로 의미가 없을법한 단어들이 있는데, 그것들을 일반적으로 처리하는게 좋을지 남겨두는것이 좋을지 모르겠음. 일단 없애는 방식을 사용해보자.


# Model (Naive Bayes Classifier)

In [None]:
# Re-define the modified NaiveBayesClassifier class to use the total unique words for Laplace Smoothing
class NaiveBayesClassifier:
    """Naive Bayes spam/ham classifier over word n-grams.

    Texts are tokenized with preprocess_text (using the global SELECT
    switches) and turned into n-grams with generate_ngrams. Label 1 is
    treated as spam; every other label is treated as ham.
    """

    def __init__(self):
        # n-gram -> (P(ngram | spam), P(ngram | ham)); filled in by fit().
        self.word_probs = {}
        # (P(spam), P(ham)); filled in by fit().
        self.class_probs = None

    def fit(self, X, y, n):
        """Estimate class priors and smoothed n-gram likelihoods.

        Any probabilities from a previous fit are discarded.

        Args:
            X: Iterable of raw text strings.
            y: Iterable of labels aligned with X (1 = spam, otherwise ham).
            n: Size of the n-grams to generate from each text.

        Raises:
            ZeroDivisionError: If X and y yield no samples.
        """
        # n-gram frequencies per class
        spam_word_counts = Counter()
        ham_word_counts = Counter()

        # Number of emails per class
        spam_count = 0
        ham_count = 0

        for text, label in zip(X, y):
            tokens = preprocess_text(text, **SELECT)
            ngrams = generate_ngrams(tokens, n)

            if label == 1:  # spam
                spam_count += 1
                spam_word_counts.update(ngrams)
            else:  # ham
                ham_count += 1
                ham_word_counts.update(ngrams)

        # Total number of n-gram occurrences per class
        total_spam_words = sum(spam_word_counts.values())
        total_ham_words = sum(ham_word_counts.values())

        # Every n-gram seen in either class; its size is the smoothing term.
        vocabulary = set(spam_word_counts) | set(ham_word_counts)
        total_unique_words = len(vocabulary)

        # Class priors
        spam_prob = spam_count / (spam_count + ham_count)
        ham_prob = 1 - spam_prob

        # Add-one (Laplace) smoothed likelihoods. A fresh dict is built so a
        # refit never keeps n-grams left over from an earlier training run.
        spam_denominator = total_spam_words + total_unique_words
        ham_denominator = total_ham_words + total_unique_words
        self.word_probs = {
            word: (
                (spam_word_counts[word] + 1) / spam_denominator,
                (ham_word_counts[word] + 1) / ham_denominator,
            )
            for word in vocabulary
        }

        self.class_probs = (spam_prob, ham_prob)

    def predict(self, X, n):
        """Classify each text in X.

        N-grams that were not seen during fit() are ignored.

        Args:
            X: Iterable of raw text strings.
            n: Size of the n-grams to generate from each text.

        Returns:
            A numpy array with one entry per text: 1 when the spam
            log-probability is strictly greater than the ham one, else 0.
        """
        # The priors are the same for every text, so take their logs once.
        log_spam_prior = np.log(self.class_probs[0])
        log_ham_prior = np.log(self.class_probs[1])

        predictions = []
        for text in X:
            tokens = preprocess_text(text, **SELECT)
            ngrams = generate_ngrams(tokens, n)

            log_spam_prob = log_spam_prior
            log_ham_prob = log_ham_prior

            for word in ngrams:
                probs = self.word_probs.get(word)
                if probs is None:
                    continue  # unseen n-gram: contributes to neither class
                log_spam_prob += np.log(probs[0])
                log_ham_prob += np.log(probs[1])

            # Choose the class with higher log probability
            predictions.append(1 if log_spam_prob > log_ham_prob else 0)

        return np.array(predictions)

    def show_word_effect(self, X, n):
        """Print per-n-gram diagnostic scores for each text in X.

        For every text the list of n-grams is printed first. Then, for each
        n-gram, the spam and ham scores are printed, where a score is the log
        class prior plus the log likelihood of that single n-gram. N-grams
        unknown to the model are reported with both scores as 0.

        Args:
            X: Iterable of raw text strings.
            n: Size of the n-grams to generate from each text.
        """
        for text in X:
            tokens = preprocess_text(text, **SELECT)
            ngrams = generate_ngrams(tokens, n)
            print(ngrams)

            for word in ngrams:
                print(f'token: {word}')
                if word in self.word_probs:
                    spam_word_prob, ham_word_prob = self.word_probs[word]
                    log_spam_prob = np.log(self.class_probs[0]) + np.log(spam_word_prob)
                    log_ham_prob = np.log(self.class_probs[1]) + np.log(ham_word_prob)
                    print(f'log_spam_prob: {log_spam_prob}, log_ham_prob: {log_ham_prob}')
                else:
                    print(f'log_spam_prob: {0}, log_ham_prob: {0}')

# Initialize and train the modified Naive Bayes classifier with corrected Laplace Smoothing
nb_classifier = NaiveBayesClassifier()
# n is the global n-gram size from the hyperparameter cell.
nb_classifier.fit(X_train, y_train, n=n)

# Show a few calculated word probabilities
# Each entry pairs an n-gram with its (spam probability, ham probability) tuple.
list(nb_classifier.word_probs.items())[:10]

[('busniess', (1.524196621364156e-05, 2.8502208921191392e-06)),
 ('jueopstr', (1.0161310809094373e-05, 2.8502208921191392e-06)),
 ('psychopath', (1.524196621364156e-05, 2.8502208921191392e-06)),
 ('hyb', (1.0161310809094373e-05, 2.8502208921191392e-06)),
 ('ziq', (1.0161310809094373e-05, 2.8502208921191392e-06)),
 ('sm', (3.048393242728312e-05, 4.560353427390623e-05)),
 ('muller', (5.080655404547186e-05, 5.7004417842382785e-06)),
 ('navok', (1.0161310809094373e-05, 2.8502208921191392e-06)),
 ('lollipop', (1.0161310809094373e-05, 2.8502208921191392e-06)),
 ('mastectomi', (1.524196621364156e-05, 2.8502208921191392e-06))]

In [None]:
# Print the per-token diagnostic scores for a hand-written sample message.
nb_classifier.show_word_effect(['this is the spam mail busniness 433  NUM augean'], n)

['thi', 'spam', 'mail', 'busni', 'NUM', 'num', 'augean']
token: thi
log_spam_prob: -5.863973583908158, log_spam_prob: -5.863973583908158
token: spam
log_spam_prob: -9.118425233749633, log_spam_prob: -9.118425233749633
token: mail
log_spam_prob: -7.84276050096758, log_spam_prob: -7.84276050096758
token: busni
log_spam_prob: 0, log_spam_prob: 0
token: NUM
log_spam_prob: -3.897631228572279, log_spam_prob: -3.897631228572279
token: num
log_spam_prob: -12.323878038285692, log_spam_prob: -12.323878038285692
token: augean
log_spam_prob: -12.729343146393857, log_spam_prob: -12.729343146393857


# Performance

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, accuracy_score

# Function to perform k-fold cross-validation
def k_fold_cross_validation(X, y, k=5):
    """Evaluate NaiveBayesClassifier with shuffled k-fold cross-validation.

    A new classifier is trained on each training fold and scored on the
    matching held-out fold. The n-gram size is read from the global ``n``.

    Args:
        X: Texts, as a pandas Series or a numpy array.
        y: Labels aligned with X, as a pandas Series or a numpy array.
        k: Number of folds.

    Returns:
        A tuple ``(f1_scores, accuracy_scores)`` of two lists holding one
        score per fold.
    """
    def _take(data, positions):
        # KFold yields positional indices. Plain [] on a pandas object looks
        # the values up by label, which only matches positions for a default
        # RangeIndex, so use .iloc whenever it is available.
        return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    f1_scores = []
    accuracy_scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = _take(X, train_index), _take(X, test_index)
        y_train, y_test = _take(y, train_index), _take(y, test_index)

        # Initialize and train the Naive Bayes classifier
        nb_classifier = NaiveBayesClassifier()
        nb_classifier.fit(X_train, y_train, n)

        # Make predictions on the test set
        y_pred = nb_classifier.predict(X_test, n)

        # Calculate F1 score and accuracy
        f1_scores.append(f1_score(y_test, y_pred))
        accuracy_scores.append(accuracy_score(y_test, y_pred))

    return f1_scores, accuracy_scores

# Cross-validate on the full dataset (every row of df, not only the training split).
X, y = df['text'], df['label_num']

f1_scores, accuracy_scores = k_fold_cross_validation(X, y, k=5)
# Report the scores averaged over the folds.
print("Mean F1 Score:", np.mean(f1_scores))
print("Mean Accuracy:", np.mean(accuracy_scores))

Mean F1 Score: 0.9503232643898845
Mean Accuracy: 0.9713794746727217
