<a href="https://colab.research.google.com/github/hwarang97/spam_classifier/blob/main/spam_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [76]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import re

df = pd.read_csv('/content/spam_ham_dataset.csv')

# Convert text to lowercase
df['text_lower'] = df['text'].str.lower()

# Tokenize the text by words
def tokenize_text(text):
    return re.findall(r'\b\w+\b', text.lower())

df['text_tokenized'] = df['text_lower'].apply(tokenize_text)

# Show the first few rows after preprocessing
df[['text', 'text_lower', 'text_tokenized']].head()

Unnamed: 0,text,text_lower,text_tokenized
0,Subject: enron methanol ; meter # : 988291\r\n...,subject: enron methanol ; meter # : 988291\r\n...,"[subject, enron, methanol, meter, 988291, this..."
1,"Subject: hpl nom for january 9 , 2001\r\n( see...","subject: hpl nom for january 9 , 2001\r\n( see...","[subject, hpl, nom, for, january, 9, 2001, see..."
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...","subject: neon retreat\r\nho ho ho , we ' re ar...","[subject, neon, retreat, ho, ho, ho, we, re, a..."
3,"Subject: photoshop , windows , office . cheap ...","subject: photoshop , windows , office . cheap ...","[subject, photoshop, windows, office, cheap, m..."
4,Subject: re : indian springs\r\nthis deal is t...,subject: re : indian springs\r\nthis deal is t...,"[subject, re, indian, springs, this, deal, is,..."


In [70]:
# Split the data into training and test sets
X = df['text_lower']
y = df['label_num']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the number of samples in the training and test sets
len(X_train), len(X_test)


(4136, 1035)

In [71]:
from collections import Counter

# Tokenize and count the frequency of words in the training set
word_counts = Counter()
for text in X_train:
    word_counts.update(tokenize_text(text))

# Show the 10 most common words
word_counts.most_common(10)

[('the', 20598),
 ('to', 16214),
 ('ect', 10731),
 ('and', 10285),
 ('for', 8325),
 ('of', 8224),
 ('a', 7887),
 ('you', 6596),
 ('subject', 6367),
 ('in', 6199)]

In [72]:
import numpy as np

class NaiveBayesClassifier:
    def __init__(self):
        self.word_probs = {}

    def fit(self, X, y):
        # Count the frequency of each word in spam and ham emails
        spam_word_counts = Counter()
        ham_word_counts = Counter()
        spam_count = 0
        ham_count = 0

        for text, label in zip(X, y):
            if label == 1:  # spam
                spam_count += 1
                spam_word_counts.update(tokenize_text(text))
            else:  # ham
                ham_count += 1
                ham_word_counts.update(tokenize_text(text))

        # Calculate probabilities
        total_spam_words = sum(spam_word_counts.values())
        total_ham_words = sum(ham_word_counts.values())
        total_words = total_spam_words + total_ham_words
        spam_prob = spam_count / (spam_count + ham_count)
        ham_prob = 1 - spam_prob

        # Calculate word probabilities given spam or ham
        for word in set(spam_word_counts.keys()).union(set(ham_word_counts.keys())):
            spam_word_prob = (spam_word_counts[word] + 1) / (total_spam_words + total_words)
            ham_word_prob = (ham_word_counts[word] + 1) / (total_ham_words + total_words)
            self.word_probs[word] = (spam_word_prob, ham_word_prob)

        self.class_probs = (spam_prob, ham_prob)

    def predict(self, X):
        predictions = []
        for text in X:
            tokens = tokenize_text(text)
            log_spam_prob = np.log(self.class_probs[0])
            log_ham_prob = np.log(self.class_probs[1])

            for word in tokens:
                if word in self.word_probs:
                    log_spam_prob += np.log(self.word_probs[word][0])
                    log_ham_prob += np.log(self.word_probs[word][1])

            # Choose the class with higher log probability
            predictions.append(1 if log_spam_prob > log_ham_prob else 0)

        return np.array(predictions)

# Initialize and train the Naive Bayes classifier
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(X_train, y_train)

# Show a few calculated word probabilities
list(nb_classifier.word_probs.items())[:10]

[('prohibition', (2.2087708080014932e-06, 8.813725438350635e-07)),
 ('viagxera', (2.2087708080014932e-06, 8.813725438350635e-07)),
 ('verrucose', (2.2087708080014932e-06, 8.813725438350635e-07)),
 ('prudency', (1.1043854040007466e-06, 2.6441176315051903e-06)),
 ('velez', (2.2087708080014932e-06, 8.813725438350635e-07)),
 ('stylised', (1.1043854040007466e-06, 1.762745087670127e-06)),
 ('natalie', (2.2087708080014932e-06, 1.762745087670127e-06)),
 ('nearsighted', (2.2087708080014932e-06, 8.813725438350635e-07)),
 ('stadler', (1.1043854040007466e-06, 1.762745087670127e-06)),
 ('comfortable', (5.521927020003733e-06, 6.169607806845445e-06))]

In [77]:
def pretty_print_metrics(accuracy, conf_matrix, class_report):
    print("### 모델 성능 요약 ###\n")

    print("#### 정확도 (Accuracy) ####")
    print(f"- {accuracy * 100:.2f}%\n")

    print("#### 혼동 행렬 (Confusion Matrix) ####")
    print(f"True Ham, False Spam: {conf_matrix[0, 0]}")
    print(f"False Ham, True Spam: {conf_matrix[1, 1]}")
    print(f"True Ham, True Spam: {conf_matrix[0, 1]}")
    print(f"False Ham, False Spam: {conf_matrix[1, 0]}\n")

    lines = class_report.strip().split("\n")
    print("#### 분류 보고서 (Classification Report) ####")
    for line in lines[2:]:
        values = line.split()
        if len(values) == 0:  # Skip empty lines
            continue
        if values[0] in ["Ham", "Spam"]:
            print(f"{values[0]} (스팸이 아닌 이메일)" if values[0] == "Ham" else f"{values[0]} (스팸 이메일)")
            print(f"  - Precision: {values[1]}")
            print(f"  - Recall: {values[2]}")
            print(f"  - F1-score: {values[3]}\n")

# Demonstrate the function
pretty_print_metrics(accuracy, conf_matrix, class_report)


### 모델 성능 요약 ###

#### 정확도 (Accuracy) ####
- 91.69%

#### 혼동 행렬 (Confusion Matrix) ####
True Ham, False Spam: 742
False Ham, True Spam: 207
True Ham, True Spam: 0
False Ham, False Spam: 86

#### 분류 보고서 (Classification Report) ####
Ham (스팸이 아닌 이메일)
  - Precision: 0.90
  - Recall: 1.00
  - F1-score: 0.95

Spam (스팸 이메일)
  - Precision: 1.00
  - Recall: 0.71
  - F1-score: 0.83

