Các thành viên nhóm 11:
  - 23122007 - Nguyễn Tấn Hùng
  - 23122011 - Đoàn Hải Nam
  - 23122027 - Võ Ngọc Hiếu
  - 23122055 - Lý Nguyên Thương

Note:
  - Đây chỉ là file tổng hợp kết quả cuối cùng của nhóm 11 nên sẽ không có các phần nhận xét, giải thích, quá trình làm bài ...
  - Để thấy chi tiết hơn, nhóm em mời thầy xem file còn lại

In [None]:
### Các thư viện cần thiết
import numpy as np
import pandas as pd
import math
import string
import re
import matplotlib.pyplot as plt
from collections import Counter
from collections import defaultdict
from sklearn.metrics import accuracy_score, precision_score, recall_score , confusion_matrix, classification_report

CHƯƠNG 1: TẬP DỮ LIỆU ENRON-SPAM

In [None]:
def load_and_process_data(train_csv_path, val_csv_path):
    """Load the train and validation CSV files and print a preview of each.

    Both files are read with their first column as the index. A ``split``
    column is dropped from a frame when it is present. The first five rows
    and the ``DataFrame.info()`` summary of each frame are then printed.

    Args:
        train_csv_path: Path of the training CSV file.
        val_csv_path: Path of the validation CSV file.

    Returns:
        A ``(df_train, df_val)`` tuple of DataFrames.
    """
    # ``display`` is only injected by IPython/Jupyter. Fall back to ``print``
    # so the function does not raise NameError when run as a plain script.
    try:
        show = display
    except NameError:
        show = print

    df_train = pd.read_csv(train_csv_path, index_col=0)
    df_val = pd.read_csv(val_csv_path, index_col=0)

    for frame in (df_train, df_val):
        if 'split' in frame.columns:
            frame.drop(columns=["split"], inplace=True)

    print("5 dòng đầu tiên của tập train:")
    show(df_train.head(5))
    print("\nMột số thông tin tập train:")
    # info() prints its report itself and returns None, which is printed too.
    print(df_train.info())
    print("\n5 dòng đầu tiên của tập val:")
    show(df_val.head(5))
    print("\nMột số thông tin tập val:")
    print(df_val.info())

    return df_train, df_val

CHƯƠNG 2: TIỀN XỬ LÝ DỮ LIỆU

In [None]:
def preprocess_data(df_train, df_val):
    """Clean both frames and add the ``Label`` and ``Content`` columns.

    For each frame, in place:
      1. missing values are replaced by empty strings;
      2. ``Label`` is derived from ``Spam/Ham`` ('spam' -> 1, 'ham' -> 0);
      3. duplicated rows are dropped;
      4. ``Content`` is built as ``Subject + " " + Message`` and normalised:
         lower-cased, URLs replaced by ``URLTOKEN``, e-mail addresses
         replaced by ``EMAILTOKEN``, ASCII punctuation removed and runs of
         whitespace collapsed to a single space.

    Args:
        df_train: Training frame with ``Subject``, ``Message`` and
            ``Spam/Ham`` columns. Modified in place.
        df_val: Validation frame with the same columns. Modified in place.

    Returns:
        The same ``(df_train, df_val)`` objects that were passed in.
    """
    url_pattern = re.compile(r'http\S+|www\S+')
    email_pattern = re.compile(r'\S+@\S+')
    whitespace_pattern = re.compile(r'\s+')
    punctuation_table = str.maketrans('', '', string.punctuation)

    def clean_text(text):
        """Normalise one document; non-string values are returned untouched."""
        if not isinstance(text, str):
            return text
        text = text.lower()
        # URLs and e-mail addresses must be replaced BEFORE punctuation is
        # stripped: once '@', '.', ':' and '/' are gone an address can no
        # longer be recognised. The upper-case tokens contain no punctuation,
        # so they survive the following steps unchanged.
        text = url_pattern.sub('URLTOKEN', text)
        text = email_pattern.sub('EMAILTOKEN', text)
        text = text.translate(punctuation_table)
        return whitespace_pattern.sub(' ', text).strip()

    for df in (df_train, df_val):
        df.fillna("", inplace=True)
        df['Label'] = df['Spam/Ham'].map({'spam': 1, 'ham': 0})
        df.drop_duplicates(inplace=True)
        df['Content'] = (df['Subject'] + " " + df['Message']).apply(clean_text)

    return df_train, df_val

CHƯƠNG 3: MÔ HÌNH PHÂN LOẠI EMAIL SPAM

In [None]:
class NaiveBayes:
    """Multinomial Naive Bayes text classifier with additive smoothing.

    Documents are converted with ``str()`` and tokenised by splitting on
    whitespace. After ``fit`` the model exposes:

    * ``class_probs``: class label -> prior probability of the class;
    * ``word_probs``: class label -> {word: smoothed P(word | class)} for
      every word of the vocabulary;
    * ``vocabulary``: set of all words seen during training;
    * ``total_words_in_class``: class label -> number of training tokens.
    """

    def __init__(self, alpha=1.0):
        """Create an unfitted model.

        Args:
            alpha: Additive smoothing constant added to every word count.
        """
        self.alpha = alpha
        self.class_probs = {}
        self.word_probs = {}
        self.vocabulary = set()
        self.total_words_in_class = {}

    def _smoothed_prob(self, count, total_words, vocab_size):
        """Return ``(count + alpha) / (total_words + vocab_size * alpha)``.

        Returns 0.0 instead of dividing by zero when the denominator is 0
        (e.g. ``alpha == 0`` for a class without any token); ``predict``
        then applies its probability floor.
        """
        denominator = total_words + vocab_size * self.alpha
        if denominator == 0:
            return 0.0
        return (count + self.alpha) / denominator

    def fit(self, X, y):
        """Estimate class priors and smoothed word likelihoods.

        Any state left by a previous call to ``fit`` is discarded.

        Args:
            X: Iterable of documents.
            y: Iterable of class labels, aligned with ``X`` by position.

        Raises:
            ValueError: If ``X`` and ``y`` do not have the same length.
        """
        # Materialise both inputs so they are paired by position. Indexing
        # ``y[i]`` would be label-based on a pandas Series whose index is no
        # longer 0..n-1 (for example after drop_duplicates).
        docs = [str(doc) for doc in X]
        labels = list(y)
        if len(docs) != len(labels):
            raise ValueError(
                f"X and y must have the same length, got {len(docs)} and {len(labels)}"
            )

        # Start from a clean slate so that refitting does not leak the
        # vocabulary or the priors of an earlier training run.
        self.class_probs = {}
        self.word_probs = {}
        self.vocabulary = set()
        self.total_words_in_class = {}

        total_docs = len(labels)
        for cls, count in Counter(labels).items():
            self.class_probs[cls] = count / total_docs

        word_counts_by_class = {}
        for doc, label in zip(docs, labels):
            word_counts_by_class.setdefault(label, Counter()).update(doc.split())

        for cls, word_counts in word_counts_by_class.items():
            self.vocabulary.update(word_counts)
            self.total_words_in_class[cls] = sum(word_counts.values())

        # The vocabulary must be complete before any likelihood is computed,
        # because its size is part of every denominator.
        vocab_size = len(self.vocabulary)
        for cls, word_counts in word_counts_by_class.items():
            total_words = self.total_words_in_class[cls]
            # Counter returns 0 for words that never occurred in this class.
            self.word_probs[cls] = {
                word: self._smoothed_prob(word_counts[word], total_words, vocab_size)
                for word in self.vocabulary
            }

    def predict(self, X):
        """Return the most probable class label for every document in ``X``.

        Scores are sums of log-probabilities. Words outside the vocabulary
        get the smoothed probability of a zero count; a probability of 0 is
        replaced by ``1e-10`` before taking the logarithm. On ties the class
        that comes first in ``class_probs`` wins.

        Args:
            X: Iterable of documents.

        Returns:
            A list with one predicted label per document.
        """
        vocab_size = len(self.vocabulary)
        log_floor = math.log(1e-10)

        # Everything that depends only on the class is computed once here
        # rather than once per word.
        class_params = []
        for cls, prior in self.class_probs.items():
            total_words_cls = self.total_words_in_class.get(cls, 0)
            unseen_prob = self._smoothed_prob(0, total_words_cls, vocab_size)
            class_params.append(
                (cls, math.log(prior), self.word_probs.get(cls, {}), unseen_prob)
            )

        predictions = []
        for doc in X:
            words = str(doc).split()
            scores = {}
            for cls, log_prior, probs, unseen_prob in class_params:
                score = log_prior
                for word in words:
                    word_prob = probs.get(word, unseen_prob)
                    score += math.log(word_prob) if word_prob > 0 else log_floor
                scores[cls] = score
            predictions.append(max(scores, key=scores.get))
        return predictions

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the labels ``y``.

        Args:
            X: Iterable of documents.
            y: Iterable of true labels, aligned with ``X`` by position.

        Returns:
            Fraction of documents whose predicted label equals the true one.

        Raises:
            ZeroDivisionError: If ``y`` is empty.
        """
        labels = list(y)
        predictions = self.predict(X)
        correct_predictions = sum(
            1 for predicted, actual in zip(predictions, labels) if predicted == actual
        )
        return correct_predictions / len(labels)


In [None]:
def train_and_evaluate_model(model, df_train, df_val):
    """Fit *model* on the training frame and print metrics for both frames.

    The ``Content`` column of each frame is cast to ``str`` in place. The
    model is trained on the training ``Content``/``Label`` columns, then
    accuracy, precision, recall, the confusion matrix and the classification
    report are printed for the training frame and for the validation frame.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.
        df_train: Training frame with ``Content`` and ``Label`` columns.
        df_val: Validation frame with ``Content`` and ``Label`` columns.
    """
    for frame in (df_train, df_val):
        frame['Content'] = frame['Content'].astype(str)

    model.fit(df_train['Content'].tolist(), df_train['Label'].tolist())

    evaluation_runs = (
        ("Đánh giá hiệu suất trên tập train:", df_train),
        ("\nĐánh giá hiệu suất trên tập val:", df_val),
    )
    for header, frame in evaluation_runs:
        predicted = model.predict(frame['Content'].tolist())
        expected = frame['Label'].tolist()

        print(header)
        print("Accuracy:", accuracy_score(expected, predicted))
        print("Precision:", precision_score(expected, predicted))
        print("Recall:", recall_score(expected, predicted))
        print("Confusion Matrix:\n", confusion_matrix(expected, predicted))
        print("Classification Report:\n", classification_report(expected, predicted))


CHƯƠNG 4: THỬ NGHIỆM THỰC TẾ

In [None]:
def predict_an_email(model):
    """Read one email from standard input, classify it and print the result.

    The subject and the message are requested with ``input``. They are run
    through ``preprocess_data`` exactly like the training data, and the
    resulting ``Content`` is passed to ``model.predict``.

    Args:
        model: Trained classifier exposing ``predict(X)``; a prediction of
            1 is reported as SPAM, anything else as HAM.
    """
    subject = input("Nhập tiêu đề email (Subject): ")
    message = input("Nhập nội dung email (Message): ")

    # preprocess_data works on (train, val) frames that carry the Subject,
    # Message and Spam/Ham columns. Wrap the email in a one-row frame with a
    # placeholder label and pass an empty frame for the second slot.
    schema = ['Subject', 'Message', 'Spam/Ham']
    single_email = pd.DataFrame([[subject, message, 'ham']], columns=schema)
    empty_frame = pd.DataFrame(columns=schema)

    cleaned_email, _ = preprocess_data(single_email, empty_frame)

    # preprocess_data merged Subject and Message into the Content column.
    prediction = model.predict(cleaned_email['Content'].tolist())

    print("Kết quả dự đoán: SPAM" if prediction[0] == 1 else "Kết quả dự đoán: HAM")

In [None]:
def evaluate_on_a_file(model, csv_path):
    """Evaluate a trained model on a labelled CSV file and print the metrics.

    The file is read with its first column as the index, a ``split`` column
    is dropped when present, and the frame is cleaned with
    ``preprocess_data``. Accuracy, precision, recall, the confusion matrix
    and the classification report are then printed.

    Errors are reported with a printed message instead of being raised, so a
    bad file does not abort the calling cell.

    Args:
        model: Trained classifier exposing ``predict(X)``.
        csv_path: Path of a CSV file with ``Subject``, ``Message`` and
            ``Spam/Ham`` columns.
    """
    try:
        df = pd.read_csv(csv_path, index_col=0)
        # Only the frame that was just loaded is inspected; this function
        # must not depend on any notebook-global DataFrame.
        if 'split' in df.columns:
            df.drop(columns=["split"], inplace=True)

        # preprocess_data expects a (train, val) pair; an empty frame with
        # the same columns fills the second slot.
        df_eval_processed, _ = preprocess_data(df, pd.DataFrame(columns=df.columns))

        df_eval_processed['Content'] = df_eval_processed['Content'].astype(str)
        actual_labels_eval = df_eval_processed['Label'].tolist()

        nb_predictions_eval = model.predict(df_eval_processed['Content'].tolist())

        print(f"\nĐánh giá hiệu suất trên tập {csv_path}:")
        print("Accuracy:", accuracy_score(actual_labels_eval, nb_predictions_eval))
        print("Precision:", precision_score(actual_labels_eval, nb_predictions_eval))
        print("Recall:", recall_score(actual_labels_eval, nb_predictions_eval))
        print("Confusion Matrix:\n", confusion_matrix(actual_labels_eval, nb_predictions_eval))
        print("Classification Report:\n", classification_report(actual_labels_eval, nb_predictions_eval))

    except FileNotFoundError:
        print(f"Lỗi: Không tìm thấy file '{csv_path}'.")
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and return.
        print(f"Đã xảy ra lỗi khi xử lý file '{csv_path}': {e}")


In [None]:
### Build a second CSV file that is used to try out chapter 4
# Each sample is a (subject, message) pair; the label is attached below.
_SPAM_SAMPLES = [
    (
        "Get Paid To Work From Home",
        "Are you tired of your 9-to-5 job? Join thousands of people who earn over $500 a day from home. No experience required. Click here to get started: www.workfastnow.com",
    ),
    (
        "Claim Your Free Gift Card Today",
        "Congratulations! You have been selected to receive a $1000 Walmart gift card. Complete the survey to claim your prize.",
    ),
    (
        "New Weight Loss Secret Doctors Won’t Tell You",
        "Lose 10kg in two weeks with this Japanese tonic. Proven by thousands. Learn the truth now.",
    ),
    (
        "Your Mortgage Pre-Approval Expires Soon",
        "Rates are at historic lows! Lock in 2.3% APR. Apply now.",
    ),
    (
        "Eliminate All Debt Legally",
        "Government program to erase your credit card and student loan debt. No fees, no checks. Start today.",
    ),
    (
        "Protect Your Identity Online",
        "Get our identity protection service free for 30 days and stay safe from hackers.",
    ),
    (
        "Increase Your Manhood",
        "Boost your performance naturally. Discreet shipping. Guaranteed results.",
    ),
    (
        "Act Now – You’ve Been Selected!",
        "One of 50 people to receive this investment opportunity. No risk. High return.",
    ),
    (
        "Earn Bitcoin Daily",
        "Our system helps you earn Bitcoin passively. No tech knowledge required.",
    ),
    (
        "Save Big On Insurance",
        "Compare car insurance quotes and save $700/year. Enter zip code.",
    ),
]

_HAM_SAMPLES = [
    (
        "Final Notice - Pay Your Invoice",
        "Your payment is past due. Services will be terminated if unpaid. Pay now.",
    ),
    (
        "Your iCloud Account Is Locked",
        "Suspicious activity detected. Sign in to restore account.",
    ),
    (
        "Quarterly Project Update",
        "Attached is the Q2 update with metrics and blockers. Let's schedule a review next week.",
    ),
    (
        "IT Support Request - Laptop Issue",
        "My laptop is overheating and shutting down. Can IT take a look?",
    ),
    (
        "Final Exam Schedule",
        "Final exam for MTH56-2425 is Tuesday, 9:00 AM, Room 405. Review material uploaded Friday.",
    ),
    (
        "Monthly Newsletter – November",
        "Welcome to the November newsletter! Highlights include student work and upcoming events.",
    ),
    (
        "Reimbursement for Conference Travel",
        "I’ve attached all receipts and forms for AI Conference reimbursement.",
    ),
    (
        "Dinner this weekend?",
        "Are you free Saturday night? Thought we could try the new Italian place near the park.",
    ),
    (
        "Library Book Due Reminder",
        "Just a heads-up: your books are due back on March 31st. Renew online if needed.",
    ),
    (
        "Cousin’s Wedding Invite",
        "You’re invited to Emily’s wedding on May 20th at River Gardens. Formal invite to follow.",
    ),
]

# Spam records come first, then ham records, each in the order listed above.
emails = [
    {"Subject": subject, "Message": message, "Spam/Ham": label}
    for label, samples in (("spam", _SPAM_SAMPLES), ("ham", _HAM_SAMPLES))
    for subject, message in samples
]

df_emails = pd.DataFrame(emails)
df_emails.to_csv("sample_emails_for_testing.csv")


MÃ NGUỒN CHÍNH

In [None]:
if __name__ == "__main__":

    # End-to-end driver: load the data, preprocess it, train and evaluate
    # the classifier, then try it on user input and on a test file.

    ### CHAPTER 1
    print('CHƯƠNG 1\n')
    df_train, df_val = load_and_process_data('train.csv', 'val.csv')
    print('\n--------------------------------------------------------------------------------------------')

    ### CHAPTER 2
    print('CHƯƠNG 2\n')
    df_train, df_val = preprocess_data(df_train, df_val)
    print("Đã thực hiện tiền xử lí dữ liệu thành công")
    print('\n--------------------------------------------------------------------------------------------')

    ### CHAPTER 3
    print('CHƯƠNG 3\n')
    nb_model = NaiveBayes(alpha=0.05)
    train_and_evaluate_model(nb_model, df_train, df_val)
    print('\n--------------------------------------------------------------------------------------------')

    ### CHAPTER 4
    print('CHƯƠNG 4\n')
    predict_an_email(nb_model)
    testfile = "sample_emails_for_testing.csv" # change the file name here to evaluate another file
    evaluate_on_a_file(nb_model, testfile)



CHƯƠNG 1

5 dòng đầu tiên của tập train:


Unnamed: 0,Message ID,Subject,Message,Spam/Ham
0,0,christmas tree farm pictures,,ham
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham
5,5,mcmullen gas for 11 / 99,"jackie ,\nsince the inlet to 3 river plant is ...",ham



Một số thông tin tập train:
<class 'pandas.core.frame.DataFrame'>
Index: 27284 entries, 0 to 33715
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Message ID  27284 non-null  int64 
 1   Subject     27055 non-null  object
 2   Message     26932 non-null  object
 3   Spam/Ham    27284 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.0+ MB
None

5 dòng đầu tiên của tập val:


Unnamed: 0,Message ID,Subject,Message,Spam/Ham
23,23,miscellaneous,- - - - - - - - - - - - - - - - - - - - - - fo...,ham
24,24,re : purge of old contract _ event _ status,fyi - what do you all think ?\n- - - - - - - -...,ham
32,32,valero 8018 and 1394,it is my understanding the outages valero incu...,ham
37,37,01 / 00 natural gas nomination,enron methanol company nominates the following...,ham
43,43,re : misc . questions,- - - - - - - - - - - - - - - - - - - - - - fo...,ham



Một số thông tin tập val:
<class 'pandas.core.frame.DataFrame'>
Index: 3084 entries, 23 to 33692
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Message ID  3084 non-null   int64 
 1   Subject     3055 non-null   object
 2   Message     3049 non-null   object
 3   Spam/Ham    3084 non-null   object
dtypes: int64(1), object(3)
memory usage: 120.5+ KB
None

--------------------------------------------------------------------------------------------
CHƯƠNG 2

Đã thực hiện tiền xử lí dữ liệu thành công

--------------------------------------------------------------------------------------------
CHƯƠNG 3

Đánh giá hiệu suất trên tập train:
Accuracy: 0.9952353027415335
Precision: 0.9944532488114105
Recall: 0.9961754942993217
Confusion Matrix:
 [[13349    77]
 [   53 13805]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00     13426
           1      

(Nhận xét: Tập test thử chỉ gồm 10 email spam và 10 email không phải spam, đồng thời các email này không khó để phân loại, nên việc phân loại cho ra kết quả tuyệt đối.)