In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split


# Load the corpus: a tab-separated file with no header row, so the two
# columns are named explicitly after reading.
df = pd.read_csv('spamhamdata.csv', sep='\t', header=None)
df.columns = ["Category", "Text"]


# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# shuffled split reproducible between runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.25,
    random_state=42,
    shuffle=True,
)


def to_alpha(s):
    """Return ``s`` lower-cased with everything except ``a``-``z`` and spaces removed.

    Removed characters (digits, punctuation, tabs, newlines, non-ASCII
    letters, ...) are deleted outright rather than replaced by a space, so
    tokens separated only by such characters end up joined together.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        The filtered, lower-case text; existing spaces are preserved as-is.
    """
    lowered = s.lower()
    return re.sub(r'[^a-z ]', '', lowered)


# Collect every token of at least two characters from the training messages
# (single-letter tokens are skipped), then reduce to the vocabulary.
word_list = [
    token
    for message in train_df["Text"].values
    for token in to_alpha(message).split()
    if len(token) >= 2
]
# np.unique returns the distinct tokens in sorted order.
unique_word = np.unique(word_list)


# Build the bag-of-words count matrix for the training set: one row per
# training message, one column per vocabulary word, plus a trailing
# "Category" column holding the label.
n = len(train_df)
m = len(unique_word)

# Map each vocabulary word to its column position once. Looking a token up in
# this dict is constant-time, whereas `word in unique_word` on a NumPy array
# compares the token against every vocabulary entry for every token.
_train_cols = {str(vocab_word): j for j, vocab_word in enumerate(unique_word)}

# Accumulate counts in a plain ndarray and wrap it in a DataFrame at the end;
# this avoids a scalar label-based `.loc` write per token and makes the cell
# idempotent (re-running it cannot double-count into an existing frame).
_train_counts = np.zeros((n, m))
for i, text in enumerate(train_df["Text"].values):
    for word in to_alpha(text).split():
        j = _train_cols.get(word)
        if j is not None:  # tokens outside the vocabulary are ignored
            _train_counts[i, j] += 1

# Row i of the array corresponds to train_df.index[i], so reusing that index
# keeps the matrix aligned with train_df.
# NOTE(review): assumes train_df.index is unique (true for a freshly read CSV).
train_matrix = pd.DataFrame(_train_counts, columns=unique_word, index=train_df.index)

# Label goes last; downstream code drops it with `.iloc[:, :-1]`.
train_matrix["Category"] = train_df["Category"]


# Class centroids: for each label, average the word-count columns (everything
# except the trailing "Category" column) over the training rows of that label.
cent_spam, cent_ham = (
    train_matrix[train_matrix["Category"] == label].iloc[:, :-1].mean(axis=0).values
    for label in ("spam", "ham")
)


def category(x, cent_s, cent_h):
    """Label ``x`` by whichever centroid is closer in Euclidean distance.

    Parameters
    ----------
    x : array-like
        Feature vector to classify.
    cent_s : array-like
        Spam centroid, broadcast-compatible with ``x``.
    cent_h : array-like
        Ham centroid, broadcast-compatible with ``x``.

    Returns
    -------
    str
        ``"ham"`` only when ``x`` is strictly closer to ``cent_h``;
        otherwise (including exact ties) ``"spam"``.
    """
    to_spam = np.linalg.norm(x - cent_s)
    to_ham = np.linalg.norm(x - cent_h)
    if to_ham < to_spam:
        return "ham"
    return "spam"


# Build the bag-of-words count matrix for the test set using the vocabulary
# learned from the training data; test tokens outside it are ignored.

# Word -> column position lookup. A dict lookup is constant-time, whereas
# `word in unique_word` on a NumPy array scans the whole vocabulary per token.
_test_cols = {str(vocab_word): j for j, vocab_word in enumerate(unique_word)}

# Accumulate in a plain ndarray and create the DataFrame once, instead of a
# scalar label-based `.loc` increment for every token.
_test_counts = np.zeros((len(test_df), m))
for i, text in enumerate(test_df["Text"].values):
    for word in to_alpha(text).split():
        j = _test_cols.get(word)
        if j is not None:
            _test_counts[i, j] += 1

# Row i of the array corresponds to test_df.index[i].
# NOTE(review): assumes test_df.index is unique (true for a freshly read CSV).
test_matrix = pd.DataFrame(_test_counts, columns=unique_word, index=test_df.index)


# Count test messages whose predicted label differs from the true label.
# test_matrix rows are in the same order as test_df, so rows and labels are
# paired positionally; the label is the first column of test_df.
misclassified = sum(
    category(features, cent_spam, cent_ham) != truth
    for features, truth in zip(test_matrix.values, test_df.iloc[:, 0].values)
)

print("Test samples:", len(test_matrix))
print("Misclassified:", misclassified)


Test samples: 1393
Misclassified: 105
