In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, confusion_matrix
import pickle
from tqdm import tqdm
from dataclasses import dataclass
from scipy.sparse import vstack




## Preprocess

In [104]:
df = pd.read_csv('../data/SMSSpamCollection', sep='\t', names=['label', 'message'])
#df.head()

# data cleaning function

def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = text.lower()
    #remove html tags
    text = re.sub(r'<.*?>', '', text)
    #remove urls
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    #remove punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text)
    #remove emails
    text = re.sub(r'\S+@\S+', '', text)
    #remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['cleaned_message'] = df['message'].apply(clean_text)
#df.head()

# encode labels
df['label'] = df['label'].map({'ham': 0, 'spam': 1})




#


## Vectorize
Vectorizing using TF-IDF(Term Frequency - Inverse Document Frequency)


In [105]:
vectorizer = TfidfVectorizer(
    #stop_words='english',
    ngram_range=(1, 2),
    max_df=0.9,
    min_df=2
)


X = vectorizer.fit_transform(df['cleaned_message'])
y = df['label'].values

print(f"matrix shape: {X.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
# Save the vectorizer
with open('../models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# see a few top words
print(list(vectorizer.vocabulary_.keys())[:20])

# example vector for one email
sample_vec = X[0]
print("Non-zero features in first email:", sample_vec.nnz)


matrix shape: (5572, 14738)
Vocabulary size: 14738
['go', 'until', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'great', 'world', 'la', 'buffet', 'cine', 'there', 'got', 'wat', 'only in', 'ok', 'lar', 'joking']
Non-zero features in first email: 17


## Train and predict NSA

In [106]:
# seperate self(ham) and non-self (spam) 
X_ham = X[y == 0]
X_spam = X[y == 1]

print(f"Ham matrix shape: {X_ham.shape}")
print(f"Spam matrix shape: {X_spam.shape}")

Ham matrix shape: (4825, 14738)
Spam matrix shape: (747, 14738)


In [107]:
# split self data into train and test sets
X_ham_train, X_ham_test = train_test_split(X_ham, test_size=0.2, random_state=42)

print(f"Ham train shape: {X_ham_train.shape}")
print(f"Ham test shape: {X_ham_test.shape}")

Ham train shape: (3860, 14738)
Ham test shape: (965, 14738)


In [108]:
# define simple detector class
# a detector is defined by a centroid vector and a radius anything that lies within the radius is considered spam
@dataclass
class Detector:
    vector : np.ndarray # centroid vector
    radius : float   # detection radius



In [109]:
# V-detector generation
class VDetectorNSA:
    def __init__(self, min_threshold: float, max_threshold: float, max_detectors: int, max_tries: int):
        assert 0 <= min_threshold < max_threshold <= 1, "Thresholds must be in [0, 1] and min < max"
        self.min_threshold = min_threshold
        self.max_threshold = max_threshold
        self.max_detectors = max_detectors
        self.max_tries = max_tries
        self.detectors = []

    def random_vector(self, dim: int, k: int) -> np.ndarray: # generate random sparse vector with k non-zero entries
        idx = np.random.choice(dim, size=min(k, dim), replace=False)
        vals = np.random.rand(len(idx)) + 0.1  # avoid zero values
        vec = np.zeros(dim, dtype=np.float32)
        vec[idx] = vals
        vec /= np.linalg.norm(vec)  # normalize
        return vec
    
    def fit(self, X_ham_train: np.ndarray) -> "VDetectorNSA":
        n_self, dim = X_ham_train.shape
        accepted = 0
        tries = 0

        print("Generating detectors...")
        while accepted < self.max_detectors and tries < self.max_tries:
            tries += 1
            radius = np.random.uniform(self.min_threshold, self.max_threshold)
            cand_vec = self.random_vector(dim, k=10)  # sparse vector with 10 non-zero entries
            # quick regection test using 200 self random batch
            if n_self > 200:
                sample_idx = np.random.choice(n_self, size=200, replace=False)
                sim_batch = (X_ham_train[sample_idx] @ cand_vec).ravel()
                if np.any(sim_batch >= radius):
                    continue  # reject candidate
            # full test
            sim_full = (X_ham_train @ cand_vec).ravel()
            if np.any(sim_full >= radius):
                continue  # reject candidate    

            # accept candidate
            self.detectors.append(Detector(vector=cand_vec, radius=radius))
            accepted += 1
            print(f"Accepted detector {accepted} with radius {radius:.4f} after {tries} tries.")

        return self
    
    def predict(self, X: np.ndarray) -> np.ndarray:
       if not self.detectors:
            raise ValueError("No detectors found. Please fit the model first.")
       n_samples = X.shape[0]
       predictions = np.zeros(n_samples, dtype=int)  # default to self (0)
       # stack detector vectors for batch processing
       D = np.vstack([d.vector for d in self.detectors])
       R = np.array([d.radius for d in self.detectors])[None, :]  # shape (num_detectors, 1)

       print("Predicting labels...")
       batch_size = 500
       for start in range(0, n_samples, batch_size):
           end = min(start + batch_size, n_samples)
           X_batch = X[start:end]  # shape (batch_size, dim)
           sims = X_batch @ D.T  # shape (batch_size, num_detectors)
           sims = sims.A if hasattr(sims, 'A') else sims  # convert to dense if sparse
           hits = (sims >= R)  # shape (batch_size, num_detectors
           predictions[start:end] = hits.any(axis=1).astype(int)  # spam (1) if any detector hits
        
       return predictions



        

In [110]:

nsa = VDetectorNSA(
    min_threshold=0.05,
    max_threshold=0.3,
    max_detectors=400,
    max_tries=8000
)


nsa.fit(X_ham_train)

print("Detectors learned:", len(nsa.detectors))
print("First few radii:", [round(d.radius, 3) for d in nsa.detectors[:5]])

Generating detectors...
Accepted detector 1 with radius 0.2808 after 2 tries.
Accepted detector 2 with radius 0.2641 after 4 tries.
Accepted detector 3 with radius 0.2559 after 5 tries.
Accepted detector 4 with radius 0.2964 after 6 tries.
Accepted detector 5 with radius 0.2801 after 9 tries.
Accepted detector 6 with radius 0.2019 after 10 tries.
Accepted detector 7 with radius 0.2998 after 12 tries.
Accepted detector 8 with radius 0.1893 after 20 tries.
Accepted detector 9 with radius 0.1624 after 21 tries.
Accepted detector 10 with radius 0.2082 after 37 tries.
Accepted detector 11 with radius 0.1335 after 38 tries.
Accepted detector 12 with radius 0.1416 after 39 tries.
Accepted detector 13 with radius 0.2847 after 56 tries.
Accepted detector 14 with radius 0.1962 after 57 tries.
Accepted detector 15 with radius 0.1471 after 59 tries.
Accepted detector 16 with radius 0.2436 after 61 tries.
Accepted detector 17 with radius 0.1562 after 64 tries.
Accepted detector 18 with radius 0.283

In [111]:


sample_ham = X_ham_train[:10]  # pick 10 ham emails
d = nsa.detectors[0]
sims = sample_ham @ d.vector
print("Cosine similarities:", sims.ravel())
print("Detector radius:", d.radius)


Cosine similarities: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Detector radius: 0.2808480968669162


In [112]:

radii = [d.radius for d in nsa.detectors]
print("Mean radius:", np.mean(radii))
print("Min:", np.min(radii), "Max:", np.max(radii))


Mean radius: 0.23460477489302498
Min: 0.0976940154172073 Max: 0.2998249587718028


In [113]:
X_eval = vstack([X_ham_test, X_spam])
y_eval = np.hstack([
    np.zeros(X_ham_test.shape[0]),  # ham = 0
    np.ones(X_spam.shape[0])      # spam = 1
])

y_pred = nsa.predict(X_eval)


Predicting labels...


## Evaluate

In [114]:
print("=== Classification Report ===")
report = classification_report(y_eval, y_pred, target_names=["ham", "spam"], output_dict=True)
print(pd.DataFrame(report).transpose())

print("\n=== Confusion Matrix ===")
cm = confusion_matrix(y_eval, y_pred)
print(pd.DataFrame(cm,
                   index=["True ham", "True spam"],
                   columns=["Pred ham", "Pred spam"]))

=== Classification Report ===
              precision    recall  f1-score     support
ham            0.557807  0.969948  0.708286   965.00000
spam           0.147059  0.006693  0.012804   747.00000
accuracy       0.549650  0.549650  0.549650     0.54965
macro avg      0.352433  0.488321  0.360545  1712.00000
weighted avg   0.378584  0.549650  0.404825  1712.00000

=== Confusion Matrix ===
           Pred ham  Pred spam
True ham        936         29
True spam       742          5
