In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, confusion_matrix
import pickle
from tqdm import tqdm
from dataclasses import dataclass
from scipy.sparse import vstack




## Preprocess

In [None]:
df = pd.read_csv('../data/SMSSpamCollection', sep='\t', names=['label', 'message'])
#df.head()

# data cleaning function

def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = text.lower()
    #remove html tags
    text = re.sub(r'<.*?>', '', text)
    #remove urls
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    #remove punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text)
    #remove emails
    text = re.sub(r'\S+@\S+', '', text)
    #remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['cleaned_message'] = df['message'].apply(clean_text)
#df.head()

# encode labels
df['label'] = df['label'].map({'ham': 0, 'spam': 1})




#


Unnamed: 0,label,message,cleaned_message
0,0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,0,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


## Vectorize
Vectorizing using TF-IDF(Term Frequency - Inverse Document Frequency)


In [14]:
vectorizer = TfidfVectorizer(
    stop_words='english'
    #ngram_range=(1, 2),
    #max_df=0.9,
    #min_df=2
)


X = vectorizer.fit_transform(df['cleaned_message'])
y = df['label'].values

print(f"matrix shape: {X.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
# Save the vectorizer
with open('../models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# see a few top words
print(list(vectorizer.vocabulary_.keys())[:20])

# example vector for one email
sample_vec = X[0]
print("Non-zero features in first email:", sample_vec.nnz)


matrix shape: (5572, 9211)
Vocabulary size: 9211
['jurong', 'point', 'crazy', 'available', 'bugis', 'great', 'world', 'la', 'buffet', 'cine', 'got', 'amore', 'wat', 'ok', 'lar', 'joking', 'wif', 'oni', 'free', 'entry']
Non-zero features in first email: 13


## Train NSA

In [26]:
X_self = X[y == 0]  # ham messages
#print(f"Self samples:{X_self.shape[0]}") # 4825 samples
X_nonself = X[y == 1]  # spam messages
#print(f"Non-self samples:{X_nonself.shape[0]}") # 747 samples

X_self_train, X_self_test = train_test_split(X_self, test_size=0.2, random_state=42)

# Define NSA detector class
@dataclass
class Detector:
    vector: np.ndarray
    radius: float

class VDetectorNSA:
    def __init__(self, minthreshold: float = 0.3, maxthreshold: float = 0.7, max_detectors: int = 300, max_trials: int = 10000):
        self.minthreshold = minthreshold
        self.maxthreshold = maxthreshold
        self.max_detectors = max_detectors
        self.max_trials = max_trials
        self.detectors = []
    
    def random_vector(self, dim: int, active_k: int=10) -> np.ndarray:
        '''Create a random sparse-like vector with `active_k` entries'''
        idx = np.random.choice(dim, size=min(active_k, dim), replace=False)
        vals = np.random.rand(len(idx)) + 0.1  # avoid zeros
        v = np.zeros(dim, dtype=np.float32)
        v[idx] = vals
        v = v / np.linalg.norm(v)  # normalize
        return v
    
    def fit(self, training_data: np.ndarray):
        data, dim = training_data.shape
        trials, accepted = 0, 0

        print("Generating detectors...")
        while accepted < self.max_detectors and trials < self.max_trials:
            trials += 1
            threshold = np.random.uniform(self.minthreshold, self.maxthreshold)
            candidate = self.random_vector(dim)

            #check similarity to self samples
            sims = training_data @ candidate
            if np.any(sims.A.ravel() >= threshold):
                continue  # candidate too similar to self

            # Accept the candidate
            self.detectors.append(Detector(vector=candidate, radius=threshold))
            accepted += 1
        
        print(f"Generated {accepted} detectors in {trials} trials.")
        return self

# train NSA detector
nsa_detector = VDetectorNSA()
nsa_detector.fit(X_self_train)

Generating detectors...


AttributeError: 'numpy.ndarray' object has no attribute 'A'

## Predict

In [25]:
def predict(VDetectorNSA, X: np.ndarray) -> np.ndarray:
    if not VDetectorNSA.detectors:
        raise ValueError("Detector has not been fitted with training data.")
    
    n_samples = X.shape[0]
    preds = np.zeros(n_samples, dtype=int)

    #Stack all detector vectors for efficient computation
    D = np.vstack([d.vector for d in VDetectorNSA.detectors])
    radii = np.array([d.radius for d in VDetectorNSA.detectors])[None, :]

    print("Predicting...")
    batch_size = 256 # process in batches to save memory
    for start in range(0, n_samples, batch_size):
        end = min(start + batch_size, n_samples)
        X_batch = X[start:end]

        sims = X_batch @ D.T  # shape (batch_size, n_detectors)
        sims = sims.A  if hasattr(sims, 'A') else sims  # convert to dense if sparse

        # if similarity exceeds any detector's radius, mark as non-self (spam)
        hits = (sims >= radii)
        preds[start:end] = np.any(hits, axis=1).astype(int)
    return preds



## Evaluate

In [None]:
X_eval = vstack([X_self_test, X_nonself])
y_eval = np.hstack([
    np.zeros(X_self_test.shape[0], dtype=int),
    np.ones(X_nonself.shape[0], dtype=int)
]
)

