In [128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, confusion_matrix
import pickle
from tqdm import tqdm
from dataclasses import dataclass
from scipy.sparse import vstack




## Preprocess

In [129]:
# Load the SMS Spam Collection: a headerless, tab-separated file that is read
# into two columns, the class label and the message text.
# NOTE(review): pandas' default quoting treats '"' as a quote character, which
# can merge lines containing unbalanced quotes -- confirm the row count
# matches the dataset's documented size.
raw_columns = ['label', 'message']
df = pd.read_csv(
    '../data/SMSSpamCollection',
    sep='\t',
    names=raw_columns,
)
#df.head()

# data cleaning function

def clean_text(text: str) -> str:
    """Normalise a raw SMS message before vectorisation.

    Steps, in order: lowercase, strip HTML tags, strip URLs, strip e-mail
    addresses, drop punctuation, collapse whitespace.

    Args:
        text: Raw message text. Any non-string value is treated as empty.

    Returns:
        The cleaned message, or an empty string for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # remove html tags (non-greedy, so each tag is matched on its own)
    text = re.sub(r'<.*?>', '', text)
    # remove urls; the "http" alternative already covers "https" links
    text = re.sub(r'http\S+|www\S+', '', text)
    # remove emails -- this must run BEFORE punctuation is stripped, otherwise
    # the "@" is already gone and the pattern can never match
    text = re.sub(r'\S+@\S+', '', text)
    # remove punctuation; \w keeps letters, digits and underscores
    text = re.sub(r'[^\w\s]', '', text)
    # collapse runs of whitespace and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Store the normalised text next to the raw message.
df['cleaned_message'] = df['message'].map(clean_text)
#df.head()

# Encode the string labels as integers: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(label_codes)




#


## Vectorize
Vectorize the messages using TF-IDF (Term Frequency–Inverse Document Frequency).


In [154]:
# Vectorize the messages using TF-IDF.
# NOTE(review): the vectorizer is fitted on the full dataset, before any
# train/test split, so the vocabulary and IDF weights also reflect messages
# that are held out later -- confirm this is acceptable for the evaluation.
vectorizer = TfidfVectorizer(
    #stop_words='english',
    ngram_range=(1, 2),  # unigrams and bigrams
    max_df=0.9,          # float: drop terms present in more than 90% of messages
    min_df=2             # int: drop terms present in fewer than 2 messages
)


X_tfidf = vectorizer.fit_transform(df['cleaned_message'])
y = df['label'].values

print("TF-IDF matrix shape:", X_tfidf.shape)

# Convert to binary in order to use with NSA: a feature is switched on when
# its TF-IDF weight is at least tau.
tau = 0.05
X_binary = (X_tfidf >= tau).astype(np.uint8)

# basic diagnostics: fraction of matrix cells that are set
density = X_binary.nnz / (X_binary.shape[0] * X_binary.shape[1])
# ".6f" fixes the precision; the previous "6f" only set a minimum field width
print(f"Binary density: {density:.6f}")

#

TF-IDF matrix shape: (5572, 14738)
Binary density: 0.001315


- As expected, we get a total of 5572 messages.
- We have a large feature space of 14738 features (unigrams and bigrams).
- 

## Train and predict NSA

In [155]:
# seperate self(ham) and non-self (spam) 
# Row masks per class: label 0 is ham (self), label 1 is spam (non-self).
ham_mask = (y == 0)
spam_mask = (y == 1)

X_ham = X_binary[ham_mask]
X_spam = X_binary[spam_mask]

for title, matrix in (("Ham", X_ham), ("Spam", X_spam)):
    print(f"{title} matrix shape: {matrix.shape}")

Ham matrix shape: (4825, 14738)
Spam matrix shape: (747, 14738)


In [132]:
# Hold out 20% of the ham (self) rows for testing; the seed is fixed so the
# partition is the same on every run.
ham_partitions = train_test_split(X_ham, test_size=0.2, random_state=42)
X_ham_train, X_ham_test = ham_partitions

for title, matrix in (("train", X_ham_train), ("test", X_ham_test)):
    print(f"Ham {title} shape: {matrix.shape}")

Ham train shape: (3860, 51010)
Ham test shape: (965, 51010)


## Evaluate