In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, confusion_matrix
import pickle



## Preprocess

In [None]:
df = pd.read_csv('../data/SMSSpamCollection', sep='\t', names=['label', 'message'])
#df.head()

# data cleaning function

def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = text.lower()
    #remove html tags
    text = re.sub(r'<.*?>', '', text)
    #remove urls
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    #remove punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text)
    #remove emails
    text = re.sub(r'\S+@\S+', '', text)
    #remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['cleaned_message'] = df['message'].apply(clean_text)
#df.head()

# encode labels
df['label'] = df['label'].map({'ham': 0, 'spam': 1})




#


Unnamed: 0,label,message,cleaned_message
0,0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,0,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


## Vectorize
Vectorizing using TF-IDF(Term Frequency - Inverse Document Frequency)


In [14]:
vectorizer = TfidfVectorizer(
    stop_words='english'
    #ngram_range=(1, 2),
    #max_df=0.9,
    #min_df=2
)


X = vectorizer.fit_transform(df['cleaned_message'])
y = df['label'].values

print(f"matrix shape: {X.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
# Save the vectorizer
with open('../models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# see a few top words
print(list(vectorizer.vocabulary_.keys())[:20])

# example vector for one email
sample_vec = X[0]
print("Non-zero features in first email:", sample_vec.nnz)


matrix shape: (5572, 9211)
Vocabulary size: 9211
['jurong', 'point', 'crazy', 'available', 'bugis', 'great', 'world', 'la', 'buffet', 'cine', 'got', 'amore', 'wat', 'ok', 'lar', 'joking', 'wif', 'oni', 'free', 'entry']
Non-zero features in first email: 13


## Train NSA

## Predict

## Evaluate