# Spam Detection NLP Project

## Setup

#### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras.optimizers import Adam

from collections import Counter

nltk.download('stopwords')
from nltk.corpus import stopwords

from utils.preprocessing import clean_text
from utils.eda import plot_message_lengths, plot_top_words_by_class, plot_venn_words
from utils.baseline import run_baseline_model
from utils.bert import run_bert_model
from utils.csv import write_stats_to_csv


2025-05-24 21:53:30.295739: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-24 21:53:30.470194: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748145210.535576   28141 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748145210.555004   28141 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748145210.695575   28141 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# List the GPU devices TensorFlow detects; the result is shown as the cell output.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

#### Load Dataset


In [3]:
# Labelled SMS messages as a tab-separated file; column names are supplied
# here via `names` rather than read from the file.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', names=["label", "message"])

# Only a cell's final expression is rendered automatically, so a bare
# `df.head()` placed above `df.describe()` is silently discarded. `display`
# (available in the notebook namespace without an import) renders the preview
# explicitly; the summary stays as the cell's last expression.
display(df.head())
df.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


## Pre-Processing

#### Build Text Variants (Normalization, Stop-Word Removal, Lemmatization, Stemming)

In [4]:
# Text variants compared throughout the notebook. Each name is the DataFrame
# column holding that variant and, except for 'raw', also the mode string
# passed to clean_text.
VARIANTS = ['raw', 'normalized', 'stop', 'stop_lemma', 'stop_stem']
MAX_EXAMPLES = 5  # limit the printed examples to avoid overwhelming output

# 'raw' is the untouched message. Every other column is derived from VARIANTS
# itself, so the list and the DataFrame columns cannot drift apart.
df['raw'] = df['message']
for variant in VARIANTS[1:]:
    df[variant] = df['message'].apply(lambda text: clean_text(text, variant))


# Print examples where all variants are different
print("\nExamples where all variants are different:")
different_variants = []

for idx, row in df.iterrows():
    # Skip rows where at least two variants produced the same text.
    if len({row[name] for name in VARIANTS}) != len(VARIANTS):
        continue
    different_variants.append({
        'message': row['message'],
        # One entry per processed variant, in VARIANTS order.
        **{name: row[name] for name in VARIANTS[1:]},
        'label': row['label'],
    })
    if len(different_variants) >= MAX_EXAMPLES:
        break

# Display the examples in a formatted way
for i, example in enumerate(different_variants, 1):
    print(f"\nExample {i} ({example['label']}):")
    print(f"Original: {example['message']}")
    print(f"Normalized:      {example['normalized']}")
    print(f"Stop:     {example['stop']}")
    print(f"Lemma:    {example['stop_lemma']}")
    print(f"Stem:     {example['stop_stem']}")



Examples where all variants are different:

Example 1 (ham):
Original: Nah I don't think he goes to usf, he lives around here though


KeyError: 'normalized'

## EDA

In [None]:
# Run the same set of EDA plots for every text variant. Each call is given a
# save_path under data/eda/<variant>/.
for variant in VARIANTS:
    plot_message_lengths(
        df,
        column=variant,
        label_column='label',
        title=variant,
        save_path=f'data/eda/{variant}/message_lengths.png',
    )

    # Top-20 word chart per class: spam first, then ham.
    for label_value in ('spam', 'ham'):
        plot_top_words_by_class(
            df,
            column=variant,
            label_value=label_value,
            top_n=20,
            title=f'Top 20 words in {label_value.capitalize()} Messages ({variant})',
            save_path=f'data/eda/{variant}/top_{label_value}.png',
        )

    plot_venn_words(
        df,
        column=variant,
        title=f'Venn Diagram of Unique Words in Messages ({variant})',
        save_path=f'data/eda/{variant}/venn.png',
    )


## Train/Evaluate Models

In [6]:
# Seed passed as `random_state` to both the baseline and the BERT runs below.
# Presumably it controls the train/test split (it is passed alongside
# `test_size`) — confirm in utils.baseline / utils.bert.
RANDOM_STATE = 202505

### Baseline

In [None]:
# Train and evaluate the baseline model once per text variant, handing each
# run's stats to write_stats_to_csv.
for variant in VARIANTS:
    print(variant)
    _, _, baseline_stats = run_baseline_model(df, column=variant, label_column='label', test_size=0.2, random_state=RANDOM_STATE)
    write_stats_to_csv(baseline_stats, 'data/stats.csv')

    # Report misclassified messages inside the loop so every variant gets its
    # own report under its printed name. Placed after the loop, only the last
    # variant's stats would still be bound to `baseline_stats`.
    for error_kind in ('ham_as_spam', 'spam_as_ham'):
        print(error_kind)
        # Show at most the first 10 messages of each error kind.
        for missed in baseline_stats['misclassified'][error_kind][:10]:
            print(missed)



In [None]:
# BERT: train and evaluate once per text variant, hand the stats to
# write_stats_to_csv, then print a sample of the misclassified messages.
for variant in VARIANTS:
    print(variant)
    _, _, bert_stats = run_bert_model(
        df,
        column=variant,
        label_column='label',
        test_size=0.2,
        random_state=RANDOM_STATE,
    )
    write_stats_to_csv(bert_stats, 'data/stats.csv')

    # Both error kinds are reported the same way: a header line followed by
    # up to 10 of the misclassified messages.
    for error_kind in ('ham_as_spam', 'spam_as_ham'):
        print(error_kind)
        for missed in bert_stats['misclassified'][error_kind][:10]:
            print(missed)