# Spam Detector
```{admonition} Revised
25 Jul 2023
```

---

## Programming Environment

In [1]:
import numpy  as np
import pandas as pd

import sklearn
from   sklearn.ensemble        import RandomForestClassifier
from   sklearn.linear_model    import LogisticRegression
from   sklearn.metrics         import accuracy_score
from   sklearn.model_selection import train_test_split
from   sklearn.preprocessing   import StandardScaler

import datetime
from   importlib.metadata import version
import os
import platform as p
import sys

# Print a reproducibility banner: run timestamp, platform, active environment,
# interpreter, and the installed versions of the key libraries.
pad = 20  # width of the left-hand label column

# CONDA_DEFAULT_ENV may be unset (e.g. outside an activated conda environment).
# Indexing os.environ raised KeyError before the `or` fallback could run, so
# use .get().  The fallback is the name of the interpreter's prefix directory,
# which does not depend on '/' being the path separator.
env_name = os.environ.get('CONDA_DEFAULT_ENV') or os.path.basename(os.path.normpath(sys.prefix))

print(  f"\n{'Executed' : <{pad}} : {datetime.datetime.now().astimezone().strftime('%Y-%m-%d %H:%M:%S %z %Z')}"
        f"\n{'Platform' : <{pad}} : {p.platform(aliased = False, terse = False)}"
        f"\n{'Conda'    : <{pad}} : {env_name}"
        f"\n{'Python'   : <{pad}} : {p.python_implementation()} {p.python_version()} {sys.executable}")
# One line per library: distribution name, then its installed version.
print(*[f'{name : <{pad}} : {version(name)}'
        for name in ['NumPy', 'Pandas', 'Scikit-Learn']], sep = '\n')


Executed             : 2023-09-04 12:13:52 -0400 EDT
Platform             : macOS-13.5.1-arm64-arm-64bit
Conda                : ml
Python               : CPython 3.11.5 /Users/df/anaconda3/envs/ml/bin/python
NumPy                : 1.23.5
Pandas               : 2.1.0
Scikit-Learn         : 1.3.0


In [2]:
# Print how this NumPy build was configured (BLAS/LAPACK libraries and build macros).
np.show_config()

openblas64__info:
    libraries = ['openblas64_', 'openblas64_']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None), ('BLAS_SYMBOL_SUFFIX', '64_'), ('HAVE_BLAS_ILP64', None)]
    runtime_library_dirs = ['/usr/local/lib']
blas_ilp64_opt_info:
    libraries = ['openblas64_', 'openblas64_']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None), ('BLAS_SYMBOL_SUFFIX', '64_'), ('HAVE_BLAS_ILP64', None)]
    runtime_library_dirs = ['/usr/local/lib']
openblas64__lapack_info:
    libraries = ['openblas64_', 'openblas64_']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None), ('BLAS_SYMBOL_SUFFIX', '64_'), ('HAVE_BLAS_ILP64', None), ('HAVE_LAPACKE', None)]
    runtime_library_dirs = ['/usr/local/lib']
lapack_ilp64_opt_info:
    libraries = ['openblas64_', 'openblas64_']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None

In [3]:
# Print NumPy's runtime report: SIMD extensions and the thread-pool libraries in use.
np.show_runtime()

[{'simd_extensions': {'baseline': ['NEON', 'NEON_FP16', 'NEON_VFPV4', 'ASIMD'],
                      'found': ['ASIMDHP', 'ASIMDDP'],
                      'not_found': ['ASIMDFHM']}},
 {'architecture': 'armv8',
  'filepath': '/Users/davefriedman/anaconda3/envs/jb/lib/python3.11/site-packages/numpy/.dylibs/libopenblas64_.0.dylib',
  'internal_api': 'openblas',
  'num_threads': 10,
  'prefix': 'libopenblas',
  'threading_layer': 'pthreads',
  'user_api': 'blas',
  'version': '0.3.21'},
 {'architecture': 'VORTEX',
  'filepath': '/Users/davefriedman/anaconda3/envs/jb/lib/libopenblas.0.dylib',
  'internal_api': 'openblas',
  'num_threads': 10,
  'prefix': 'libopenblas',
  'threading_layer': 'openmp',
  'user_api': 'blas',
  'version': '0.3.23'},
 {'filepath': '/Users/davefriedman/anaconda3/envs/jb/lib/libomp.dylib',
  'internal_api': 'openmp',
  'num_threads': 10,
  'prefix': 'libomp',
  'user_api': 'openmp',
  'version': None}]


In [4]:
# Print scikit-learn's environment report: system, dependency versions, threadpoolctl info.
sklearn.show_versions()


System:
    python: 3.11.4 | packaged by conda-forge | (main, Jun 10 2023, 18:08:41) [Clang 15.0.7 ]
executable: /Users/davefriedman/anaconda3/envs/jb/bin/python
   machine: macOS-13.3.1-arm64-arm-64bit

Python dependencies:
      sklearn: 1.3.0
          pip: 23.1.2
   setuptools: 68.0.0
        numpy: 1.24.3
        scipy: 1.11.1
       Cython: 3.0.0
       pandas: 2.0.3
   matplotlib: 3.7.2
       joblib: 1.3.0
threadpoolctl: 3.2.0

Built with OpenMP: True

threadpoolctl info:
       user_api: blas
   internal_api: openblas
    num_threads: 10
         prefix: libopenblas
       filepath: /Users/davefriedman/anaconda3/envs/jb/lib/python3.11/site-packages/numpy/.dylibs/libopenblas64_.0.dylib
        version: 0.3.21
threading_layer: pthreads
   architecture: armv8

       user_api: blas
   internal_api: openblas
    num_threads: 10
         prefix: libopenblas
       filepath: /Users/davefriedman/anaconda3/envs/jb/lib/libopenblas.0.dylib
        version: 0.3.23
threading_layer: openm

---

## Data Set

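Source: [UCI Machine Learning Repository — Spambase](https://archive.ics.uci.edu/dataset/94/spambase)

The copy used in this notebook has 4,601 rows and 58 columns: 57 numeric feature columns plus a binary `spam` label.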

---

```python
# LOAD DATA
data = pd.read_csv(filepath_or_buffer = 'https://static.bc-edx.com/mbc/ai/m4/datasets/spam-data.csv')
data.head()

# FEATURES (X) AND TARGET (y)
# X is every column except 'spam'; y is the 'spam' column.
X = data.copy()
X = X.drop(columns = ['spam'])
y = data['spam']
y.value_counts()

# TRAIN / TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

# STANDARD SCALER
# Fit on the training split only, then transform both splits.
scaler         = StandardScaler()
X_scaler       = scaler.fit(X = X_train)
X_train_scaled = X_scaler.transform(X = X_train)
X_test_scaled  = X_scaler.transform(X = X_test)

# LOGISTIC REGRESSION
# Train on the scaled training data, then score predictions on the test split.
lr_model = LogisticRegression(random_state = 1)
lr_model.fit(X = X_train_scaled,
             y = y_train)
y_pred = lr_model.predict(X = X_test_scaled)
accuracy_score(y_true = y_test, y_pred = y_pred)

# RANDOM FOREST
# Same inputs as above; note that y_pred is overwritten with this model's predictions.
rf_model = RandomForestClassifier(random_state = 1)
rf_model.fit(X = X_train_scaled,
             y = y_train)
y_pred = rf_model.predict(X = X_test_scaled)
accuracy_score(y_true = y_test, y_pred = y_pred)
```

---

In [5]:
# Load the spam data set.  Prefer the local copy next to the notebook; when it
# is missing (e.g. a fresh checkout) read from the URL used by the reference
# code in this notebook, so that Restart & Run All does not fail with
# FileNotFoundError.
DATA_FILE = 'spam-data.csv'
DATA_URL  = 'https://static.bc-edx.com/mbc/ai/m4/datasets/spam-data.csv'

data = pd.read_csv(filepath_or_buffer = DATA_FILE if os.path.exists(DATA_FILE) else DATA_URL)
data.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [6]:
# Per-column summary of the frame: dtype, non-null count, and memory usage.
data.info(verbose = True, show_counts = True, memory_usage = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   word_freq_make              4601 non-null   float64
 1   word_freq_address           4601 non-null   float64
 2   word_freq_all               4601 non-null   float64
 3   word_freq_3d                4601 non-null   float64
 4   word_freq_our               4601 non-null   float64
 5   word_freq_over              4601 non-null   float64
 6   word_freq_remove            4601 non-null   float64
 7   word_freq_internet          4601 non-null   float64
 8   word_freq_order             4601 non-null   float64
 9   word_freq_mail              4601 non-null   float64
 10  word_freq_receive           4601 non-null   float64
 11  word_freq_will              4601 non-null   float64
 12  word_freq_people            4601 non-null   float64
 13  word_freq_report            4601 

In [7]:
# Target is the 'spam' column; the features are every other column.
y = data['spam']
X = data.drop(columns = ['spam'])

# Class balance of the target.
y.value_counts()

spam
0    2788
1    1813
Name: count, dtype: int64

In [8]:
# Hold out a test split; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

# Shapes of the four pieces, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep = '\n')

(3450, 57)
(3450,)
(1151, 57)
(1151,)


In [9]:
# Fit the scaler on the training split only, then apply it to both splits.
scaler   = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

# Shapes of the scaled arrays, one per line.
print(X_train_scaled.shape, X_test_scaled.shape, sep = '\n')

(3450, 57)
(1151, 57)


In [10]:
# Logistic-regression model trained on the scaled training features.
lr_model = LogisticRegression(random_state = 1)
lr_model.fit(X_train_scaled, y_train)

In [11]:
# Predict test-split labels with the logistic-regression model, then show the
# predictions and the array's low-level details.
y_pred = lr_model.predict(X_test_scaled)

print(y_pred)
np.info(y_pred)

[0 0 1 ... 0 0 1]
class:  ndarray
shape:  (1151,)
strides:  (8,)
itemsize:  8
aligned:  True
contiguous:  True
fortran:  True
data pointer: 0x176b77800
byteorder:  little
byteswap:  False
type: int64


In [12]:
# Accuracy of the logistic-regression model on the held-out test split.
# Predict from lr_model here instead of reading the shared y_pred: a later cell
# reassigns y_pred to the random-forest predictions, so reading it would report
# the wrong model's accuracy if cells are run out of order.
accuracy_score(y_true = y_test, y_pred = lr_model.predict(X = X_test_scaled))

0.9287576020851434

In [13]:
# Random-forest model trained on the same scaled training features.
rf_model = RandomForestClassifier(random_state = 1)
rf_model.fit(X_train_scaled, y_train)

In [14]:
# Predict test-split labels with the random-forest model, then show the
# predictions and the array's low-level details.
y_pred = rf_model.predict(X_test_scaled)

print(y_pred)
np.info(y_pred)

[1 1 1 ... 1 0 1]
class:  ndarray
shape:  (1151,)
strides:  (8,)
itemsize:  8
aligned:  True
contiguous:  True
fortran:  True
data pointer: 0x10c0c8800
byteorder:  little
byteswap:  False
type: int64


In [15]:
# Accuracy of the random-forest model on the held-out test split.
# Predict from rf_model here instead of reading the shared y_pred: an earlier
# cell also assigns y_pred (from the logistic-regression model), so reading it
# would report the wrong model's accuracy if cells are run out of order.
accuracy_score(y_true = y_test, y_pred = rf_model.predict(X = X_test_scaled))

0.9669852302345786

---