# spam-detector

%2020-04-19
___

In [15]:
import pandas as pd
import urllib
import requests
import os

DATABASE_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/'
LOCAL_DATABASE_PATH = 'spambase'

# Try opening local copies first before fetching from online database
try:
    df = pd.read_csv(os.path.join('spambase-data', 'spambase.data'),
                      header=None, index_col=False)
    print('Reading .data file from local copy of database...')
except OSError:
    df = pd.read_csv(urllib.parse.urljoin(database_url, 'spambase.data'),
                     header=None, index_col=False)
    print('Reading .data file from online of database...')

try:
    with open(os.path.join('spambase-data', 'spambase.names')) as f:
        names_file_text = f.read()
        print('Reading .names file from local copy of database...')
except OSError:
    names_file_text = requests.get(urllib.parse.urljoin(database_url, 'spambase.names')).text
    print('Reading .names file from online database...')

Reading .data file from local copy of database...
Reading .names file from local copy of database...


Attributes are specified in the .names format: http://www.cs.washington.edu/dm/vfml/appendixes/c45.htm

In [60]:
#print(names)

In [61]:
def get_attribute_names(names_file_text):
    # Anything between a '|' and the end of the line is ignored
    strip_comments = lambda line : line.split('|',1)[0]
    attr_names = []
    read_classes = False
    for line in names_file_text.splitlines():
        if len(line.strip()) == 0 or line[0] == '|':
            continue
        elif not read_classes:
            classes = strip_comments(line).split(',')
            read_classes = True
        else:
            attr_name, attr_type = strip_comments(line).split(':')
            attr_names.append(attr_name)
    return attr_names

# Add feature name to last column
df.columns = get_attribute_names(names_file_text) + ['spam']

In [62]:
# Number of Instances: 4601 (1813 Spam = 39.4%)
# Check for null entries: none found
#df.isnull().sum()

In [81]:
def normalise_capital_run_length_data(df):
    crl = df.filter(regex=('capital_run_length*'))
    # Min-Max normalisation
    normalise = lambda col : (col-col.min())/(col.max()-col.min())
    crl = (crl-crl.min())/(crl.max()-crl.min())
    for col_name in crl.columns:
        df[col_name] = normalise(df[col_name])
    return df
    

features = df.drop('spam', axis=1)
features = normalise_capital_run_length_data(features)
features

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.000,0.0,0.778,0.000,0.000,0.002502,0.006007,0.017487
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.0,0.000,0.132,0.0,0.372,0.180,0.048,0.003735,0.010012,0.064836
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.0,0.010,0.143,0.0,0.276,0.184,0.010,0.008008,0.048458,0.142551
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.0,0.000,0.137,0.0,0.137,0.000,0.000,0.002303,0.003905,0.011995
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.0,0.000,0.135,0.0,0.135,0.000,0.000,0.002303,0.003905,0.011995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.0,0.000,0.232,0.0,0.000,0.000,0.000,0.000129,0.000200,0.005492
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.000,0.0,0.353,0.000,0.000,0.000504,0.000300,0.000821
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.102,0.718,0.0,0.000,0.000,0.000,0.000367,0.000501,0.007386
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.057,0.0,0.000,0.000,0.000,0.000133,0.000400,0.004861


In [82]:
from sklearn.model_selection import train_test_split

RANDOM_SEED = 9

train_features, test_features, train_spam, test_spam = train_test_split(features, df['spam'], test_size=0.2, random_state=RANDOM_SEED)

In [83]:
train_freq

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
3610,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.000,0.000,0.000,0.000,0.000,1.000,1,5
2614,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.000,0.000,0.581,0.000,0.000,1.615,4,21
3328,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.150,0.075,0.037,0.000,0.000,2.367,12,206
3942,0.00,0.00,0.74,0.0,0.00,0.00,0.00,0.00,0.00,0.74,...,0.0,0.000,0.245,0.000,0.000,0.000,0.000,4.666,64,196
3923,0.08,0.00,0.16,0.0,0.00,0.08,0.00,0.08,0.73,0.00,...,0.0,0.126,0.172,0.057,0.000,0.022,0.000,3.212,44,665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1149,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.000,0.000,0.298,0.597,0.000,3.333,12,30
1787,0.25,0.17,0.34,0.0,0.00,0.08,0.00,0.00,0.08,0.08,...,0.0,0.000,0.015,0.000,0.094,0.015,0.000,2.531,89,319
501,0.00,0.00,0.00,0.0,0.00,0.00,0.00,1.12,0.00,0.00,...,0.0,0.000,0.204,0.000,0.408,0.408,0.000,4.100,25,82
4444,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.178,0.059,0.000,0.000,0.059,7.046,70,303
