# spam-detector

%2020-04-19
___

In [1]:
import pandas as pd
import urllib
import requests
import os

DATABASE_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/'
LOCAL_DATABASE_PATH = 'spambase'

# Try opening local copies first before fetching from online database
try:
    df = pd.read_csv(os.path.join('spambase-data', 'spambase.data'),
                      header=None, index_col=False)
    print('Reading .data file from local copy of database...')
except OSError:
    df = pd.read_csv(urllib.parse.urljoin(database_url, 'spambase.data'),
                     header=None, index_col=False)
    print('Reading .data file from online of database...')

try:
    with open(os.path.join('spambase-data', 'spambase.names')) as f:
        names_file_text = f.read()
        print('Reading .names file from local copy of database...')
except OSError:
    names_file_text = requests.get(urllib.parse.urljoin(database_url, 'spambase.names')).text
    print('Reading .names file from online database...')

Reading .data file from local copy of database...
Reading .names file from local copy of database...


Attributes are specified in the .names format: http://www.cs.washington.edu/dm/vfml/appendixes/c45.htm

In [2]:
#print(names)

In [3]:
def get_attribute_names(names_file_text):
    # Anything between a '|' and the end of the line is ignored
    strip_comments = lambda line : line.split('|',1)[0]
    attr_names = []
    read_classes = False
    for line in names_file_text.splitlines():
        if len(line.strip()) == 0 or line[0] == '|':
            continue
        elif not read_classes:
            classes = strip_comments(line).split(',')
            read_classes = True
        else:
            attr_name, attr_type = strip_comments(line).split(':')
            attr_names.append(attr_name)
    return attr_names

# Add feature name to last column
df.columns = get_attribute_names(names_file_text) + ['spam']

In [4]:
# Number of Instances: 4601 (1813 Spam = 39.4%)
# Check for null entries: none found
#df.isnull().sum()

In [5]:
def normalise_capital_run_length_data(df):
    crl = df.filter(regex=('capital_run_length*'))
    # Min-Max normalisation
    normalise = lambda col : (col-col.min())/(col.max()-col.min())
    crl = (crl-crl.min())/(crl.max()-crl.min())
    for col_name in crl.columns:
        df[col_name] = normalise(df[col_name])
    return df
    

features = df.drop('spam', axis=1)
features = normalise_capital_run_length_data(features)

In [6]:
from sklearn.model_selection import train_test_split

RANDOM_SEED = 9

train_x, test_x, train_y, test_y = train_test_split(features, df['spam'], test_size=0.2, random_state=RANDOM_SEED)

In [7]:
from sklearn.ensemble import RandomForestClassifier

model1 = RandomForestClassifier(random_state=RANDOM_SEED)
model1 = model1.fit(train_x, train_y)

In [8]:
from sklearn.linear_model import LogisticRegression

model2 = LogisticRegression()
model2 = model2.fit(train_x, train_y)

In [9]:
import sklearn.metrics as skm
def display_metrics(model, inputs, outputs):
    score = model.score(inputs, outputs)
    probabilities = model.predict_proba(inputs)
    predictions = model.predict(inputs)
    
    roc_auc = skm.roc_auc_score(outputs, probabilities[:, 1])
    tp, fp, fn, tn = skm.confusion_matrix(outputs, predictions).ravel()
    precision = skm.precision_score(outputs, predictions)
    recall = skm.recall_score(outputs, predictions)
    
    metrics_report = (
        f'Accuracy       : {score}\n'
        f'ROC AUC        : {roc_auc}\n'
        f'TP, FP, FN, TN : {[tp, fp, fn, tn]}\n'
        f'Precision      : {precision}\n'
        f'Recall         : {recall}\n'
    )
    print(metrics_report)

display_metrics(model1, test_x, test_y)
#display_metrics(model1, train_x, train_y)
display_metrics(model2, test_x, test_y)
#display_metrics(model2, train_x, train_y)

Accuracy       : 0.9587404994571118
ROC AUC        : 0.9876847658388493
TP, FP, FN, TN : [552, 14, 24, 331]
Precision      : 0.9594202898550724
Recall         : 0.9323943661971831

Accuracy       : 0.9370249728555917
ROC AUC        : 0.9778679141989748
TP, FP, FN, TN : [542, 24, 34, 321]
Precision      : 0.9304347826086956
Recall         : 0.9042253521126761

