In [9]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression

## Creating a Spam Email Bayesian Classifier
In this exercise I use scikit-learn to create and train a Naive Bayes classifier that distinguishes spam from other emails. It uses the Spambase data set from the UCI Machine Learning Repository.

First, I read in the data using pandas and gave it a long list of column headings.

In [10]:
# Column headings for spambase.data, in the order they are assigned to the
# file's columns: 48 word frequencies, 6 character frequencies, 3
# capital-run-length statistics, and finally the SPAM column.
columnsx = (
    ['word_freq_' + word for word in (
        'make address all 3d our over remove internet order mail '
        'receive will people report addresses free business email '
        'you credit your font 000 money hp hpl george 650 lab labs '
        'telnet 857 data 415 85 technology 1999 parts pm direct cs '
        'meeting original project re edu table conference'
    ).split()]
    + ['char_freq_' + symbol for symbol in ';([!$#']
    + ['capital_run_length_' + stat for stat in ('average', 'longest', 'total')]
    + ['SPAM']
)

# header=None: the first row of the file is read as data, and the headings
# come from columnsx instead.
spambase = pd.read_csv('spambase.data', names=columnsx, header=None)
spambase.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,SPAM
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


Using scikit-learn's `train_test_split` function, I quickly and easily split the data frame into 60% training data and 40% test data.

In [11]:
# Hold out 40% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
train, test = train_test_split(spambase, test_size=0.4, train_size=0.6, random_state=60)

# 'SPAM' is the target column; every remaining column is used as a feature.
y_train = train.SPAM
y_test = test.SPAM
# axis is passed by keyword: the positional form drop('SPAM', 1) is
# deprecated since pandas 1.3 and raises a TypeError from pandas 2.0 on.
X_train = train.drop('SPAM', axis=1)
X_test = test.drop('SPAM', axis=1)

In [12]:
# Multinomial Naive Bayes with the default hyper-parameters spelled out.
spambot = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

In [13]:
# Train on the 60% split; the fitted estimator is displayed as the cell output.
spambot.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
# Score the fitted classifier on the held-out test rows.
spambot.score(X=X_test, y=y_test)

0.78435632808256384

In [15]:
# Score on the training rows as well, for comparison with the test score.
spambot.score(X=X_train, y=y_train)

0.79021739130434787

## Find a better model
##### A different split percentage (80% train / 20% test) does not have a significant effect

In [16]:
# Same data and seed as before, but with an 80% / 20% train/test split.
train, test = train_test_split(spambase, test_size=0.2, train_size=0.8, random_state=60)

# 'SPAM' is the target column; every remaining column is used as a feature.
y_train = train.SPAM
y_test = test.SPAM
# axis is passed by keyword: the positional form drop('SPAM', 1) is
# deprecated since pandas 1.3 and raises a TypeError from pandas 2.0 on.
X_train = train.drop('SPAM', axis=1)
X_test = test.drop('SPAM', axis=1)

In [17]:
# Second classifier for the 80/20 split, default hyper-parameters spelled out.
spammy = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

In [18]:
# Train on the 80% split; the fitted estimator is displayed as the cell output.
spammy.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
# Score the 80/20 classifier on its held-out test rows.
spammy.score(X=X_test, y=y_test)

0.76221498371335505

#### Out of three models based on character frequency, word frequency, or capital-run-length features alone, the Bayesian model based on word frequency performed the best, even better than the model trained on all of the data.

In [20]:
# Model 1 of 3: character-frequency columns only (plus the SPAM target).
features = ['char_freq_;',
            'char_freq_(',
            'char_freq_[',
            'char_freq_!',
            'char_freq_$',
            'char_freq_#',
            'SPAM']

train, test = train_test_split(spambase[features], test_size=0.4, train_size=0.6, random_state=60)
y_train = train.SPAM
y_test = test.SPAM
# axis is passed by keyword: the positional form drop('SPAM', 1) is
# deprecated since pandas 1.3 and raises a TypeError from pandas 2.0 on.
X_train = train.drop('SPAM', axis=1)
X_test = test.drop('SPAM', axis=1)

# Sanity check: feature matrices should have len(features) - 1 columns.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

bae = MultinomialNB()
bae.fit(X_train, y_train)
bae.score(X_test, y_test)

(2760, 6) (2760,) (1841, 6) (1841,)


0.77403585008147746

In [21]:
# Model 2 of 3: word-frequency columns only (plus the SPAM target).
# The names are selected from the loaded frame by prefix rather than retyped,
# so this list cannot drift out of sync with the column headings given to
# read_csv; the selection keeps the frame's column order.
features = [name for name in spambase.columns if name.startswith('word_freq_')] + ['SPAM']

train, test = train_test_split(spambase[features], test_size=0.4, train_size=0.6, random_state=60)
y_train = train.SPAM
y_test = test.SPAM
# axis is passed by keyword: the positional form drop('SPAM', 1) is
# deprecated since pandas 1.3 and raises a TypeError from pandas 2.0 on.
X_train = train.drop('SPAM', axis=1)
X_test = test.drop('SPAM', axis=1)

# Sanity check: feature matrices should have len(features) - 1 columns.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

bae = MultinomialNB()
bae.fit(X_train, y_train)
bae.score(X_test, y_test)

(2760, 48) (2760,) (1841, 48) (1841,)


0.86094513851167842

In [22]:
# Model 3 of 3: capital-run-length columns only (plus the SPAM target).
features = ['capital_run_length_average',
            'capital_run_length_longest',
            'capital_run_length_total',
            'SPAM']

train, test = train_test_split(spambase[features], test_size=0.4, train_size=0.6, random_state=60)
y_train = train.SPAM
y_test = test.SPAM
# axis is passed by keyword: the positional form drop('SPAM', 1) is
# deprecated since pandas 1.3 and raises a TypeError from pandas 2.0 on.
X_train = train.drop('SPAM', axis=1)
X_test = test.drop('SPAM', axis=1)

# Sanity check: feature matrices should have len(features) - 1 columns.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

bae = MultinomialNB()
bae.fit(X_train, y_train)
bae.score(X_test, y_test)

(2760, 3) (2760,) (1841, 3) (1841,)


0.53340575774035848

## Linear Regression time!
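Linear Regression models do not perform as well as Bayesian models. Note that `LinearRegression.score` reports the coefficient of determination (R²) rather than classification accuracy, so the number below is not directly comparable to the accuracy scores above.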

In [23]:
# Same 60/40 split and seed as the first Naive Bayes model, on all columns.
train, test = train_test_split(spambase, test_size=0.4, train_size=0.6, random_state=60)
y_train = train.SPAM
y_test = test.SPAM
# axis is passed by keyword: the positional form drop('SPAM', 1) is
# deprecated since pandas 1.3 and raises a TypeError from pandas 2.0 on.
X_train = train.drop('SPAM', axis=1)
X_test = test.drop('SPAM', axis=1)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# The 0/1 SPAM label is fitted as a continuous target here.
regr = LinearRegression()
regr.fit(X_train, y_train)
# NOTE: for a regressor, score() is R^2, not the fraction of correctly
# classified emails that MultinomialNB.score() reports.
regr.score(X_test, y_test)

(2760, 57) (2760,) (1841, 57) (1841,)




0.49252035345961842

### Selective Features
The following model uses all word-frequency and all character-frequency columns as features.

In [24]:
# All word-frequency and character-frequency columns (plus the SPAM target);
# the capital-run-length columns are left out.
# The names are selected from the loaded frame by prefix rather than retyped,
# so this list cannot drift out of sync with the column headings given to
# read_csv; the selection keeps the frame's column order.
cols = [name for name in spambase.columns
        if name.startswith(('word_freq_', 'char_freq_'))] + ['SPAM']

train, test = train_test_split(spambase[cols], test_size=0.4, train_size=0.6, random_state=900)
y_train = train.SPAM
y_test = test.SPAM
# axis is passed by keyword: the positional form drop('SPAM', 1) is
# deprecated since pandas 1.3 and raises a TypeError from pandas 2.0 on.
X_train = train.drop('SPAM', axis=1)
X_test = test.drop('SPAM', axis=1)

spambot = MultinomialNB()

spambot.fit(X_train, y_train)
spambot.score(X_test, y_test)

0.8853883758826725

## Using Stats
I split the data set into spam and ham and collected the per-column means of each group into a separate data frame. I tried to visually pick out the most important factors, disregarding the very specific name and number instances. This worked out pretty well for me.

In [25]:
# Partition the rows by the value of the SPAM column (1 versus 0).
spam = spambase.loc[spambase['SPAM'] == 1]
not_spam = spambase.loc[spambase['SPAM'] == 0]

In [26]:
# One row per original column, holding that column's mean within each group.
mean_frame = pd.DataFrame({'spam': spam.mean(), 'not_spam': not_spam.mean()})
mean_frame.head()

Unnamed: 0,not_spam,spam
word_freq_make,0.073479,0.152339
word_freq_address,0.244466,0.16465
word_freq_all,0.200581,0.403795
word_freq_3d,0.000886,0.164672
word_freq_our,0.18104,0.513955


In [27]:
# Hand-picked feature columns (plus the SPAM target), chosen by comparing the
# spam and not-spam means in mean_frame.
use_cols = ['word_freq_remove', 'word_freq_3d', 'word_freq_internet',
            'word_freq_free', 'char_freq_!',
            'word_freq_edu', 'word_freq_re', 'word_freq_george',
            'word_freq_hp', 'word_freq_business', 'word_freq_your',
            'word_freq_credit', 'SPAM']
train, test = train_test_split(spambase[use_cols], test_size=0.4, train_size=0.6, random_state=900)
y_train = train.SPAM
y_test = test.SPAM
# axis is passed by keyword: the positional form drop('SPAM', 1) is
# deprecated since pandas 1.3 and raises a TypeError from pandas 2.0 on.
X_train = train.drop('SPAM', axis=1)
X_test = test.drop('SPAM', axis=1)

spambot = MultinomialNB()

spambot.fit(X_train, y_train)
spambot.score(X_test, y_test)

0.88810429114611622

### The best set of features... 
The best set of features I found to classify emails is a selection of the most significant word frequency counts and the character frequency of the exclamation point. It performs slightly better than the features of all word counts and all character counts.