Firstly, we import the data.

In [95]:
import pandas as pd

# header=None tells pandas the file has no header row, so the columns get
# integer labels instead of names taken from the first data line.
df = pd.read_csv('spambase.csv', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


Then we clean the data.
Check the shape of the data

In [96]:
df.shape

(4601, 58)

Check whether the data contains any NaN values.

In [97]:
df.isnull().values.any()

False

We can see that the data does not have NaNs.

Since all attributes are useful for spam recognition, we do not need to remove any columns.

In [98]:
from sklearn.model_selection import train_test_split
# Column 57 is the target; every other column is used as a feature.
# 20% of the rows are held out for testing, and random_state pins the shuffle
# so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(df.drop(57, axis=1), df[57], test_size=0.2, random_state=42)

In [109]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters; random_state pins the seed so
# repeated runs build the same forest.
model = RandomForestClassifier(random_state=13)
model.fit(train_x, train_y)

RandomForestClassifier(random_state=13)

In [110]:
# Hard class predictions for the held-out split; reused by the confusion
# matrix cell further down.
predicted = model.predict(test_x)
# Mean accuracy on the held-out split.
model.score(test_x, test_y)

0.9587404994571118

In [111]:
from sklearn.metrics import roc_auc_score
probabilities = model.predict_proba(test_x)

In [112]:
# Column 1 of predict_proba is the probability of the second entry in
# model.classes_ (label 1 here — presumably "spam"; confirm against the
# dataset documentation).
roc_auc_score(test_y, probabilities[:, 1])

0.985902264715824

In [113]:
from sklearn.metrics import confusion_matrix
# Rows are the true labels and columns the predicted labels, both in sorted
# label order, evaluated on the held-out split.
confusion_matrix(test_y, predicted)

array([[522,   9],
       [ 29, 361]])

In [114]:
from sklearn.metrics import precision_score

# NOTE(review): these predictions are for train_x, so the precision below is
# measured on data the model was fitted on. It shows how well the forest fits
# the training split, not how it generalises — consider reporting the same
# metric on test_y / predicted as well.
train_predictions = model.predict(train_x)
precision_score(train_y, train_predictions)

0.9992972593113141

In [115]:
from sklearn.metrics import recall_score

# Recall on the training-split predictions (train_predictions comes from
# model.predict(train_x)), so this is a fit measure, not a held-out estimate.
recall_score(train_y, train_predictions)

0.9992972593113141

In [124]:
import re
def get_word_frequency(content):
    """Turn raw message text into the 57 numeric features fed to the model.

    The layout is intended to line up with the feature columns of the
    spambase data the model is trained on:

    * keys 0-47  -- for each tracked word, 100 * occurrences / total words,
      where a "word" is a maximal run of letters (a-z, A-Z) and digits;
    * keys 48-53 -- for each tracked character, 100 * occurrences / total
      number of characters in ``content``;
    * key 54     -- average length of the uninterrupted runs of capital letters;
    * key 55     -- length of the longest such run;
    * key 56     -- total number of capital letters across all runs.

    Args:
        content: The message text as a ``str``.

    Returns:
        dict: feature position (0..56, in order) -> numeric value. Text with
        no words or no characters yields 0 for the affected frequencies
        instead of raising ``ZeroDivisionError``.
    """
    from collections import Counter  # local import keeps the cell self-contained

    words = ['make', 'address', 'all', '3d', 'our', 'over', 'remove', 'internet', 'order', 'mail', 'receive', 'will', 'people', 'report', 'addresses', 'free', 'business', 'email', 'you', 'credit', 'your', 'font', '000', 'money', 'hp', 'hpl', 'george', '650', 'lab', 'labs', 'telnet', '857', 'data', '415', '85','technology', '1999', 'parts', 'pm', 'direct', 'cs', 'meeting', 'original', 'project', 're', 'edu', 'table', 'conference']
    # The third tracked character is '[' (spambase's char_freq_[ feature);
    # counting ']' here would feed the model a different quantity than the
    # one it was trained on.
    chars = [';', '(', '[', '!', '$', '#']

    # Word frequencies. Matching is case-sensitive, so "FREE" does not count
    # towards 'free'.
    # NOTE(review): confirm whether the training data counted words
    # case-insensitively; if so, tokens should be lower-cased here.
    content_words = re.findall(r'[a-zA-Z\d]+', content)
    num_words = len(content_words)
    # One pass over the tokens instead of one list.count() scan per tracked word.
    word_counts = Counter(content_words)
    word_freqs = [
        100 * word_counts[word] / num_words if num_words else 0.0
        for word in words
    ]

    # Character frequencies, relative to the full length of the text.
    num_chars = len(content)
    char_freqs = [
        100 * content.count(char) / num_chars if num_chars else 0.0
        for char in chars
    ]

    # Statistics over uninterrupted runs of capital letters.
    run_lengths = [len(run) for run in re.findall(r'[A-Z]+', content)]
    length_capitals = sum(run_lengths)
    len_longest_capital = max(run_lengths, default=0)
    average_length = length_capitals / len(run_lengths) if run_lengths else 0

    values = word_freqs + char_freqs + [
        average_length,
        len_longest_capital,
        length_capitals,
    ]
    return dict(enumerate(values))





In [202]:
def predict_spam(content):
    """Classify a single message with the fitted ``model``.

    The text is converted to a feature mapping by ``get_word_frequency`` and
    wrapped in a one-row DataFrame before being handed to ``model.predict``.

    Args:
        content: The message text.

    Returns:
        Whatever ``model.predict`` returns for that single row.
    """
    feature_row = get_word_frequency(content)
    features = pd.DataFrame([feature_row])
    return model.predict(features)

In [203]:
predict_spam("""Hi Cheng ,

This is an important reminder for those completing the non-technical stream for the second project (Azure & Cloud Fundamentals):

You need to ensure that when you are choosing ‘Database Provider’ to set up your WordPress application, you select MySQL in App. Choosing anything else will spend all your credits.

""")

array([0])

In [204]:
predict_spam("""

PUBLIC ANNOUNCEMENT:

The new domain names are finally available to the general public at discount prices. Now you can register one of the exciting new .BIZ or .INFO domain names, as well as the original .COM and .NET names for just $14.95. These brand new domain extensions were recently approved by ICANN and have the same rights as the original .COM and .NET domain names. The biggest benefit is of-course that the .BIZ and .INFO domain names are currently more available. i.e. it will be much easier to register an attractive and easy-to-remember domain name for the same price.  

""")

array([1])

In [197]:
# Load the second e-mail corpus and keep only the message text and its
# numeric label.
test_df = pd.read_csv("spam_ham_dataset.csv")
test_df = test_df[['text', 'label_num']]

# One feature dict per message, assembled into a frame in a single step.
# Iterating the 'text' column directly avoids building a row Series per
# message and the integer-key lookup (`row[0]`) on a label-indexed Series,
# whose positional fallback is deprecated in recent pandas.
test_input = pd.DataFrame([get_word_frequency(text) for text in test_df['text']])

test_result = test_df['label_num']


In [200]:
# Mean accuracy of the spambase-trained model on features extracted from the
# second corpus by get_word_frequency.
model.score(test_input, test_result)

0.7412492748017792