In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the tab-separated SMS corpus. The file has no header row, so the two
# columns are named explicitly: the ham/spam label first, the raw message second.
# (pandas is already imported in the first cell; no need to import it again.)
# NOTE(review): the default parser treats '"' as a quote character, so messages
# containing unbalanced quotes may get merged into a single row -- confirm the
# row count against the dataset's documented size (quoting=csv.QUOTE_NONE
# disables quote handling if rows turn out to be missing).
df = pd.read_table('SMSSpamCollection.txt', header=None, names=['target', 'message'])

In [3]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(5572, 2)

# Build a simple Bag of Words from Scratch

In [4]:
# Toy corpus used to illustrate bag-of-words before touching the real data.
docs = [
    "Here at the Wall",
    "What are the main reasons for.....",
    "There are 700 possiblities that Alex will meet Alex Prime",
    "Alpha prime is the member of Prime Groups",
    "Is that all you got ?",
]


# Preprocess the text 


def preprocess(text):
    """Lower-case ``text`` and split it into alphanumeric word tokens.

    Every character that is not an ASCII letter or digit is treated as a
    separator.

    Parameters
    ----------
    text : str
        Raw document.

    Returns
    -------
    list of str
        Tokens in order of appearance; an empty list when ``text`` contains
        no letters or digits.
    """
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    # split() without an argument collapses runs of whitespace, so punctuation
    # in the middle of a sentence ("a, b") cannot produce empty-string tokens
    # the way split(" ") did. It also ignores leading/trailing whitespace.
    return text.split()


# Tokenise every toy document with preprocess() and show the result.
preprocessed_docs = list(map(preprocess, docs))

print(preprocessed_docs)
    

[['here', 'at', 'the', 'wall'], ['what', 'are', 'the', 'main', 'reasons', 'for'], ['there', 'are', '700', 'possiblities', 'that', 'alex', 'will', 'meet', 'alex', 'prime'], ['alpha', 'prime', 'is', 'the', 'member', 'of', 'prime', 'groups'], ['is', 'that', 'all', 'you', 'got']]


In [5]:
def create_bow(docs):
    """Build a bag-of-words representation for a list of tokenised documents.

    Parameters
    ----------
    docs : iterable of iterable of str
        Each element is one document given as a sequence of tokens.

    Returns
    -------
    list of dict
        One ``{token: occurrence count}`` dict per document, in input order.
        Keys appear in order of first occurrence within the document.
    """

    def _tally(tokens):
        # Count how many times each token occurs in a single document.
        freq = {}
        for token in tokens:
            if token in freq:
                freq[token] += 1
            else:
                freq[token] = 1
        return freq

    return [_tally(tokens) for tokens in docs]

# Per-document token counts for the toy corpus (one dict per document).
create_bow(preprocessed_docs)

[{'here': 1, 'at': 1, 'the': 1, 'wall': 1},
 {'what': 1, 'are': 1, 'the': 1, 'main': 1, 'reasons': 1, 'for': 1},
 {'there': 1,
  'are': 1,
  '700': 1,
  'possiblities': 1,
  'that': 1,
  'alex': 2,
  'will': 1,
  'meet': 1,
  'prime': 1},
 {'alpha': 1,
  'prime': 2,
  'is': 1,
  'the': 1,
  'member': 1,
  'of': 1,
  'groups': 1},
 {'is': 1, 'that': 1, 'all': 1, 'you': 1, 'got': 1}]

# Dealing with the Dataset at Hand

In [6]:
def preprocess_text(text):
    """Normalise a raw message into a clean, space-separated string.

    The text is lower-cased, every character that is not an ASCII letter or
    digit is replaced by a space, and runs of whitespace are collapsed.

    Parameters
    ----------
    text : str
        Raw message.

    Returns
    -------
    str
        Lower-case alphanumeric words joined by single spaces; an empty
        string when ``text`` contains no letters or digits.
    """
    cleaned = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    # split() without an argument already discards leading/trailing whitespace
    # and never yields empty or blank tokens, so no strip() or filtering is
    # needed before re-joining.
    return " ".join(cleaned.split())

In [7]:
# Replace each raw message with its normalised form (see preprocess_text),
# then display the frame to eyeball the result.
df["message"] = df["message"].apply(preprocess_text)
df

Unnamed: 0,target,message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don t think he goes to usf he lives arou...
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will b going to esplanade fr home
5569,ham,pity was in mood for that so any other suggest...
5570,ham,the guy did some bitching but i acted like i d...


In [8]:
# Split the data into Train/Test (90% / 10%); `stratify` keeps the ham/spam
# ratio the same in both parts.
# A fixed random_state makes the split -- and therefore every metric computed
# from it -- reproducible across "Restart & Run All". Without it each run draws
# a different split and reports different scores.
X_train, X_test, y_train, y_test = train_test_split(
    df.message.values,
    df.target.values,
    test_size=0.1,
    stratify=df.target,
    random_state=42,
)

In [9]:
# In sklearn a bag-of-words model is provided by the CountVectorizer class.
# stop_words='english' drops scikit-learn's built-in list of common English
# words from the vocabulary.

bow = CountVectorizer(stop_words='english')

In [10]:
# Learn the vocabulary from the training messages only, so that nothing from
# the test split influences the feature space.
bow.fit(X_train)

CountVectorizer(stop_words='english')

In [11]:
# Encode both splits with the vocabulary learned from the training messages.
# NOTE: X_train / X_test are rebound from raw text to the vectorizer's output,
# so this cell cannot be re-run on its own -- re-run the train/test split first.
X_train, X_test = bow.transform(X_train), bow.transform(X_test)

In [12]:
# Train a multinomial Naive Bayes classifier on the bag-of-words counts,
# using default hyper-parameters.
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

MultinomialNB()

In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict once and reuse the result for every metric instead of re-running
# inference on the test set four times.
y_pred = naive_bayes.predict(X_test)

# Precision / recall / F1 are reported for the "spam" class, i.e. spam is
# treated as the positive label.
print(f'Accuracy : {accuracy_score(y_test, y_pred):.3f}')
print(f'Precision : {precision_score(y_test, y_pred, pos_label="spam"):.3f}')
print(f'Recall : {recall_score(y_test, y_pred, pos_label="spam"):.3f}')
print(f'F1-Score : {f1_score(y_test, y_pred, pos_label="spam"):.3f}')

Accuracy : 0.993
Precision : 0.973
Recall : 0.973
F1-Score : 0.973


# Testing the Model

In [14]:
# Test sample input

text = "You've Won! Winning an unexpected prize sounds great in theory. ..."

# Apply the same normalisation that was used on the training messages.
p_text = preprocess_text(text)
print(p_text)

# Encode with the fitted vocabulary; transform() expects an iterable of
# documents, hence the single-element list. A separate name is used so that
# p_text keeps holding the cleaned string.
p_vector = bow.transform([p_text])

# predict_proba() orders its columns like naive_bayes.classes_, so look the
# "spam" column up by label instead of hard-coding its position.
spam_idx = list(naive_bayes.classes_).index("spam")
naive_bayes.predict_proba(p_vector)[0][spam_idx]

you ve won winning an unexpected prize sounds great in theory


0.5703461185427379

# Saving the Model

In [15]:
import pickle

# Persist the trained classifier and the fitted vectorizer side by side: both
# are needed to score new messages (the vectorizer turns text into the feature
# columns the classifier was trained on).
# NOTE: only unpickle these files from a trusted location -- pickle.load can
# execute arbitrary code.
for path, artifact in (
    ('model_spam.pickle', naive_bayes),
    ('vectorizer_spam.pickle', bow),
):
    with open(path, 'wb') as handle:
        pickle.dump(artifact, handle)