### Imports 

In [169]:
import pandas as pd
import string
import pprint
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### Data loading and preprocessing

In [170]:
df = pd.read_table('SMSSpamCollection', names=['label', 'sms_message'], sep='\t')
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [171]:
df['label'] = df['label'].apply(lambda l: 0 if l == 'ham' else 1)
df.head()

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [172]:
def get_label(index):
    return 'ham' if index == 0 else 'spam'

In [173]:
documents = df['sms_message']#.to_list()
documents[:5]

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: sms_message, dtype: object

In [174]:
lower_case_documents = documents.apply(lambda i: i.lower())
lower_case_documents.head()

0    go until jurong point, crazy.. available only ...
1                        ok lar... joking wif u oni...
2    free entry in 2 a wkly comp to win fa cup fina...
3    u dun say so early hor... u c already then say...
4    nah i don't think he goes to usf, he lives aro...
Name: sms_message, dtype: object

In [175]:
sans_punctuation_documents = lower_case_documents.apply(lambda i: 
                                                        i.translate(str.maketrans('', '', string.punctuation)))
sans_punctuation_documents.head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: sms_message, dtype: object

### Bag of words (from scratch)

In [178]:
preprocessed_documents = []
for i in sans_punctuation_documents:
    preprocessed_documents.append(i.split())
preprocessed_documents[0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [179]:
frequency_list = []
for i in preprocessed_documents:
    frequency_list.append(Counter(i))
pprint.pprint(frequency_list[0])

Counter({'go': 1,
         'until': 1,
         'jurong': 1,
         'point': 1,
         'crazy': 1,
         'available': 1,
         'only': 1,
         'in': 1,
         'bugis': 1,
         'n': 1,
         'great': 1,
         'world': 1,
         'la': 1,
         'e': 1,
         'buffet': 1,
         'cine': 1,
         'there': 1,
         'got': 1,
         'amore': 1,
         'wat': 1})


### Bag of words with scikit-learn

In [180]:
count_vector = CountVectorizer(min_df=0.01, max_df=0.99)
'''
min_df=0.01: ignore terms that appear in less than 1% of the documents
max_df=0.99: ignore terms that appear in more than 99% of the documents
'''

documents = df['sms_message'].to_list()
count_vector.fit(documents)
len(count_vector.get_feature_names())

214

In [181]:
doc_array = count_vector.transform(documents).toarray()
doc_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [182]:
frequency_matrix = pd.DataFrame(doc_array)
frequency_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,204,205,206,207,208,209,210,211,212,213
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Train & predict

In [192]:
X_train, X_test, y_train, y_test = train_test_split(df['sms_message'], 
                                                    df['label'], 
                                                    random_state=1)

print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


710     4mths half price Orange line rental & latest c...
3740                           Did you stitch his trouser
2711    Hope you enjoyed your new content. text stop t...
3155    Not heard from U4 a while. Call 4 rude chat pr...
3748    Ü neva tell me how i noe... I'm not at home in...
                              ...                        
905     We're all getting worried over here, derek and...
5192    Oh oh... Den muz change plan liao... Go back h...
3980    CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...
235     Text & meet someone sexy today. U can find a d...
5157                              K k:) sms chat with me.
Name: sms_message, Length: 4179, dtype: object

In [184]:
count_vector = CountVectorizer()
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

In [185]:
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

MultinomialNB()

In [186]:
predictions = naive_bayes.predict(testing_data)
predictions

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [187]:
print('Accuracy score:  ', format(accuracy_score(predictions, y_test)))
print('Precision score: ', format(precision_score(predictions, y_test)))
print('Recall score:    ', format(recall_score(predictions, y_test)))
print('F1 score:        ', format(f1_score(predictions, y_test)))

Accuracy score:   0.9885139985642498
Precision score:  0.9405405405405406
Recall score:     0.9720670391061452
F1 score:         0.9560439560439562


In [200]:
messages = ['Hi! Do you want to earn the money? Follow the link below.', 
            'Hi Jack! How are you? How is your mom? Send me some money please']
messages = [m.lower() for m in messages]
messages = [m.translate(str.maketrans('', '', string.punctuation)) for m in messages]
messages_vector = count_vector.transform(messages).toarray()
messages_vector

array([[0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 0],
       [1, 0, 0, 0, 0, 1, 2, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1]],
      dtype=int64)