SMS SPAM CLASSIFICATION
==
**Model 1**

In [1]:
import re
import numpy as np
import pandas as pd
# train_test_split divides the data into training and testing sets.
# NOTE: prefer `from sklearn.model_selection import train_test_split`; the
# `sklearn.cross_validation` module is deprecated and was removed in scikit-learn 0.20.
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from porter_stemmer import PorterStemmer

# Load the exported SMS dump; quoting=1 is the numeric value of csv.QUOTE_ALL.
df = pd.read_csv(
    "data/sms-20190303110043.csv",
    quotechar='"',
    quoting=1,
)
# Column labels of the loaded frame as a plain Python list.
cols = [*df.columns]

# Preview the first five rows
df.head()

Unnamed: 0,address,body,in_address_book,is_spam
0,MPESA,MGG8JN21NA Confirmed.Your M-PESA balance was ...,0,0
1,MPESA,"MGG2JN2WU6 Confirmed. Ksh2,080.00 paid to Java...",0,0
2,MPESA,MGG5JSJ9PF Confirmed. Ksh700.00 paid to SHINE ...,0,0
3,MPESA,"MGG3JT28BJ Confirmed. Ksh3,290.00 paid to JOSM...",0,0
4,MPESA,MGH6KB45VK Confirmed. Ksh350.00 paid to Pete's...,0,0


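Next, we clean and stem the SMS text so that message-specific tokens (transaction IDs, amounts, phone numbers) collapse into shared placeholders.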

In [2]:
# Single shared stemmer instance; word_stemming() calls STEM.stem() on every token.
STEM = PorterStemmer()

def repl_mpesa_ids(s:str, sub="xxmpesa confirmed") -> str:
    """Swap M-PESA transaction references for a generic placeholder.

    Two kinds of text are replaced with ``sub``:

    * a leading run of word characters followed by `` confirm...`` at the
      very start of the message (e.g. ``"mgg8jn21na confirmed"``), and
    * any run of 32 consecutive word characters, wherever it occurs.

    The pattern is written in lowercase and no case-insensitive flag is set,
    so the message should be lowercased before it is passed in.
    """
    # NOTE(review): the 32-character branch has the same shape that
    # repl_phone_nums looks for. Because this function runs over the whole
    # message first, the later phone-number replacement looks like it can
    # never match -- confirm whether that is intended.
    pattern = re.compile(r"(^(\w)* confirm(\w)*)|(\w{32})")
    return pattern.sub(sub, s)

def repl_phone_nums(s:str, sub="xxphonenum") -> str:
    """Mask a hashed phone number that opens ``s``.

    A run of 32 word characters anchored at the beginning of the string is
    replaced with ``sub``; any other input is returned unchanged.
    """
    hashed_prefix = re.compile(r"^(\w{32})")
    return hashed_prefix.sub(sub, s)

def repl_num(s:str, sub="xxnum") -> str:
    """Return ``sub`` when ``s`` consists solely of digits, otherwise ``s`` itself."""
    return sub if s.isdigit() else s

def repl_measures(s:str, sub="xxmeasure") -> str:
    """Replace measure-like text such as 1pm, 12am, 12kg, 12mb, 19th, 12:23.

    Two shapes are recognised:

    * the whole string is one or more digits followed by at least one more
      word character (``12kg``, ``19th``) -- note that a bare number of two
      or more digits also fits this shape;
    * ``digits:digits`` anywhere inside the string (``12:23``).
    """
    measure_re = re.compile(r"((^(\d)+(\w)+$)|(\d+:\d+))")
    return measure_re.sub(sub, s)

def repl_money(s:str, sub="xxcurr") -> str:
    """Replace money-like (and date-like) text, e.g. Ksh12 or Ksh3,227.00.

    The pattern is an alternation of four shapes; each alternative is
    annotated below. The adjacent raw literals concatenate to a single
    regular expression.
    """
    money_re = re.compile(
        r"("
        r"((\w){2,4}(\d)+$)"                    # 2-4 word chars, then digits to the end: Ksh12
        r"|(((\w){2,4})?(\d+)\,(\d+)\.(\d+))"   # optional prefix, thousands + decimals: Ksh3,227.00
        r"|(((\w){2,4})?(\d+)\.(\d+))"          # optional prefix, decimals only: Ksh700.00
        r"|(\d+/\d+/\d+)"                       # three slash-separated numbers: 3/3/19
        r")"
    )
    return money_re.sub(sub, s)

def word_stemming(word) -> iter:
    """Lowercase ``word``, split it on non-alphanumerics and stem each piece.

    Stemming uses https://tartarus.org/martin/PorterStemmer/

    Args:
        word: Text to tokenise and stem; it may contain punctuation, in which
            case it yields several tokens (``"pete's"`` -> ``pete``, ``s``).

    Returns:
        An iterator over the stemmed tokens. Empty or ``None`` input gives an
        empty iterator, so the result can always be iterated.
    """
    if not word:
        # The falsy input itself used to be returned here, which handed
        # callers None (not iterable) or a str instead of an iterator.
        return iter(())
    # Every run of characters outside [0-9a-zA-Z] becomes a single space,
    # so punctuation acts as a token separator.
    new_words = re.sub('[^0-9a-zA-Z]+', ' ', word.lower())
    # Presumably stem(text, start, end) with an inclusive end index, as in
    # the reference Porter implementation -- confirm against porter_stemmer.
    return (STEM.stem(c, 0, len(c) - 1) for c in new_words.split())

def preprocess(
        words: list,
        replace_unique=True,
        stem=True,
        replace_nums=True,
        replace_measures=True) -> iter:
    """Normalise a sequence of raw tokens into generalised, stemmed tokens.

    Steps, in order:

    1. money-like tokens are replaced via ``repl_money``;
    2. leading 32-character hashes are replaced via ``repl_phone_nums``;
    3. if ``stem``, each token is split and stemmed via ``word_stemming``
       (one input token may become several output tokens);
    4. if ``replace_nums``, all-digit tokens are replaced via ``repl_num``;
    5. if ``replace_measures``, measure-like tokens are replaced via
       ``repl_measures``.

    Args:
        words: Iterable of tokens, e.g. the result of ``text.split()``.
        replace_unique: Accepted for interface compatibility; the current
            implementation does not consult it.
        stem: Whether to run the stemming step.
        replace_nums: Whether to generalise pure numbers.
        replace_measures: Whether to generalise measures.

    Returns:
        A generator over the processed tokens with empty tokens dropped.
    """
    # Generalize money occurrences first, then hashed phone numbers.
    words = (repl_phone_nums(repl_money(word)) for word in words)

    # Stemming: flatten the per-token results in one pass. Growing a tuple
    # with ``+=`` inside a loop copies it on every step (quadratic).
    if stem:
        words = [token for word in words for token in word_stemming(word)]

    # Generalize all numbers
    if replace_nums:
        words = (repl_num(word) for word in words)

    # Generalize all measures
    if replace_measures:
        words = (repl_measures(word) for word in words)

    return (word for word in words if word)

# Build the cleaned text column in one pass over `body`:
#   1. lower-case the raw message,
#   2. replace M-PESA references with a placeholder,
#   3. tokenise on whitespace, clean/stem the tokens and re-join with spaces.
df['sms_text'] = (
    df['body']
    .apply(lambda body: body.lower())
    .apply(repl_mpesa_ids)
    .apply(lambda text: " ".join(preprocess(text.split())))
)

df.head()

Unnamed: 0,address,body,in_address_book,is_spam,sms_text
0,MPESA,MGG8JN21NA Confirmed.Your M-PESA balance was ...,0,0,xxmpesa confirm your m pesa balanc wa xxcurr o...
1,MPESA,"MGG2JN2WU6 Confirmed. Ksh2,080.00 paid to Java...",0,0,xxmpesa confirm xxcurr paid to java orbit on x...
2,MPESA,MGG5JSJ9PF Confirmed. Ksh700.00 paid to SHINE ...,0,0,xxmpesa confirm xxcurr paid to shine in sparkl...
3,MPESA,"MGG3JT28BJ Confirmed. Ksh3,290.00 paid to JOSM...",0,0,xxmpesa confirm xxcurr paid to josma invest da...
4,MPESA,MGH6KB45VK Confirmed. Ksh350.00 paid to Pete's...,0,0,xxmpesa confirm xxcurr paid to pete s cafe cha...


In [3]:
# words_array = count_vector.transform(df['sms_text'])
# words_array

In [4]:
# frequency_matrix = pd.DataFrame(words_array.toarray(), columns=count_vector.get_feature_names())
## TODO The matrix below has words that consist of numbers which we need to remove
# frequency_matrix

In [5]:
# Hold out part of the data for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['sms_text'],
    df['is_spam'],
    random_state=1,
)

# Report the size of each partition.
print('Number of rows in the total set: {}'.format(len(df)))
print('Number of rows in the training set: {}'.format(len(X_train)))
print('Number of rows in the test set: {}'.format(len(X_test)))

Number of rows in the total set: 3656
Number of rows in the training set: 2742
Number of rows in the test set: 914


In [6]:
# Bag-of-words features, with the vectorizer's English stop-word list applied.
count_vector = CountVectorizer(stop_words="english")

# Learn the vocabulary from the training messages only and encode them.
training_data = count_vector.fit_transform(X_train)

# Encode the test messages with that same vocabulary (transform only, no refit).
testing_data = count_vector.transform(X_test)

# Train the classifier; fit() returns the fitted estimator itself.
naive_bayes = MultinomialNB().fit(training_data, y_train)

predictions = naive_bayes.predict(testing_data)

# Evaluate on the held-out messages.
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.8916849015317286
Precision score:  0.8678414096916299
Recall score:  0.9099307159353349
F1 score:  0.8883878241262683
