SMS SPAM CLASSIFICATION
==
**Model 2**

In [1]:
import re
import numpy as np
import pandas as pd
# split into training and testing sets
# USE from sklearn.model_selection import train_test_split to avoid seeing deprecation warning.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
import catboost
import xgboost
import clean

df = pd.read_csv("data/sms-20190303110043.csv", quotechar='"', quoting=1)
cols = list(df.columns)

# Output printing out first 5 rows
df.head()

Unnamed: 0,address,body,in_address_book,is_spam
0,MPESA,MGG8JN21NA Confirmed.Your M-PESA balance was ...,0,0
1,MPESA,"MGG2JN2WU6 Confirmed. Ksh2,080.00 paid to Java...",0,0
2,MPESA,MGG5JSJ9PF Confirmed. Ksh700.00 paid to SHINE ...,0,0
3,MPESA,"MGG3JT28BJ Confirmed. Ksh3,290.00 paid to JOSM...",0,0
4,MPESA,MGH6KB45VK Confirmed. Ksh350.00 paid to Pete's...,0,0


Let us clean and stem our sms text

In [2]:
# Create a new column by clean and stem the text column
df['sms_text'] = clean.clean_stem(df['body'])

text_to_inject = "abcdefgz"
def inject_word(value: str):
    return "{} {}".format(text_to_inject, value)

# prefix the sms_text with `abcdefgz` if sender was in address book
df['sms_text'] = df.apply(lambda x: inject_word(x.sms_text) if x.in_address_book == 1 else x.sms_text, axis=1)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df['sms_text'], 
                                                    df['is_spam'], 
                                                    random_state=1)

print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 3656
Number of rows in the training set: 2742
Number of rows in the test set: 914


Lets now train the data

In [4]:
# create a count vectorizer object 
count_vector = CountVectorizer(analyzer='word', stop_words="english")
count_vector.fit(df['sms_text'])
cv_training_data = count_vector.fit_transform(X_train)
cv_testing_data = count_vector.transform(X_test)

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words="english", max_features=5000)
tfidf_vect.fit(df['sms_text'])
tv_training_data =  tfidf_vect.transform(X_train)
tv_testing_data =  tfidf_vect.transform(X_test)

In [5]:
import time

def train_model(clf, training_data, testing_data, training_label):
    
    name = clf.__class__.__name__
    if name == "XGBClassifier":
        training_data = training_data.tocsc()
        testing_data = testing_data.tocsc()
        
    if name == "CatBoostClassifier":
        training_data = training_data.toarray()
        testing_data = testing_data.toarray()
    
    # fit the training dataset on the classifier
    start = time.time()
    clf.fit(training_data, training_label)
    end = time.time()
    training_time = end - start
    
    # predict the labels on validation dataset
    start = time.time()
    predict = clf.predict(testing_data)
    end = time.time()
    prediction_time = end - start
    
    return training_time, prediction_time, accuracy_score(predict, y_test)

In [6]:
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

classifiers = (
    MultinomialNB(),
    LogisticRegression(solver="lbfgs", max_iter=1000),
    SVC(gamma='scale'),
    RandomForestClassifier(n_estimators=100),
    XGBClassifier(),
    CatBoostClassifier(
        iterations=300, learning_rate=0.1, depth=6, loss_function='Logloss', logging_level='Silent', random_seed=21)    
)


In [7]:
d = {
    "Operation": [],
    "Training time": [],
    "Predicting time": [],
    "Accuracy": []
}
for clf in classifiers:
    name = clf.__class__.__name__
    
    # on Count Vectors
    training_time, prediction_time, accuracy = train_model(clf, cv_training_data, cv_testing_data, y_train)
    d["Operation"].append("{}, Count Vectors".format(name))
    d["Training time"].append("{:.6f}".format(training_time))
    d["Predicting time"].append("{:.6f}".format(prediction_time))
    d["Accuracy"].append("{:.6f}".format(accuracy))
    
    # on Word Level TF IDF Vectors
    training_time, prediction_time, accuracy = train_model(clf, tv_training_data, tv_testing_data, y_train)    
    d["Operation"].append("{}, Count Vectors".format(name))
    d["Training time"].append("{:.6f}".format(training_time))
    d["Predicting time"].append("{:.6f}".format(prediction_time))
    d["Accuracy"].append("{:.6f}".format(accuracy))
    

res_df = pd.DataFrame(data=d)
res_df

Unnamed: 0,Operation,Training time,Predicting time,Accuracy
0,"MultinomialNB, Count Vectors",0.002741,0.000192,0.900438
1,"MultinomialNB, Count Vectors",0.001351,0.00018,0.90919
2,"LogisticRegression, Count Vectors",0.098522,0.000124,0.920131
3,"LogisticRegression, Count Vectors",0.03055,0.000125,0.913567
4,"SVC, Count Vectors",0.351632,0.10534,0.929978
5,"SVC, Count Vectors",0.447818,0.13048,0.932166
6,"RandomForestClassifier, Count Vectors",1.317862,0.025613,0.916849
7,"RandomForestClassifier, Count Vectors",1.811051,0.025929,0.917943
8,"XGBClassifier, Count Vectors",1.019703,0.007383,0.904814
9,"XGBClassifier, Count Vectors",1.327915,0.007415,0.916849
