In [1]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from nltk.metrics.distance import jaccard_distance
import pandas as pd
import numpy as np
import re
import nltk

from nltk.metrics import ConfusionMatrix
from nltk.tokenize import word_tokenize

In [2]:
data_set = pd.read_csv('./given/smsspamcollection/SMSSpamCollection', sep='\t', names=['class', 'msg'])

In [3]:
class TextProcessor():    
    @staticmethod
    def clean_string(string):
        punctuation_marks_regex = r'[\n\t/\d\':,-_\(\)\.\?\";!]'
        punctuation_free_text = re.sub(punctuation_marks_regex, '', string).strip()
        lower_puntuation_free_text = punctuation_free_text.lower()
        lower_puntuation_and_spaces_free_text = re.sub(' +', ' ', lower_puntuation_free_text)
        return lower_puntuation_and_spaces_free_text

    @staticmethod
    def call(text):
        return word_tokenize(
            TextProcessor.clean_string(text)
        )

In [4]:
def strattified_shuffle_split(X, y, test_size=.5):
    train_i, test_i = next(
        StratifiedShuffleSplit(n_splits=1, test_size=test_size)
         .split(X, y)
    )
    return (X[train_i], y[train_i]), (X[test_i], y[test_i])

In [5]:
data_set['processed_msg'] = data_set['msg'].apply(TextProcessor.call)

## Bag Of Words Approach
### KNN Classifier

In [6]:
train, test = strattified_shuffle_split(data_set['msg'], data_set['class'])

In [7]:
cv = CountVectorizer()
train_vectorized = cv.fit_transform(train[0]), train[1]
test_vectorizer = cv.transform(test[0]), test[1]

In [8]:
clf = KNeighborsClassifier(1)
clf.fit(train_vectorized[0], train_vectorized[1])
predictions = clf.predict(test_vectorizer[0])

In [9]:
acc = accuracy_score(
    test_vectorizer[1],
    predictions
)

print(
f"""\
Bag of Words KNN
----------------

Accuracy: {acc}
Confussion Matrix:
{ConfusionMatrix(test_vectorizer[1].tolist(), predictions.tolist()).pretty_format()}
"""
)

Bag of Words KNN
----------------

Accuracy: 0.9486719310839914
Confussion Matrix:
     |         s |
     |    h    p |
     |    a    a |
     |    m    m |
-----+-----------+
 ham |<2411>   1 |
spam |  142 <232>|
-----+-----------+
(row = reference; col = test)




### SVM

In [10]:
clf = SVC(kernel='linear')
clf.fit(train_vectorized[0], train_vectorized[1])
predictions = clf.predict(test_vectorizer[0])

In [11]:
acc = accuracy_score(
    test_vectorizer[1],
    predictions
)

print(
f"""\
Bag of Words SVM
----------------

Accuracy: {acc}
Confussion Matrix:
{ConfusionMatrix(test_vectorizer[1].tolist(), predictions.tolist()).pretty_format()}
"""
)

Bag of Words SVM
----------------

Accuracy: 0.9838478104809764
Confussion Matrix:
     |         s |
     |    h    p |
     |    a    a |
     |    m    m |
-----+-----------+
 ham |<2411>   1 |
spam |   44 <330>|
-----+-----------+
(row = reference; col = test)




## Custom Approach
### KNN

In [12]:
class kNN(object):
    def __init__(self, metric_distance='jaccard'):
        self.alg = getattr(self,metric_distance, None)
        if self.alg == None: raise ValueError('Metric is invalid')
        
        self.X = None
        self.y = None
        
    def fit(self, X, y):
        self.X = X
        self.y = y
        
    def predict(self, X):
        result = []
        for row in X:
            min_index = self.X.tolist().index(
                min(
                    self.X,
                    key=(lambda alg, row: lambda x: alg(x, row))(self.alg, row)
                )
            )
            result.append(
                self.y.tolist()[min_index]
            )
        return np.array(result)
    
    def jaccard(self, sent1, sent2):
        if sent1 == []: sent1 = ['']
        if sent2 == []: sent2 = ['']
            
        return jaccard_distance(set(sent1), set(sent2))

In [13]:
train, test = strattified_shuffle_split(data_set['processed_msg'], data_set['class'])

In [14]:
clf = kNN()
clf.fit(train[0], train[1])
predictions = clf.predict(test[0])

In [15]:
acc = accuracy_score(
    test[1],
    predictions
)

print(
f"""\
Custom KNN
----------

Accuracy: {acc}
Confussion Matrix:
{ConfusionMatrix(test_vectorizer[1].tolist(), predictions.tolist()).pretty_format()}
"""
)

Custom KNN
----------

Accuracy: 0.9691313711414213
Confussion Matrix:
     |         s |
     |    h    p |
     |    a    a |
     |    m    m |
-----+-----------+
 ham |<2113> 299 |
spam |  326  <48>|
-----+-----------+
(row = reference; col = test)


