In [1]:
import numpy as np
import pandas as pd

# Preprocessing
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Feature extraction, model evaluation and hyperparemter optimization
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
df = pd.read_csv("spam.csv", encoding = 'latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
df = df[["v1", "v2"]]
df.columns = ["label", "message"]
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
## cleaning
nltk.download("stopwords")
def clean_message(message):    
    message = re.sub("[^A-Za-z]", " ", message)
    message = message.lower()
    message = message.split()
    stemmer = PorterStemmer()
    message = [stemmer.stem(word) for word in message if word not in set(stopwords.words("english"))]
    message = " ".join(message)
    return message

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
messages = []
for i in range(0, len(df)):
    cleaned_message = clean_message(df.message[i])
    messages.append(cleaned_message)
print(messages[0])

go jurong point crazi avail bugi n great world la e buffet cine got amor wat


In [6]:
labels = df["label"].values

def train(features,models):
  features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.20, random_state = 42)
  k_fold = StratifiedKFold(n_splits = 5)
  for model in models:
    model.fit(features_train, labels_train)

    scores = cross_val_score(model, features_train, labels_train, cv = k_fold)
    print(type(model))
    print("Mean score:" , scores.mean(), "Std:", scores.std())
    print()

    labels_predicted = model.predict(features_test)
    
    print("Test Accuracy Score:", accuracy_score(labels_test, labels_predicted))
    print(classification_report(labels_test, labels_predicted))
    print("--------------------------------------------------")


In [7]:
## word count vectorize
count_vectorizer = CountVectorizer()
features = count_vectorizer.fit_transform(messages).toarray()
train(features,  [LogisticRegression(), MultinomialNB(), SVC()])

<class 'sklearn.linear_model._logistic.LogisticRegression'>
Mean score: 0.980926857010564 Std: 0.004660957535335627

Test Accuracy Score: 0.9775784753363229
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       965
        spam       1.00      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------
<class 'sklearn.naive_bayes.MultinomialNB'>
Mean score: 0.9786842012552027 Std: 0.00369297412139312

Test Accuracy Score: 0.9739910313901345
              precision    recall  f1-score   support

         ham       0.99      0.98      0.98       965
        spam       0.88      0.94      0.91       150

    accuracy                           0.97      1115
   macro avg       0.93      0.96      0.95      1115
weighted avg       0.98      0.97      0.97      111

In [8]:
## tf idf verctorize
tfidfVectorizer = TfidfVectorizer()
features = tfidfVectorizer.fit_transform(messages).toarray()
train(features,  [LogisticRegression(), MultinomialNB(), SVC()])

<class 'sklearn.linear_model._logistic.LogisticRegression'>
Mean score: 0.9596113099102637 Std: 0.008732063400315265

Test Accuracy Score: 0.9551569506726457
              precision    recall  f1-score   support

         ham       0.95      1.00      0.97       965
        spam       0.96      0.69      0.81       150

    accuracy                           0.96      1115
   macro avg       0.96      0.84      0.89      1115
weighted avg       0.96      0.96      0.95      1115

--------------------------------------------------
<class 'sklearn.naive_bayes.MultinomialNB'>
Mean score: 0.963427247059534 Std: 0.0023266457388220712

Test Accuracy Score: 0.9659192825112107
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.75      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1

we can see that svc model nearly has the same metrics with both features (tf-idf and word count). 
it seems models work better with wordcountvectorizer than tfidfvectorizer. I guess that when data is short and we have few unique dictionary words, this feature (word count vectorizer) works better but in general, tf idf is better as it considers generality of a word and ...

In [9]:
## hashing verctorize
hashingVectorizer = HashingVectorizer()
features = tfidfVectorizer.fit_transform(messages).toarray()
train(features,  [LogisticRegression(), MultinomialNB(), SVC()])

<class 'sklearn.linear_model._logistic.LogisticRegression'>
Mean score: 0.9596113099102637 Std: 0.008732063400315265

Test Accuracy Score: 0.9551569506726457
              precision    recall  f1-score   support

         ham       0.95      1.00      0.97       965
        spam       0.96      0.69      0.81       150

    accuracy                           0.96      1115
   macro avg       0.96      0.84      0.89      1115
weighted avg       0.96      0.96      0.95      1115

--------------------------------------------------
<class 'sklearn.naive_bayes.MultinomialNB'>
Mean score: 0.963427247059534 Std: 0.0023266457388220712

Test Accuracy Score: 0.9659192825112107
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.75      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1

It turns a collection of text documents into a scipy.sparse matrix holding token occurrence counts (or binary occurrence information), possibly normalized as token frequencies if norm=’l1’ or projected on the euclidean unit sphere if norm=’l2’.

In [12]:
## 0 - 1 occurance
count_vectorizer = CountVectorizer()
features = count_vectorizer.fit_transform(messages).toarray()
features_list = features.tolist()
for vector in features_list:
  for i in range(len(vector)):
    if vector[i] > 0:
      vector[i] = 1
train(features_list,  [LogisticRegression(), MultinomialNB(), SVC()])

<class 'sklearn.linear_model._logistic.LogisticRegression'>
Mean score: 0.980926857010564 Std: 0.004145927084854392

Test Accuracy Score: 0.9757847533632287
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       965
        spam       1.00      0.82      0.90       150

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.98      0.97      1115

--------------------------------------------------
<class 'sklearn.naive_bayes.MultinomialNB'>
Mean score: 0.9798057807773801 Std: 0.003693354174248026

Test Accuracy Score: 0.9695067264573991
              precision    recall  f1-score   support

         ham       0.99      0.98      0.98       965
        spam       0.85      0.93      0.89       150

    accuracy                           0.97      1115
   macro avg       0.92      0.95      0.94      1115
weighted avg       0.97      0.97      0.97      11

in this part, we vectorize by using 1 value for occurance of the word in the sentence and 0 for not ocuurance. as we see for a model like naive bayes which mostly depends on word frequency features, the precission has fallen.