In [6]:
import pandas as pd

data = pd.read_csv('spam.csv')
data = data.drop(["Unnamed: 2","Unnamed: 3","Unnamed: 4"], axis=1)
data = data.rename(columns={"v1":"label", "v2":"text"})
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
import os
import re
import string
import math
import pandas as pd

class SpamDetector(object):
    """Implementation of Naive Bayes for binary classification"""
    def clean(self, s):
        translator = str.maketrans("", "", string.punctuation)
        return s.translate(translator)

    def tokenize(self, text):
        text = self.clean(text).lower()
        return re.split("\W+", text)

    def get_word_counts(self, words):
        word_counts = {}
        for word in words:
            word_counts[word] = word_counts.get(word, 0.0) + 1.0
        return word_counts

    def fit(self, X, Y):
        """Fit our classifier
        Arguments:
            X {list} -- list of document contents
            y {list} -- correct labels
        """
        self.num_messages = {}
        self.log_class_priors = {}
        self.word_counts = {}
        self.vocab = set()

        n = len(X)
        self.num_messages['spam'] = sum(1 for label in Y if label == 'spam')
        self.num_messages['ham'] = sum(1 for label in Y if label == 'ham')
        self.log_class_priors['spam'] = math.log(self.num_messages['spam'] / n )
        self.log_class_priors['ham'] = math.log(self.num_messages['ham'] / n )
        self.word_counts['spam'] = {}
        self.word_counts['ham'] = {}

        for x, y in zip(X, Y):
            c = 'spam' if y == 'spam' else 'ham'
            counts = self.get_word_counts(self.tokenize(x))
            for word, count in counts.items():
                if word not in self.vocab:
                    self.vocab.add(word)
                if word not in self.word_counts[c]:
                    self.word_counts[c][word] = 0.0

                self.word_counts[c][word] += count

    def predict(self, X):
        result = []
        for x in X:
            counts = self.get_word_counts(self.tokenize(x))
            spam_score = 0
            ham_score = 0
            for word, _ in counts.items():
                if word not in self.vocab: continue
                
                # add Laplace smoothing
                log_w_given_spam = math.log( (self.word_counts['spam'].get(word, 0.0) + 1) / (self.num_messages['spam'] + len(self.vocab)) )
                log_w_given_ham = math.log( (self.word_counts['ham'].get(word, 0.0) + 1) / (self.num_messages['ham'] + len(self.vocab)) )

                spam_score += log_w_given_spam
                ham_score += log_w_given_ham

            spam_score += self.log_class_priors['spam']
            ham_score += self.log_class_priors['ham']

            if spam_score > ham_score:
                result.append('spam')
            else:
                result.append('ham')
        return result
        

if __name__ == '__main__':
    from sklearn.model_selection import train_test_split
    data = pd.read_csv('spam.csv',encoding='latin-1')
    data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
    data = data.rename(columns={"v1":'label', "v2":'text'})
    print(data.head())
    tags = data["label"]
    texts = data["text"]
    X, y = texts, tags
    print(len(X))
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    MNB = SpamDetector()
    MNB.fit(X_train.values, y_train.values)
    print(MNB.num_messages)
#     print(MNB.word_counts)
    pred = MNB.predict(X_test.values)
    true = y_test.values
    accuracy = sum(1 for i in range(len(pred)) if pred[i] == true[i]) / float(len(pred))
    print("{0:.4f}".format(accuracy))

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
5572
{'spam': 567, 'ham': 3612}
0.9612


In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

data = pd.read_csv('spam.csv',encoding='latin-1')
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data = data.rename(columns={"v1":'label', "v2":'text'})
print(data.head())
tags = data["label"]
texts = data["text"]

X, y = texts, tags

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


vectorizer = CountVectorizer()
X_train_dtm = vectorizer.fit_transform(X_train)
print(X_train_dtm)

nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
X_test_dtm = vectorizer.transform(X_test)
y_pred_class = nb.predict(X_test_dtm)
metrics.accuracy_score(y_test, y_pred_class)

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
  (0, 4514)	1
  (0, 3626)	1
  (0, 3637)	1
  (0, 3483)	1
  (0, 5671)	1
  (0, 3253)	1
  (0, 4456)	1
  (0, 2226)	1
  (0, 4785)	1
  (0, 1478)	1
  (0, 4841)	1
  (0, 3441)	1
  (0, 2208)	1
  (0, 6315)	2
  (0, 6592)	1
  (0, 1525)	2
  (0, 4163)	2
  (0, 7363)	1
  (0, 5052)	2
  (0, 2102)	1
  (0, 6715)	1
  (0, 5700)	1
  (0, 808)	1
  (0, 791)	1
  (0, 908)	1
  :	:
  (4176, 4881)	1
  (4176, 4820)	1
  (4176, 3429)	1
  (4176, 1580)	1
  (4176, 4206)	1
  (4176, 7152)	1
  (4176, 4437)	1
  (4176, 6626)	1
  (4176, 2294)	1
  (4176, 3406)	1
  (4176, 3242)	1
  (4176, 4734)	1
  (4177, 6941)	1
  (4177, 5219)	1
  (4177, 1838)	1
  (4177, 4753)	1
  (4177, 3152)	1
  (4

0.9870782483847811