# SMS Spam Collection Dataset
https://www.kaggle.com/uciml/sms-spam-collection-dataset

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

data_dir = "../input/"

df = pd.read_csv(data_dir + '/spam.csv', encoding='latin-1')

# split into train and test
data_train, data_test, labels_train, labels_test = train_test_split(
    df.v2,
    df.v1, 
    test_size=0.2, 
    random_state=0) 

print(data_train.shape, data_test.shape, labels_train.shape, labels_test.shape)

(4457,) (1115,) (4457,) (1115,)


In [2]:

def GetVocabulary(data): 
    vocab_set = set([])
    for document in data:
        words = document.split()
        for word in words:
            vocab_set.add(word) 
    return list(vocab_set)

vocab_list = GetVocabulary(data_train)
print (vocab_list[:10])



['small,', 'dorm', 'morning:)', 'Summers', 'Dear', 'letters', 'wth', 'SEE', '430', 'afternon,']


In [3]:

def Document2Vector(vocab_list, data):
    word_vector = np.zeros(len(vocab_list))
    words = data.split()
    for word in words:
        if word in vocab_list:
            word_vector[vocab_list.index(word)] += 1
    return word_vector


In [4]:
train_matrix = []
for document in data_train.values:
    word_vector = Document2Vector(vocab_list, document)
    train_matrix.append(word_vector)

print (len(train_matrix[0]))

13504


In [5]:

def NaiveBayes_train(train_matrix,labels_train):
    num_docs = len(train_matrix) ##4457
    num_words = len(train_matrix[0]) ##13504
    
    spam_vector_count = np.ones(num_words)
    ham_vector_count = np.ones(num_words)  
    spam_total_count = num_words ##13504
    ham_total_count = num_words  ##13504                
    
    spam_count = 0
    ham_count = 0
    for i in range(num_docs):
        if labels_train[i] == 'spam':
            ham_vector_count += train_matrix[i]
            ham_total_count += sum(train_matrix[i])
            ham_count += 1
        else:
            spam_vector_count += train_matrix[i]
            spam_total_count += sum(train_matrix[i])
            spam_count += 1
    
    print (ham_count)
    print (spam_count)
    
    p_spam_vector = np.log(ham_vector_count/ham_total_count)
    p_ham_vector = np.log(spam_vector_count/spam_total_count)
    return p_spam_vector, np.log(spam_count/num_docs), p_ham_vector, np.log(ham_count/num_docs)
    
p_spam_vector, p_spam, p_ham_vector, p_ham = NaiveBayes_train(train_matrix, labels_train.values)

581
3876


In [None]:
data_test.values.shape

(1115,)

In [None]:

    
def Predict(test_word_vector,p_spam_vector, p_spam, p_ham_vector, p_ham):
    
    spam = sum(test_word_vector * p_spam_vector) + p_spam
    ham = sum(test_word_vector * p_ham_vector) + p_ham
    if spam > ham:
        return 'spam'
    else:
        return 'ham'

predictions = []
i = 0
for document in data_test.values:
    test_word_vector = Document2Vector(vocab_list, document)
    ans = Predict(test_word_vector, p_spam_vector, p_spam, p_ham_vector, p_ham)
    predictions.append(ans)

print (len(predictions))

In [None]:


from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score


print (accuracy_score(labels_test, predictions))
print (classification_report(labels_test, predictions))
print (confusion_matrix(labels_test, predictions))
