<a href="https://www.kaggle.com/sid9300/learning-spam-detector?scriptVersionId=84542784" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

* [Importing Libraries](#importing)
* [Reading the File](#reading)
* [Preprocessing](#preprocessing)
* [Preparing to Create Features](#features)
* [Preparing to Create Train & Test Sets](#creating)
* [Preparing to Create Feature Maps](#feature-maps)
* [Training](#training)
* [Evaluation](#evaluation)

## <font color='#4a8bad'>Importing Libraries</font>
***
<a id="importing"></a>

In [1]:
import nltk
import random
import pandas as pd

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

## <font color='#4a8bad'>Reading the File</font>
***
<a id="reading"></a>

In [2]:
# Location of the labelled SMS corpus (Kaggle input directory).
DATA_PATH = "../input/spam-text-message-classification/SPAM text message 20170820 - Data.csv"

# Load the raw messages and preview the first rows.
spam = pd.read_csv(DATA_PATH)
spam.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Pair every message text with its label as (Message, Category) tuples.
# Zipping the two columns directly yields the same tuples in the same order
# as a row-by-row loop, without building a Series object for every row the
# way DataFrame.iterrows() does.
data_set = list(zip(spam["Message"], spam["Category"]))

print("Messages : ", len(data_set))
print("--------------------------")
data_set[:5]

Messages :  5572
--------------------------


[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  'ham'),
 ('Ok lar... Joking wif u oni...', 'ham'),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  'spam'),
 ('U dun say so early hor... U c already then say...', 'ham'),
 ("Nah I don't think he goes to usf, he lives around here though", 'ham')]

## <font color='#4a8bad'>Preprocessing</font>
***
<a id="preprocessing"></a>

In [4]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Build the stop-word lookup a single time. Calling stopwords.words("english")
# inside the filtering loop re-reads the list for every token and tests
# membership against a list; a frozenset gives the same answer with a hash lookup.
ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))

def preprocess(document, stem = True):
    """Normalise a raw message into a space-separated string of tokens.

    Steps: lower-case the text, tokenize it with ``word_tokenize``, drop
    English stop words, then reduce each remaining token with either the
    Porter stemmer or the WordNet lemmatizer.

    Parameters
    ----------
    document : str
        Raw message text.
    stem : bool, default True
        If True, apply ``stemmer.stem`` to each token; otherwise apply
        ``lemmatizer.lemmatize`` with ``pos='v'``.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # Change the sentence to lower case and tokenize into words
    tokens = word_tokenize(document.lower())

    # Remove the stop words
    tokens = [token for token in tokens if token not in ENGLISH_STOP_WORDS]

    if stem:
        tokens = [stemmer.stem(token) for token in tokens]
    else:
        tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]

    # Join the words back into a sentence
    return " ".join(tokens)

# Turn every (text, label) pair into (token list, label), keeping only
# tokens that are at least three characters long after preprocessing.
message_set = [
    ([token.lower() for token in preprocess(text).split() if len(token) >= 3], label)
    for text, label in data_set
]

print(message_set[:5])

[(['jurong', 'point', 'crazy..', 'avail', 'bugi', 'great', 'world', 'buffet', '...', 'cine', 'got', 'amor', 'wat', '...'], 'ham'), (['lar', '...', 'joke', 'wif', 'oni', '...'], 'ham'), (['free', 'entri', 'wkli', 'comp', 'win', 'cup', 'final', 'tkt', '21st', 'may', '2005.', 'text', '87121', 'receiv', 'entri', 'question', 'std', 'txt', 'rate', 'appli', '08452810075over18'], 'spam'), (['dun', 'say', 'earli', 'hor', '...', 'alreadi', 'say', '...'], 'ham'), (['nah', "n't", 'think', 'goe', 'usf', 'live', 'around', 'though'], 'ham')]


## <font color='#4a8bad'>Preparing to Create Features</font>
***
<a id="features"></a>

In [5]:
# Getting words from the messages
def get_words_in_messages(messages):
    """Flatten the token lists of all messages into one list.

    ``messages`` is an iterable of ``(tokens, category)`` pairs; the tokens
    of every pair are concatenated in order and the categories are ignored.
    """
    return [token for tokens, _category in messages for token in tokens]

# Getting the distinct words using the nltk library
def get_words_features(word_list):
    """Return the keys of an ``nltk.FreqDist`` built from ``word_list``.

    The keys are the distinct entries of ``word_list``; the counts held by
    the frequency distribution are not returned.
    """
    frequencies = nltk.FreqDist(word_list)
    distinct_words = frequencies.keys()
    return distinct_words

# Creating word features from the dataset: gather every token first,
# then reduce them to the set of distinct words.
all_tokens = get_words_in_messages(message_set)
word_features = get_words_features(all_tokens)
print("Word Features : ", len(word_features))

Word Features :  8018


## <font color='#4a8bad'>Preparing to Create Train & Test Sets</font>
***
<a id="creating"></a>

In [6]:
# Share of the data used for training; the remainder becomes the test set.
TRAIN_FRACTION = .8
# Fixed seed so that a fresh "Restart & Run All" always produces the same
# split (and therefore the same accuracy figures below).
SPLIT_SEED = 42

print("Total Length        : ", len(message_set))
print("----------------------------------------")

slice_index = int(len(message_set) * TRAIN_FRACTION)
print("Slicing By          : ", slice_index)
print("----------------------------------------")

# Shuffle in place with a dedicated, seeded generator so the global
# `random` state is left alone. NOTE: message_set is mutated, so re-running
# this cell without re-running the preprocessing cell reshuffles the
# already-shuffled list and yields a different split.
random.Random(SPLIT_SEED).shuffle(message_set)

train_set, test_set = message_set[:slice_index], message_set[slice_index:]
print("Length of Train Set : ", len(train_set))
print("Length of Test Set  : ", len(test_set))

Total Length        :  5572
----------------------------------------
Slicing By          :  4457
----------------------------------------
Length of Train Set :  4457
Length of Test Set  :  1115


## <font color='#4a8bad'>Preparing to Create Feature Maps</font>
***
<a id="feature-maps"></a>

In [7]:
def extract_features(document):
    """Map a tokenised document to boolean bag-of-words features.

    For every word in the module-level ``word_features`` the result holds a
    key ``'Contains(<word>)'`` whose value is True when that word occurs in
    ``document`` and False otherwise.
    """
    # A set makes each membership test independent of the document length.
    present = set(document)
    return {'Contains(%s)' % term: (term in present) for term in word_features}

# Build the labelled feature sets for both splits with the same extractor.
train_features, test_features = (
    nltk.classify.apply_features(extract_features, subset)
    for subset in (train_set, test_set)
)

print("Length of Train Features : ", len(train_features))
print("Length of Test Features  : ", len(test_features))

Length of Train Features :  4457
Length of Test Features  :  1115


## <font color='#4a8bad'>Training</font>
***
<a id="training"></a>

In [8]:
# Training the classifier with the Naive Bayes algorithm.
# `train_features` is the labelled feature set built from `train_set`
# via `extract_features` in the feature-map step.
spamClassifier = nltk.NaiveBayesClassifier.train(train_features)

## <font color='#4a8bad'>Evaluation</font>
***
<a id="evaluation"></a>

In [9]:
# Analyzing the accuracy on the training set (as a percentage, 2 decimals)
train_accuracy = nltk.classify.accuracy(spamClassifier, train_features)
print("Accuracy of Training Set : ", round(train_accuracy*100, 2))

Accuracy of Training Set :  99.26


In [10]:
# Analyzing the accuracy on the held-out test set (as a percentage, 2 decimals)
test_accuracy = nltk.classify.accuracy(spamClassifier, test_features)
print("Accuracy of Test Set    : ", round(test_accuracy*100, 2))

Accuracy of Test Set    :  98.3


In [11]:
# Testing example messages with our newly trained classifier
m1 = 'CONGRATULATIONS!! As a valued account holder you have been selected to receive a £900 prize reward! Valid 12 hours only.'
m2 = 'Hi, how r u?'

def message_to_features(message):
    """Convert a raw message into the feature dict the classifier expects.

    The classifier was trained on tokens that were lower-cased, stripped of
    stop words, stemmed by ``preprocess`` and filtered to length >= 3. A raw
    ``message.split()`` produces tokens such as 'CONGRATULATIONS!!' or
    'receive' that never match that vocabulary, so new messages must go
    through the same pipeline before feature extraction.
    """
    tokens = [token.lower() for token in preprocess(message).split() if len(token) >= 3]
    return extract_features(tokens)

print('Classification Result of 1st Example : ', spamClassifier.classify(message_to_features(m1)))
print('Classification Result of 2nd Example : ', spamClassifier.classify(message_to_features(m2)))

Classification Result of 1st Example :  spam
Classification Result of 2nd Example :  ham
