# Spam vs. Ham SMS Detection

In [1]:
import random
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [4]:
# Load the tab-separated SMS corpus; the two columns are named explicitly
# because the column names are supplied here rather than read from the file.
spam = pd.read_csv(
    'https://cdn.upgrad.com/UpGrad/temp/a4964625-11c7-4043-adc5-23c0160b2ac1/SMSSpamCollection.txt',
    names=['label', 'message'],
    sep='\t',
)
spam.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
## Convert the loaded dataset into a list of tuples, each tuple (row) containing the message and its label.
## zip() over the two columns yields the same (message, label) pairs as a row-by-row
## iterrows() loop without building a Series object for every row.
data_set = list(zip(spam['message'], spam['label']))

In [6]:
# Peek at the first five (message, label) tuples.
print(data_set[:5])

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'ham'), ('Ok lar... Joking wif u oni...', 'ham'), ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'spam'), ('U dun say so early hor... U c already then say...', 'ham'), ("Nah I don't think he goes to usf, he lives around here though", 'ham')]


### Preprocessing

In [7]:
## Initialise the built-in Porter stemmer and WordNet lemmatizer once;
## both module-level instances are reused by preprocess().
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [8]:
def preprocess(document, stem=True):
    """Lower-case, tokenize, drop English stop words, then stem or lemmatize.

    Args:
        document: Text of a single message.
        stem: If True (default), each remaining token is passed through the
            module-level ``stemmer``; otherwise it is lemmatized as a verb
            (``pos='v'``) with the module-level ``wordnet_lemmatizer``.

    Returns:
        The processed tokens joined back into one space-separated string.
    """
    # change sentence to lower case and tokenize into words
    words = word_tokenize(document.lower())

    # Build the stop-word set once per call instead of calling
    # stopwords.words() again for every token; a set also makes each
    # membership test constant-time.
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words]

    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

    # join words to make sentence
    return " ".join(words)

In [9]:
## - Run the preprocessing (lemmatizing variant) over every message and keep
##   only tokens of at least three characters, paired with the original label
messages_set = [
    (
        [token.lower() for token in preprocess(text, stem=False).split() if len(token) >= 3],
        tag,
    )
    for text, tag in data_set
]

In [10]:
# Peek at the first five (tokens, label) pairs after preprocessing.
print(messages_set[:5])

[(['jurong', 'point', 'crazy..', 'available', 'bugis', 'great', 'world', 'buffet', '...', 'cine', 'get', 'amore', 'wat', '...'], 'ham'), (['lar', '...', 'joke', 'wif', 'oni', '...'], 'ham'), (['free', 'entry', 'wkly', 'comp', 'win', 'cup', 'final', 'tkts', '21st', 'may', '2005.', 'text', '87121', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'apply', '08452810075over18'], 'spam'), (['dun', 'say', 'early', 'hor', '...', 'already', 'say', '...'], 'ham'), (['nah', "n't", 'think', 'usf', 'live', 'around', 'though'], 'ham')]


### Preparing to create features

In [14]:
def get_words_in_messages(messages):
    """Flatten the token lists of all (tokens, label) pairs into one list.

    Order is preserved and duplicates are kept; the labels are ignored.
    """
    return [word for tokens, _label in messages for word in tokens]

In [15]:
def get_word_features(wordlist):
    """Return the distinct words of ``wordlist`` as the keys of an nltk.FreqDist.

    The frequency counts themselves are discarded; only the key view is returned.
    """
    return nltk.FreqDist(wordlist).keys()

In [16]:
# Build the vocabulary from every token in messages_set and report its size.
# NOTE(review): messages_set has not been split yet at this point, so the
# vocabulary also contains words that occur only in messages that later end
# up in the test split.
word_features = get_word_features(get_words_in_messages(messages_set))
print(len(word_features))

8393


### Preparing to create a train and test set

In [17]:
# Index at which the message list is cut: the first 80% is used for training.
sliceIndex = int(0.8 * len(messages_set))

In [18]:
# Shuffle with a dedicated, fixed-seed generator so that a fresh
# Restart & Run All always produces the same train/test split (and therefore
# the same accuracy figures). The list is shuffled in place.
random.Random(42).shuffle(messages_set)

In [19]:
# Everything before sliceIndex is training data, the remainder is test data.
train_messages = messages_set[:sliceIndex]
test_messages = messages_set[sliceIndex:]

In [20]:
# Number of messages in the training split.
len(train_messages)

4457

In [21]:
# Number of messages in the test split.
len(test_messages)

1115

### Preparing to create feature maps for train and test data

In [23]:
def extract_features(document):
    """Map a token sequence to a boolean bag-of-words feature dict.

    For every word in the module-level ``word_features`` the result holds a
    key ``'contains(<word>)'`` whose value says whether that word occurs in
    ``document``.
    """
    present = set(document)
    return {'contains(%s)' % word: (word in present) for word in word_features}

In [24]:
## - Create the feature maps for the train and test data by applying
##   extract_features to each split via nltk.classify.apply_features

training_set = nltk.classify.apply_features(extract_features, train_messages)
testing_set = nltk.classify.apply_features(extract_features, test_messages)

In [25]:
# Show the first five training entries. Each feature dict has one key per
# vocabulary word, so this output is very long.
print(training_set[:5])

[({'contains(jurong)': False, 'contains(point)': False, 'contains(crazy..)': False, 'contains(available)': False, 'contains(bugis)': False, 'contains(great)': False, 'contains(world)': False, 'contains(buffet)': False, 'contains(...)': False, 'contains(cine)': False, 'contains(get)': False, 'contains(amore)': False, 'contains(wat)': False, 'contains(lar)': False, 'contains(joke)': False, 'contains(wif)': False, 'contains(oni)': False, 'contains(free)': False, 'contains(entry)': False, 'contains(wkly)': False, 'contains(comp)': False, 'contains(win)': False, 'contains(cup)': False, 'contains(final)': False, 'contains(tkts)': False, 'contains(21st)': False, 'contains(may)': False, 'contains(2005.)': False, 'contains(text)': False, 'contains(87121)': False, 'contains(receive)': False, 'contains(question)': False, 'contains(std)': False, 'contains(txt)': False, 'contains(rate)': False, 'contains(apply)': False, 'contains(08452810075over18)': False, 'contains(dun)': False, 'contains(say)': 

In [26]:
# Sanity check: the feature sets should have the same sizes as the message splits.
print('Training set size : ', len(training_set))
print('Test set size : ', len(testing_set))

Training set size :  4457
Test set size :  1115


### Training: 

In [27]:
# Fit NLTK's Naive Bayes classifier on the training feature set.
spam_classifier = nltk.NaiveBayesClassifier.train(training_set)

### Evaluation:

In [28]:
# Accuracy on the data the classifier was trained on.
print(nltk.classify.accuracy(spam_classifier, training_set))

0.9921471842046219


In [None]:
# Accuracy on the held-out test split.
print(nltk.classify.accuracy(spam_classifier, testing_set))

In [30]:
# Classify an unseen message. It must go through the same pipeline as the
# training data (preprocess with stem=False, then drop tokens shorter than
# three characters); tokens from a plain str.split() keep their original case
# and attached punctuation and so cannot match the keys built from word_features.
m = 'CONGRATULATIONS!! As a valued account holder you have been selected to receive a £900 prize reward! Valid 12 hours only.'
m_tokens = [e for e in preprocess(m, stem=False).split() if len(e) >= 3]
print('Classification result : ', spam_classifier.classify(extract_features(m_tokens)))

Classification result :  spam


In [32]:
# show_most_informative_features() prints its table itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
spam_classifier.show_most_informative_features(50)

Most Informative Features
         contains(award) = True             spam : ham    =    189.9 : 1.0
        contains(urgent) = True             spam : ham    =     96.7 : 1.0
         contains(await) = True             spam : ham    =     96.1 : 1.0
          contains(rate) = True             spam : ham    =     87.1 : 1.0
      contains(delivery) = True             spam : ham    =     87.1 : 1.0
          contains(code) = True             spam : ham    =     87.1 : 1.0
       contains(service) = True             spam : ham    =     85.6 : 1.0
         contains(nokia) = True             spam : ham    =     81.8 : 1.0
       contains(private) = True             spam : ham    =     73.7 : 1.0
      contains(landline) = True             spam : ham    =     73.7 : 1.0
          contains(club) = True             spam : ham    =     69.2 : 1.0
           contains(txt) = True             spam : ham    =     66.7 : 1.0
     contains(statement) = True             spam : ham    =     64.8 : 1.0

### Save the model to disk:

In [33]:
import pickle

# Persist the trained classifier. The `with` block guarantees the file handle
# is closed even if pickling raises part-way through.
with open('spam_or_ham.pickle', 'wb') as f:
    pickle.dump(spam_classifier, f)
print('Classifier sotred at: ', f.name)

Classifier sotred at:  spam_or_ham.pickle
