### SPAM Ham Detection

In [2]:
import random
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
## Reading the given dataset
spam = pd.read_csv("https://cdn.upgrad.com/UpGrad/temp/bab3e784-e601-4911-9000-f1fbc994a62d/SMSSpamCollection.txt",  sep='\t', names=["label", "message"])

In [3]:
print(spam.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
## Converting the read dataset in to a list of tuples, each tuple(row) contianing the message and it's label
data_set = []
for index,row in spam.iterrows():
    data_set.append((row['message'], row['label']))

In [5]:
print(data_set[:5])

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'ham'), ('Ok lar... Joking wif u oni...', 'ham'), ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'spam'), ('U dun say so early hor... U c already then say...', 'ham'), ("Nah I don't think he goes to usf, he lives around here though", 'ham')]


In [6]:
print(len(data_set))

5572


### Preprocessing

In [7]:
## initialise the inbuilt Stemmer and the Lemmatizer
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [8]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

def preprocess(document, stem=True):
    'changes document to lower case, removes stopwords and lemmatizes/stems the remainder of the sentence'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]

    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

    # join words to make sentence
    document = " ".join(words)

    return document

[nltk_data] Downloading package punkt to /home/simplify/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/simplify/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
## - Performing the preprocessing steps on all messages
messages_set = []
for (message, label) in data_set:
    words_filtered = [e.lower() for e in preprocess(message, stem=True).split() if len(e) >= 3]
    messages_set.append((words_filtered, label))

In [10]:
print(messages_set[:5])

[(['jurong', 'point', 'crazi', 'avail', 'bugi', 'great', 'world', 'buffet', '...', 'cine', 'got', 'amor', 'wat', '...'], 'ham'), (['lar', '...', 'joke', 'wif', 'oni', '...'], 'ham'), (['free', 'entri', 'wkli', 'comp', 'win', 'cup', 'final', 'tkt', '21st', 'may', '2005.', 'text', '87121', 'receiv', 'entri', 'question', 'std', 'txt', 'rate', 'appli', '08452810075over18'], 'spam'), (['dun', 'say', 'earli', 'hor', '...', 'alreadi', 'say', '...'], 'ham'), (['nah', "n't", 'think', 'goe', 'usf', 'live', 'around', 'though'], 'ham')]


### Preparing to create features

In [11]:
## - creating a single list of all words in the entire dataset for feature list creation

def get_words_in_messages(messages):
    all_words = []
    for (message, label) in messages:
      all_words.extend(message)
    return all_words

In [12]:
## - creating a final feature list using an intuitive FreqDist, to eliminate all the duplicate words
## Note : we can use the Frequency Distribution of the entire dataset to calculate Tf-Idf scores like we did earlier.

def get_word_features(wordlist):

    #print(wordlist[:10])
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

In [13]:
## - creating the word features for the entire dataset
word_features = get_word_features(get_words_in_messages(messages_set))
print(len(word_features))

7618


In [14]:
word_features

dict_keys(['jurong', 'point', 'crazi', 'avail', 'bugi', 'great', 'world', 'buffet', '...', 'cine', 'got', 'amor', 'wat', 'lar', 'joke', 'wif', 'oni', 'free', 'entri', 'wkli', 'comp', 'win', 'cup', 'final', 'tkt', '21st', 'may', '2005.', 'text', '87121', 'receiv', 'question', 'std', 'txt', 'rate', 'appli', '08452810075over18', 'dun', 'say', 'earli', 'hor', 'alreadi', 'nah', "n't", 'think', 'goe', 'usf', 'live', 'around', 'though', 'freemsg', 'hey', 'darl', 'week', 'word', 'back', 'like', 'fun', 'still', 'xxx', 'chg', 'send', '£1.50', 'rcv', 'even', 'brother', 'speak', 'treat', 'aid', 'patent', 'per', 'request', "'mell", 'mell', 'oru', 'minnaminungint', 'nurungu', 'vettam', 'set', 'callertun', 'caller', 'press', 'copi', 'friend', 'winner', 'valu', 'network', 'custom', 'select', 'receivea', '£900', 'prize', 'reward', 'claim', 'call', '09061701461.', 'code', 'kl341', 'valid', 'hour', 'mobil', 'month', 'entitl', 'updat', 'latest', 'colour', 'camera', '08002986030', 'gon', 'home', 'soon', 'w

### Preparing to create a train and test set

In [15]:
## - creating slicing index at 80% threshold
sliceIndex = int((len(messages_set)*.8))

In [16]:
## - shuffle the pack to create a random and unbiased split of the dataset
random.shuffle(messages_set)

In [17]:
train_messages, test_messages = messages_set[:sliceIndex], messages_set[sliceIndex:]

In [18]:
len(train_messages)
len(test_messages)

1115

### Preparing to create feature maps for train and test data

In [19]:
## creating a LazyMap of feature presence for each of the 8K+ features with respect to each of the SMS messages
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [20]:
## - creating the feature map of train and test data

training_set = nltk.classify.apply_features(extract_features, train_messages)
testing_set = nltk.classify.apply_features(extract_features, test_messages)

In [21]:
print(training_set[:5])

[({'contains(jurong)': False, 'contains(point)': False, 'contains(crazi)': False, 'contains(avail)': False, 'contains(bugi)': False, 'contains(great)': False, 'contains(world)': False, 'contains(buffet)': False, 'contains(...)': False, 'contains(cine)': False, 'contains(got)': False, 'contains(amor)': False, 'contains(wat)': False, 'contains(lar)': False, 'contains(joke)': False, 'contains(wif)': False, 'contains(oni)': False, 'contains(free)': False, 'contains(entri)': False, 'contains(wkli)': False, 'contains(comp)': False, 'contains(win)': False, 'contains(cup)': False, 'contains(final)': False, 'contains(tkt)': False, 'contains(21st)': False, 'contains(may)': False, 'contains(2005.)': False, 'contains(text)': False, 'contains(87121)': False, 'contains(receiv)': False, 'contains(question)': False, 'contains(std)': False, 'contains(txt)': False, 'contains(rate)': True, 'contains(appli)': False, 'contains(08452810075over18)': False, 'contains(dun)': False, 'contains(say)': False, 'con

In [22]:
print('Training set size : ', len(training_set))
print('Test set size : ', len(testing_set))

Training set size :  4457
Test set size :  1115


### Training

In [23]:
## Training the classifier with NaiveBayes algorithm
spamClassifier = nltk.NaiveBayesClassifier.train(training_set)

### Evaluation

In [24]:
## - Analyzing the accuracy of the test set
print(nltk.classify.accuracy(spamClassifier, training_set))

0.9937177473636976


In [25]:
## Analyzing the accuracy of the test set
print(nltk.classify.accuracy(spamClassifier, testing_set))

0.9757847533632287


In [26]:
## Testing a example message with our newly trained classifier
m = 'CONGRATULATIONS!! As a valued account holder you have been selected to receive a £900 prize reward! Valid 12 hours only.'
print('Classification result : ', spamClassifier.classify(extract_features(m.split())))

Classification result :  spam


In [27]:
## Priting the most informative features in the classifier
print(spamClassifier.show_most_informative_features(50))

Most Informative Features
         contains(award) = True             spam : ham    =    200.5 : 1.0
          contains(code) = True             spam : ham    =    114.3 : 1.0
         contains(nokia) = True             spam : ham    =    104.8 : 1.0
         contains(await) = True             spam : ham    =     88.4 : 1.0
        contains(servic) = True             spam : ham    =     84.1 : 1.0
       contains(attempt) = True             spam : ham    =     75.5 : 1.0
         contains(expir) = True             spam : ham    =     75.5 : 1.0
           contains(txt) = True             spam : ham    =     71.7 : 1.0
        contains(urgent) = True             spam : ham    =     69.7 : 1.0
       contains(landlin) = True             spam : ham    =     68.6 : 1.0
      contains(deliveri) = True             spam : ham    =     66.8 : 1.0
        contains(privat) = True             spam : ham    =     66.8 : 1.0
         contains(mobil) = True             spam : ham    =     62.3 : 1.0

In [28]:
## storing the classifier on disk for later usage
import pickle
f = open('nb_spam_classifier.pickle', 'wb')
pickle.dump(spamClassifier,f)
print('Classifier stored at ', f.name)
f.close()

Classifier stored at  nb_spam_classifier.pickle
