# Spam-Antispam classification using Natural Language Processing (nltk)
#  

In [1]:
# Import relevant Libraries

import pandas as pd
import numpy as np
import random
import nltk
import string
from nltk.stem.porter import PorterStemmer
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hemang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hemang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
df = pd.read_csv('spam.csv',encoding = 'ANSI') # Read Dataset and store as a DataFrame using Pandas
df = df[['v1','v2']] # Keep only necessary columns
df['v1']=df['v1'].map({'ham':0,'spam':1}) # Map ham as '0' and Spam as '1'
df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.shape

(5572, 2)

In [4]:
df.rename(columns = {'v1': 'spam', 'v2': 'text'}, inplace = True) # Re-naming columns

In [5]:
df['text'] = df['text'].str.lower() # converting text to lowercase
df.head()

Unnamed: 0,spam,text
0,0,"go until jurong point, crazy.. available only ..."
1,0,ok lar... joking wif u oni...
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor... u c already then say...
4,0,"nah i don't think he goes to usf, he lives aro..."


In [6]:
# Downloading stopwords package from nltk module
# Creating a list of stop words and punctuations to omit from the 2 lists created above using 
# bag_of_words_features_filtered() function defined in next cell

nltk.corpus.stopwords.words("english")  
useless_words = nltk.corpus.stopwords.words("english") + list(string.punctuation)

In [7]:
def bag_of_words_features_filtered(words):  # Function which filters words that have some significance 
    my_dict = dict( [ (word, True) for word in words if not word in useless_words] )
    return my_dict

In [8]:
# Filtering out the 'text' column from the Dataframe df having column spam = '1' 

dfspam = df[df['spam']==1]['text']

In [9]:
# Filtering out the 'text' column from the Dataframe df having column spam = '1' 

dfham = df[df['spam']==0]['text']

In [10]:
# Creating a list of spam emails after data cleaning, word segregation performed on each email one-by-one

spam_list=[]
stemmer = PorterStemmer()
for i in range(dfspam.shape[0]):
    word = word_tokenize(dfspam.iloc[i])  #Tokenizing
    word1 = [word for word in word if len(word)>2] #  Removing single char words
    words = [stemmer.stem(x) for x in word1]  #Stemming
    spam_list.append((bag_of_words_features_filtered(words), "spam"))
spam_list[3:5]

[({'mobil': True,
   'month': True,
   'entitl': True,
   'updat': True,
   'latest': True,
   'colour': True,
   'camera': True,
   'free': True,
   'call': True,
   '08002986030': True},
  'spam'),
 ({'six': True,
   'chanc': True,
   'win': True,
   'cash': True,
   '100': True,
   '20,000': True,
   'pound': True,
   'txt': True,
   'csh11': True,
   'send': True,
   '87575.': True,
   'cost': True,
   '150p/day': True,
   '6day': True,
   '16+': True,
   'tsandc': True,
   'appli': True,
   'repli': True,
   'info': True},
  'spam')]

In [11]:
# Creating a list of Anti-spam(ham) emails after data cleaning, word segregation performed on each email one-by-one

ham_list=[]
for j in range(dfham.shape[0]):
    word = word_tokenize(dfham.iloc[j])  #Tokenizing
    word1 = [word for word in word if len(word)>1] #  Removing single char words
    words = [stemmer.stem(x) for x in word1]  #Stemming
    ham_list.append((bag_of_words_features_filtered(words), "ham"))
ham_list[3:5]

[({'nah': True,
   "n't": True,
   'think': True,
   'goe': True,
   'usf': True,
   'live': True,
   'around': True,
   'though': True},
  'ham'),
 ({'even': True,
   'brother': True,
   'like': True,
   'speak': True,
   'treat': True,
   'aid': True,
   'patent': True},
  'ham')]

In [12]:
#80/20 split for training data/test data

split1 = 3860
split2 = 597

# Fit training split to model
sentiment_classifier = NaiveBayesClassifier.train(ham_list[:split1]+spam_list[:split2]) # Naive Bayes Classifier

In [13]:
# Find the accuracy, using the training data

accuracy_traindata = nltk.classify.util.accuracy(sentiment_classifier,ham_list[:split1]+spam_list[:split2])*100
print(accuracy_traindata)

94.68252187570114


In [14]:
# Find the accuracy, using the test data
accuracy_testdata = nltk.classify.util.accuracy(sentiment_classifier,ham_list[split1:]+spam_list[split2:])*100
print(accuracy_testdata)

94.34977578475336
