### Import required libraries

In [1]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

### Read the csv in the pandas dataframe, drop unwanted columns

In [2]:
# Load the raw SMS dataset from the working directory.
# latin-1 is requested explicitly instead of pandas' default UTF-8 decoding.
df = pd.read_csv(
    'spam.csv',
    encoding="latin-1",
)

In [3]:
# Preview the first five rows to inspect the column layout.
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Discard the three empty trailing columns, keeping only the label (v1) and
# message (v2) columns.
# The result is rebound instead of mutated in place, and errors='ignore' makes
# the cell safe to re-run once the columns are already gone (the in-place
# version raised KeyError on a second execution).
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

### Save the text message labels in a separate variable(y) and encode them

In [5]:
# Encode the labels as integers: ham -> 0, spam -> 1.
# Any label outside this mapping would become NaN.
label_codes = {'ham': 0, 'spam': 1}
y = df['v1'].map(label_codes)

### Keep only the text messages (column `v2`) as the working series

In [6]:
# From here on `df` is a pandas Series holding only the message text (column v2),
# no longer a DataFrame.
df = df.loc[:, 'v2']

In [7]:
# Containers for the preprocessed corpora: X1 (stemmed) and X2 (lemmatized).
X1, X2 = [], []

### Initialize the Porter Stemmer and Word Lemmatizer

In [8]:
# One shared instance of each normalizer, reused for every message.
# NOTE(review): the notebook never calls nltk.download; the tokenizer and
# WordNet data are presumably already installed locally - confirm.
ps, lem = PorterStemmer(), WordNetLemmatizer()

### Save the stemmed words in X1

In [9]:
# Build the stemmed corpus: one cleaned, space-joined string per message.
# X1 is rebuilt from scratch so that re-running this cell cannot append
# duplicate rows and leave X1 longer than the label vector y.
X1 = []
for message in df:                                      # iterate by position rather than by index label
    letters_only = re.sub('[^a-zA-Z]', ' ', message)    # Replace every non-alphabetic character with a space
    tokens = nltk.word_tokenize(letters_only.lower())   # Lower-case, then split the message into word tokens
    stems = [ps.stem(token) for token in tokens]        # Stem each token
    X1.append(' '.join(stems))                          # Re-join the stems and append the message to X1

### Save the lemmatized words in X2

In [10]:
# Build the lemmatized corpus: one cleaned, space-joined string per message.
# X2 is rebuilt from scratch so that re-running this cell cannot append
# duplicate rows and leave X2 longer than the label vector y.
X2 = []
for message in df:                                      # iterate by position rather than by index label
    letters_only = re.sub('[^a-zA-Z]', ' ', message)    # Replace every non-alphabetic character with a space
    tokens = nltk.word_tokenize(letters_only.lower())   # Lower-case, then split the message into word tokens
    lemmas = [lem.lemmatize(token) for token in tokens] # Lemmatize each token
    X2.append(' '.join(lemmas))                         # Re-join the lemmas and append the message to X2

### Import the Count vectorizer(Bag of words) and Tfidf vectorizer and Initialize them

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Bag-of-words vectorizer with its vocabulary capped at 3500 features, and a
# TF-IDF vectorizer left at its default settings.
cv, tfidf = CountVectorizer(max_features=3500), TfidfVectorizer()

### Import libraries for training the model

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [13]:
from sklearn.metrics import confusion_matrix

## Stemming and bag of words

In [14]:
# Fit the count vectorizer on the stemmed messages and materialize the
# resulting document-term matrix as a dense array.
stem_bow_df = (
    cv
    .fit_transform(X1)
    .toarray()
)

In [15]:
# Hold out 30% of the stemmed bag-of-words rows for validation (fixed seed).
# NOTE(review): the TF-IDF experiments in this notebook split with
# test_size=0.25 / random_state=42, so their scores are measured on a
# different validation set than this one.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    stem_bow_df,
    y,
    test_size=0.3,
    random_state=32,
)

In [16]:
# Train a multinomial Naive Bayes classifier on the stemmed bag-of-words
# training split; fit() returns the estimator itself.
nb1 = MultinomialNB().fit(X_train1, y_train1)
nb1  # show the fitted estimator as the cell output

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
# Score nb1 on the validation split and show where its predictions
# disagree with the true labels.
y_pred1 = nb1.predict(X_test1)
print('Score: \t', nb1.score(X_test1, y_test1))
print('Confustion Matrix \n', confusion_matrix(y_test1, y_pred1))

Score: 	 0.9844497607655502
Confustion Matrix 
 [[1437   14]
 [  12  209]]


## Lemmatization and bag of words

In [18]:
# Count-vectorize (bag of words) the lemmatized messages.
# A dedicated vectorizer is used here on purpose: `cv` was fitted on the
# stemmed corpus (X1) that nb1 was trained on, and `cv` is later pickled and
# used for prediction together with nb1. Refitting `cv` on X2 would replace
# its vocabulary, so its feature columns would no longer line up with nb1's.
cv_lem = CountVectorizer(max_features=3500)
lem_bow_df = cv_lem.fit_transform(X2).toarray()

In [19]:
# Hold out 30% of the lemmatized bag-of-words rows for validation (fixed seed).
# NOTE(review): the TF-IDF experiments in this notebook split with
# test_size=0.25 / random_state=42, so their scores are measured on a
# different validation set than this one.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    lem_bow_df,
    y,
    test_size=0.3,
    random_state=32,
)

In [20]:
# Train a multinomial Naive Bayes classifier on the lemmatized bag-of-words
# training split; fit() returns the estimator itself.
nb2 = MultinomialNB().fit(X_train2, y_train2)
nb2  # show the fitted estimator as the cell output

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [21]:
# Score nb2 on the validation split and show where its predictions
# disagree with the true labels.
y_pred2 = nb2.predict(X_test2)
print('Score: \t', nb2.score(X_test2, y_test2))
print('Confustion Matrix \n', confusion_matrix(y_test2, y_pred2))

Score: 	 0.9838516746411483
Confustion Matrix 
 [[1434   17]
 [  10  211]]


## Stemming and TFIDF

In [22]:
# Fit the TF-IDF vectorizer on the stemmed messages and materialize the
# weighted document-term matrix as a dense array.
stem_tf_df = (
    tfidf
    .fit_transform(X1)
    .toarray()
)

In [23]:
# Hold out 25% of the stemmed TF-IDF rows for validation (fixed seed).
# NOTE(review): the bag-of-words experiments in this notebook split with
# test_size=0.3 / random_state=32, so their scores are measured on a
# different validation set than this one.
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    stem_tf_df,
    y,
    test_size=0.25,
    random_state=42,
)

In [24]:
# Train a multinomial Naive Bayes classifier on the stemmed TF-IDF
# training split; fit() returns the estimator itself.
nb3 = MultinomialNB().fit(X_train3, y_train3)
nb3  # show the fitted estimator as the cell output

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
# Score nb3 on the validation split and show where its predictions
# disagree with the true labels.
y_pred3 = nb3.predict(X_test3)
print('Score: \t', nb3.score(X_test3, y_test3))
print('Confustion Matrix \n', confusion_matrix(y_test3, y_pred3))

Score: 	 0.9519023689877961
Confustion Matrix 
 [[1200    2]
 [  65  126]]


## Lemmatization and TFIDF

In [26]:
# TF-IDF vectorize the lemmatized messages and materialize the weighted
# document-term matrix as a dense array.
# Note: this refits the shared `tfidf` object, which the previous section
# fitted on the stemmed corpus (X1).
lem_tf_df = (
    tfidf
    .fit_transform(X2)
    .toarray()
)

In [27]:
# Hold out 25% of the lemmatized TF-IDF rows for validation (fixed seed).
# NOTE(review): the bag-of-words experiments in this notebook split with
# test_size=0.3 / random_state=32, so their scores are measured on a
# different validation set than this one.
X_train4, X_test4, y_train4, y_test4 = train_test_split(
    lem_tf_df,
    y,
    test_size=0.25,
    random_state=42,
)

In [28]:
# Train a multinomial Naive Bayes classifier on the lemmatized TF-IDF
# training split; fit() returns the estimator itself.
nb4 = MultinomialNB().fit(X_train4, y_train4)
nb4  # show the fitted estimator as the cell output

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
# Score nb4 on the validation split and show where its predictions
# disagree with the true labels.
y_pred4 = nb4.predict(X_test4)
print('Score: \t', nb4.score(X_test4, y_test4))
print('Confustion Matrix \n', confusion_matrix(y_test4, y_pred4))

Score: 	 0.9526202440775305
Confustion Matrix 
 [[1200    2]
 [  64  127]]


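## Stemming and bag of words gives the best model

Note: the two bag-of-words runs were evaluated on a 70/30 split (`random_state=32`), while the two TF-IDF runs used a 75/25 split (`random_state=42`), so the scores above were measured on different validation sets and are not strictly comparable.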

In [30]:
#X_train, X_test, y_train, y_test = train_test_split(stem_bow_df, y, test_size=0.25, random_state=42)

In [31]:
# nb.fit(X_train, y_train)
# print(nb.score(X_test,y_test))
# y_pred = nb.predict(X_test)
# print(confusion_matrix(y_test, y_pred))

### Save the best model in the pickle file

In [36]:
import pickle

# Persist the chosen classifier (nb1) together with the count vectorizer so
# the two can be loaded and used for prediction later.
# NOTE(review): nb1 was trained on features produced by `cv` fitted on the
# stemmed corpus X1; the pickled vectorizer must carry that same vocabulary -
# verify `cv` has not been refit on another corpus before this cell runs.
model_filename = "naive_bayes_model.pkl"
vectorizer_filename = "count_vect.pkl"

for target_path, artifact in ((model_filename, nb1), (vectorizer_filename, cv)):
    with open(target_path, 'wb') as file:
        pickle.dump(artifact, file)

In [33]:
# Preprocess a sample message exactly like the stemmed training corpus (X1):
# strip non-letters, lower-case, tokenize, stem every token, re-join.
# Previously the whole sentence was wrapped in a list and passed to ps.stem as
# a single "word", so individual words were never tokenized or stemmed and
# could not match the vocabulary learned from the stemmed messages.
d = 'congratulations you have won 1 million'
d = re.sub('[^a-zA-Z]', ' ', d).lower()
d = [' '.join(ps.stem(word) for word in nltk.word_tokenize(d))]  # one-document list, ready for cv.transform

In [34]:
# Convert the preprocessed sample into a count-feature row using the fitted
# vectorizer (transform only - the vocabulary is not refit here).
d = cv.transform(raw_documents=d)

In [35]:
# Classify the sample with nb1; under the label encoding used for y, 1 means spam and 0 means ham.
nb1.predict(X=d)

array([1], dtype=int64)