### Import required libraries

In [20]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

### Read the csv in the pandas dataframe, drop unwanted columns

In [4]:
df = pd.read_csv('spam.csv', encoding="latin-1")

In [5]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis = 1, inplace=True)

### Save the text message labels in a separate variable(y) and encode them

In [8]:
y = df['v1']
y = y.map({'ham': 0, 'spam':1})

### Save the text messages in the dataframe

In [None]:
df = df['v2']

In [17]:
X1 = []
X2 = []

### Initialize the Porter Stemmer and Word Lemmatizer

In [14]:
ps = PorterStemmer()
lem = WordNetLemmatizer()

### Save the stemmed words in X1

In [106]:
for i in range(len(df)):
    words = re.sub('[^a-zA-Z]', ' ', df[i])             # Remove all characters except aplhabets
    words = words.lower()                               # Convert the message to lower case
    words = nltk.word_tokenize(words)           # Separate the message into individual words aka tokenize
    words = [ps.stem(word) for word in words] # Stem each word
    words = ' '.join(words)                                # Join all the stemmed words back into a message
    X1.append(words)                                    # Append the message into the X2 variable   

### Save the lemmatized words in X2

In [107]:
for i in range(len(df)):
    words = re.sub('[^a-zA-Z]', ' ', df[i])             # Remove all characters except aplhabets
    words = words.lower()                               # Convert the message to lower case
    words = nltk.word_tokenize(words)           # Separate the message into individual words aka tokenize
    words = [lem.lemmatize(word) for word in words]  # Lemmatize each word
    words = ' '.join(words)                                # Join all the lemmatized words back into a message
    X2.append(words)                                    # Append the message into the X2 variable

### Import the Count vectorizer(Bag of words) and Tfidf vectorizer and Initialize them

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

cv = CountVectorizer(max_features=3500)
tfidf = TfidfVectorizer()

# Import libraries for training the model

In [116]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [122]:
from sklearn.metrics import confusion_matrix

### Stemming and bag of words

In [None]:
#  Count vectorize the stemmed messages
stem_bow_df = cv.fit_transform(X1).toarray()   

In [139]:
# Split the data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(stem_bow_df, y, test_size=0.25, random_state=82)

In [140]:
# Initialize the Naive Bayes model and fit the data set
nb = MultinomialNB()
nb.fit(X_train, y_train)

In [142]:
# Evaluate the model
print(nb.score(X_test,y_test)
y_pred = nb.predict(X_test)
print(confusion_matrix(y_test, y_pred))

0.9798994974874372

## Lemmatization and bag of words

In [None]:
#  TFIDF vectorize the lemmatized messages
lem_bow_df = cv.fit_transform(X2).toarray()  

In [145]:
# Split the data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(lem_bow_df, y, test_size=0.25, random_state=52)

In [146]:
# Initialize the Naive Bayes model and fit the data set
nb2 = MultinomialNB()
nb2.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [147]:
nb.score(X_test,y_test)
y_pred = nb.predict(X_test)
confusion_matrix(y_test, y_pred)

0.9806173725771715

## Stemming and TFIDF

In [None]:
#  TFIDF vectorize the stemmed messages
stem_tf_df = tfidf.fit_transform(X1).toarray()

In [150]:
# Split the data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(stem_tf_df, y, test_size=0.25, random_state=49)

In [151]:
# Initialize the Naive Bayes model and fit the data set
nb3 = MultinomialNB()
nb3.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [152]:
nb.score(X_test,y_test)
y_pred = nb.predict(X_test)
confusion_matrix(y_test, y_pred)

0.9698492462311558

## Lemmatization and TFIDF

In [None]:
##  TDIDF vectorize the lemmatized messages
lem_tf_df = tfidf.fit_transform(X2).toarray()  

In [155]:
# Split the data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(lem_tf_df, y, test_size=0.25, random_state=12)

In [156]:
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [157]:
nb.score(X_test,y_test)

0.95908111988514

In [158]:
y_pred = nb.predict(X_test)

In [159]:
confusion_matrix(y_test, y_pred)

array([[1201,    0],
       [  57,  135]], dtype=int64)

In [160]:
import pickle

## Stemming and bag of words gives the best model

In [185]:
X_train, X_test, y_train, y_test = train_test_split(stem_bow_df, y, test_size=0.25, random_state=42)

In [186]:
nb.fit(X_train, y_train)
print(nb.score(X_test,y_test))
y_pred = nb.predict(X_test)
print(confusion_matrix(y_test, y_pred))

0.9791816223977028
[[1187   15]
 [  14  177]]


In [163]:
pkl_filename = "naive_bayes_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(nb, file)

In [164]:
pkl_filename = "count_vect.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(cv, file)

In [18]:
d = 'Congratulations you have won'
d = d.lower()
d = [d]
d = [ps.stem(word) for word in d if word not in stopwords.words('english')]


In [19]:
d

['congratulations you have won']

In [195]:
d = cv.transform(d)

In [201]:
d

['congratulations you have won']

In [196]:
nb.predict(d)

array([0], dtype=int64)