In [1]:
import pandas as pd

In [2]:
# Load the tab-separated SMS corpus; the file has no header row, so the column names are supplied here
df = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    names=["label", "messages"],
)
df

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
#importing the libraries
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Text normalizers shared by the preprocessing cells, plus the two (initially empty) output corpora
ps = PorterStemmer()
wordnet = WordNetLemmatizer()
corpus_stem, corpus_lemma = [], []

In [4]:
#preprocessing the data
#for stemmer
# Build the stopword set once; the original rebuilt it for every single token.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell does not append duplicates.
corpus_stem = []
for message in df['messages']:
    review = re.sub('[^a-zA-Z]', ' ', message) #replace every character that is not an ASCII letter with a space
    review = review.lower().split() #lowercase, then tokenize on whitespace
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    corpus_stem.append(' '.join(review))

In [5]:
#preprocessing the data
#for lemmatizer
# Build the stopword set once; the original rebuilt it for every single token.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell does not append duplicates.
corpus_lemma = []
for message in df['messages']:
    review = re.sub('[^a-zA-Z]', ' ', message) #replace every character that is not an ASCII letter with a space
    review = review.lower().split() #lowercase, then tokenize on whitespace
    review = [wordnet.lemmatize(word) for word in review if word not in english_stopwords]
    corpus_lemma.append(' '.join(review))

# Bag of Words

In [6]:
#Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer

# One vectorizer per corpus: refitting a single instance on the second corpus would
# discard the vocabulary learned from the first, leaving no way to transform new
# stemmed messages later.
cv_stem = CountVectorizer(max_features=5000)
cv_lemma = CountVectorizer(max_features=5000)
X_stem = cv_stem.fit_transform(corpus_stem).toarray()
X_lemma = cv_lemma.fit_transform(corpus_lemma).toarray()
cv = cv_lemma  # keep the original name bound to the last-fitted vectorizer

# NOTE(review): both vocabularies are fit on the full corpus before the train/test
# split, so test messages influence feature selection - consider fitting on the
# training split only.

In [7]:
#converting labels to numbers
# Encode explicitly (spam -> 1, ham -> 0) rather than taking the second get_dummies
# column, which relies on column ordering and yields booleans on newer pandas versions.
y = (df['label'] == 'spam').astype('uint8').values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [8]:
#importing libraries for model training
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [9]:
# Split both feature matrices and the labels in one call so all three share the same row partition
(
    X_stem_train, X_stem_test,
    X_lemma_train, X_lemma_test,
    y_train, y_test,
) = train_test_split(X_stem, X_lemma, y, test_size=0.20, random_state=0)

In [11]:
# Fit an independent Multinomial Naive Bayes classifier on each feature set
spam_detector_stem = MultinomialNB()
spam_detector_stem.fit(X_stem_train, y_train)
spam_detector_lemma = MultinomialNB()
spam_detector_lemma.fit(X_lemma_train, y_train)

In [12]:
# Predict labels for the held-out messages with each classifier
y_pred_stem, y_pred_lemma = (
    spam_detector_stem.predict(X_stem_test),
    spam_detector_lemma.predict(X_lemma_test),
)

In [13]:
#confusion matrices of true vs. predicted labels for both Bag of Words models
from sklearn.metrics import confusion_matrix
confusion_m_stem = confusion_matrix(y_true=y_test, y_pred=y_pred_stem)
confusion_m_lemma = confusion_matrix(y_true=y_test, y_pred=y_pred_lemma)

In [14]:
# Bag of Words + stemming: rows are true labels, columns are predicted labels (0 = ham, 1 = spam)
confusion_m_stem

array([[946,   9],
       [  8, 152]], dtype=int64)

In [15]:
# Bag of Words + lemmatization: rows are true labels, columns are predicted labels (0 = ham, 1 = spam)
confusion_m_lemma

array([[944,  11],
       [  9, 151]], dtype=int64)

In [16]:
#fraction of test messages classified correctly by each Bag of Words model
from sklearn.metrics import accuracy_score
score_stem = accuracy_score(y_true=y_test, y_pred=y_pred_stem)
score_lemma = accuracy_score(y_true=y_test, y_pred=y_pred_lemma)

In [17]:
# Test-set accuracy of the Bag of Words model trained on the stemmed corpus
score_stem

0.9847533632286996

In [18]:
# Test-set accuracy of the Bag of Words model trained on the lemmatized corpus
score_lemma

0.9820627802690582

We can see that the stemmer performs slightly better than the lemmatizer, although the two are almost identical. Next, let's try TF-IDF instead of the Bag of Words method.

# TF-IDF

In [19]:
#tf-idf model

#importing libraries
from sklearn.feature_extraction.text import TfidfVectorizer

# One vectorizer per corpus: refitting a single instance on the second corpus would
# discard the vocabulary and IDF weights learned from the first.
tfidf_stem = TfidfVectorizer(max_features=5000)
tfidf_lemma = TfidfVectorizer(max_features=5000)
X_stem = tfidf_stem.fit_transform(corpus_stem).toarray()
X_lemma = tfidf_lemma.fit_transform(corpus_lemma).toarray()
cv = tfidf_lemma  # keep the original name bound to the last-fitted vectorizer

# NOTE(review): the vocabularies and IDF weights are fit on the full corpus before the
# train/test split, so test messages influence the features - consider fitting on the
# training split only.

In [20]:
#converting labels to numbers
# Encode explicitly (spam -> 1, ham -> 0) rather than taking the second get_dummies
# column, which relies on column ordering and yields booleans on newer pandas versions.
y = (df['label'] == 'spam').astype('uint8').values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [21]:
# import libraries for model training
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [22]:
# Split both TF-IDF matrices and the labels in one call so all three share the same row partition
(
    X_stem_train, X_stem_test,
    X_lemma_train, X_lemma_test,
    y_train, y_test,
) = train_test_split(X_stem, X_lemma, y, test_size=0.20, random_state=0)

In [23]:
# Fit an independent Multinomial Naive Bayes classifier on each TF-IDF feature set
spam_detector_stem = MultinomialNB()
spam_detector_stem.fit(X_stem_train, y_train)
spam_detector_lemma = MultinomialNB()
spam_detector_lemma.fit(X_lemma_train, y_train)

In [24]:
# Predict labels for the held-out messages with each TF-IDF classifier
y_pred_stem, y_pred_lemma = (
    spam_detector_stem.predict(X_stem_test),
    spam_detector_lemma.predict(X_lemma_test),
)

In [25]:
#confusion matrices of true vs. predicted labels for both TF-IDF models
from sklearn.metrics import confusion_matrix
confusion_m_stem = confusion_matrix(y_true=y_test, y_pred=y_pred_stem)
confusion_m_lemma = confusion_matrix(y_true=y_test, y_pred=y_pred_lemma)

In [26]:
# TF-IDF + stemming: rows are true labels, columns are predicted labels (0 = ham, 1 = spam)
confusion_m_stem

array([[955,   0],
       [ 29, 131]], dtype=int64)

In [27]:
# TF-IDF + lemmatization: rows are true labels, columns are predicted labels (0 = ham, 1 = spam)
confusion_m_lemma

array([[955,   0],
       [ 26, 134]], dtype=int64)

In [28]:
#fraction of test messages classified correctly by each TF-IDF model
from sklearn.metrics import accuracy_score
score_stem = accuracy_score(y_true=y_test, y_pred=y_pred_stem)
score_lemma = accuracy_score(y_true=y_test, y_pred=y_pred_lemma)

In [29]:
# Test-set accuracy of the TF-IDF model trained on the stemmed corpus
score_stem

0.9739910313901345

In [30]:
# Test-set accuracy of the TF-IDF model trained on the lemmatized corpus
score_lemma

0.9766816143497757

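With the TF-IDF vectorizer, accuracy is lower for both the stemmed and the lemmatized corpora, so Bag of Words outperforms TF-IDF in both cases. Note, however, that the TF-IDF models produced no false positives (no ham message was flagged as spam), at the cost of missing more spam messages.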