# Loading the dataset

In [4]:
#loading the dataset
import pandas as pd

# The file is tab-separated with no header row, so the column names are supplied here.
messages = pd.read_csv(
    'SMSSpamCollection/SmsCollection',
    sep='\t',
    names=['label', 'message'],
)
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Data cleaning and preprocessing

In [5]:
# Data cleaning and preprocessing
import re
import nltk
# Fetch the NLTK stop-word corpus used when filtering tokens below; NLTK reports
# the package as already up-to-date when a current local copy exists.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/aditya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# One shared Porter stemmer, reused for every token during cleaning.
stemmer = PorterStemmer()
# Collects the cleaned text of each message, in dataset order.
corpus = list()

In [11]:
# Build the cleaned corpus: one normalised, space-joined string per SMS message.

# Load the stop-word list once and keep it as a set. Calling
# stopwords.words('english') inside the comprehension rebuilds the list and
# scans it linearly for every single token.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell does not append the
# same messages to `corpus` a second time.
corpus = []
for text in messages['message']:
    # replace every character other than an ASCII letter with a space
    review = re.sub('[^a-zA-Z]', ' ', text)

    review = review.lower()
    review = review.split()

    # drop the stop words, then reduce each remaining word to its stem
    review = [stemmer.stem(word) for word in review if word not in english_stopwords]

    review = ' '.join(review)
    corpus.append(review)

In [12]:
# Creating the Bag of words
from sklearn.feature_extraction.text import CountVectorizer

# Limit the vocabulary to the 2500 most frequent words in the corpus; the rest are dropped.
cv = CountVectorizer(max_features=2500)

# Each stored entry of X maps (message row, vocabulary index of a particular word)
# to the number of times that word occurs in the message.
X = cv.fit_transform(raw_documents=corpus)

In [20]:
# Encode the label as a 1-D vector of 0/1 integers (1 = spam, 0 = ham).
# get_dummies(..., drop_first=True) on its own returns a one-column DataFrame,
# which scikit-learn treats as a column-vector y and warns about when fitting.
# Selecting the remaining 'spam' column gives a Series with the same values;
# astype(int) keeps the labels numeric even when get_dummies returns booleans.
y = pd.get_dummies(messages['label'], drop_first=True)['spam'].astype(int)

# Train Test Split

In [22]:
# Train Test Split

from sklearn.model_selection import train_test_split
# Hold out 20% of the messages for evaluation; random_state fixes the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Naive Bayes classification

In [30]:
# Multinomial Naive Bayes is a variant of Naive Bayes designed for text documents:
# it explicitly models the word counts and adjusts the underlying calculations to deal with them.
from sklearn.naive_bayes import MultinomialNB
# Fit the classifier on the training split...
spam_detect = MultinomialNB()
spam_detect.fit(X_train, y_train)

# ...then predict a label for every held-out message.
y_ = spam_detect.predict(X_test)

  y = column_or_1d(y, warn=True)


# Testing and Evaluation

In [45]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Rows are the true classes and columns the predicted classes.
Confusion_matrix = confusion_matrix(y_true=y_test, y_pred=y_)
print(
    "Confusion Matrix:\n {0}".format(Confusion_matrix)
)

Confusion Matrix:
 [[946   9]
 [  7 153]]


In [46]:
# Fraction of held-out messages whose predicted label equals the true label.
print(
    "Accuracy: {0}".format(
        accuracy_score(y_true=y_test, y_pred=y_)
    )
)

Accuracy: 0.9856502242152466


In [48]:
# Per-class precision, recall and F1 on the held-out messages.
print(
    "Classification Report:\n {0}".format(
        classification_report(y_true=y_test, y_pred=y_)
    )
)

Classification Report:
              precision    recall  f1-score   support

          0       0.99      0.99      0.99       955
          1       0.94      0.96      0.95       160

avg / total       0.99      0.99      0.99      1115

