In [28]:
!pip install nltk



In [129]:
import numpy as np # used to create numpy arrays
import pandas as pd # used to create data frames
import matplotlib.pyplot as plt
import nltk
import string

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/jasleen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jasleen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Data Collection and Cleaning

In [131]:
sms_df = pd.read_csv('spam.csv',encoding='latin1')

# remove and rename columns
sms_df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)
sms_df.rename(columns={'v1':'category', 'v2':'message'},inplace=True)

encoder = LabelEncoder()
sms_df['category'] = encoder.fit_transform(sms_df['category'])

# check for / remove missing or duplicate values
sms_df.isnull().sum()
sms_df = sms_df.drop_duplicates(keep='first')

x = sms_df['message']
y = sms_df['category']

EDA

In [132]:
sms_df['num_chars'] = sms_df['message'].apply(len)
sms_df['num_words'] = sms_df['message'].apply(lambda x:len(nltk.word_tokenize(x)))
sms_df['num_sentences'] = sms_df['message'].apply(lambda x:len(nltk.sent_tokenize(x)))

Data Pre-Processing

In [133]:
def transform_text(text):
    ps = PorterStemmer()
    text = text.lower()
    text = nltk.word_tokenize(text)
    
    y = []
    for word in text:
        if word.isalnum():
            y.append(word)
    
    text = y[:]
    y.clear()
    
    for word in text:
        if word not in stopwords.words('english') and word not in string.punctuation:
            y.append(word)
            
    text = y[:]
    y.clear()
    
    for word in text:
        y.append(ps.stem(word))
    
    return " ".join(y)

In [135]:
sms_df['transformed_text'] = sms_df['message'].apply(transform_text)

Model Building

In [149]:
cv = CountVectorizer()
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(sms_df['message']).toarray()
y = sms_df['category'].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

[0 0 1 ... 0 0 0]


In [150]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [151]:
gnb.fit(x_train,y_train)
y_pred1 = gnb.predict(x_test)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1))

0.9110251450676983
[[823  73]
 [ 19 119]]
0.6197916666666666


In [152]:
mnb.fit(x_train,y_train)
y_pred2 = mnb.predict(x_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))

0.9506769825918762
[[896   0]
 [ 51  87]]
1.0


In [153]:
bnb.fit(x_train,y_train)
y_pred3 = bnb.predict(x_test)
print(accuracy_score(y_test,y_pred3))
print(confusion_matrix(y_test,y_pred3))
print(precision_score(y_test,y_pred3))

0.9777562862669246
[[893   3]
 [ 20 118]]
0.9752066115702479


In [154]:
# tfidf -> mnb

In [155]:
import pickle
pickle.dump(tfidf,open('vectorizer.pkl','wb'))
pickle.dump(mnb,open('model.pkl','wb'))