In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Location of the spam/ham CSV; the file is read with an explicit ISO-8859-1 encoding.
SPAM_CSV_URL = "https://raw.githubusercontent.com/codeforcauseorg/ML-Bootcamp-July/master/datasets/bayes/spam.csv"
df = pd.read_csv(SPAM_CSV_URL, encoding="ISO-8859-1")

In [3]:
# NOTE(review): this encoder is never used in the rest of the visible notebook; the
# string labels are passed to the classifier as-is. Confirm before removing.
le = LabelEncoder()

In [5]:
# Convert the frame to a plain NumPy array (one row per record) for positional slicing.
data = df.to_numpy()

In [6]:
# Peek at the first record to see the column layout.
data[0]

array(['ham',
       'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       nan, nan, nan], dtype=object)

In [7]:
# Number of records (rows) in the array.
data.shape[0]

5572

In [8]:
# Column 0 holds the class label and column 1 the message text.
y, X = data[:, 0], data[:, 1]

In [9]:
# Sanity check: texts and labels should have the same length.
X.shape, y.shape

((5572,), (5572,))

In [11]:
from nltk.tokenize import RegexpTokenizer 

In [12]:
# Raw string so the regex escape \w reaches the regex engine untouched; a plain '\w'
# is an invalid string escape and raises a DeprecationWarning/SyntaxWarning on newer Pythons.
tokenizer = RegexpTokenizer(r'\w+')   # extracts runs of word characters as tokens

In [13]:
from nltk.corpus import stopwords
# English stopwords held in a set, so the `in` checks during token filtering are hash lookups.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally
# (e.g. via nltk.download('stopwords')) — confirm for fresh environments.
sw = set(stopwords.words('english'))

In [14]:
from nltk.stem import PorterStemmer
# Single stemmer instance shared by getStem for every token.
ps = PorterStemmer()

In [22]:
def getStem(review):
    """Normalise one message into a space-separated string of stemmed tokens.

    The text is lower-cased, split into word tokens with the module-level
    ``tokenizer``, filtered against the stopword set ``sw``, and every
    remaining token is stemmed with ``ps``.

    Parameters
    ----------
    review : str
        Raw message text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.
    """
    words = tokenizer.tokenize(review.lower())
    # Stopwords are dropped before stemming, so the membership test runs on the
    # lower-cased, unstemmed token.
    return ' '.join(ps.stem(word) for word in words if word not in sw)

In [23]:
def getDoc(document):
    """Clean every text in ``document`` with ``getStem``.

    Parameters
    ----------
    document : iterable of str
        Raw texts to clean.

    Returns
    -------
    list of str
        One cleaned string per input text, in input order.
    """
    return [getStem(text) for text in document]

In [24]:
# Clean the whole corpus once; the result is what the vectorizer is fitted on.
stemmed_doc = getDoc(X)

In [25]:
# Count vectorizer with default settings; the same fitted instance is reused later
# to transform new messages.
cv = CountVectorizer()

In [26]:
# Learn the vocabulary from the cleaned corpus and build the per-document token counts.
# NOTE(review): the vocabulary is fitted on the full corpus before the train/test split,
# so words that only occur in the test split still shape the feature space — consider
# fitting on the training split only.
vc = cv.fit_transform(stemmed_doc)  

In [27]:
X = vc.toarray()

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [30]:
#NB from sklearn
from sklearn.naive_bayes import MultinomialNB

In [31]:
# Multinomial Naive Bayes classifier with default hyperparameters.
model = MultinomialNB()

In [32]:
# Train on the training split only; the test split is held back for scoring.
model.fit(X_train, y_train)

MultinomialNB()

In [33]:
# Mean accuracy on the held-out test split.
model.score(X_test, y_test)

0.977705274605764

In [34]:
# Three sample e-mail bodies (raw, multi-line strings) used as unseen input for the
# trained classifier.

messages = ["""
            Hi Lavisha,

So, here's the deal:
I'm writing a brand new book on Optical Character Recognition, Tesseract, and OpenCV.
I'll be launching an IndieGoGo campaign in two weeks, to offer pre-orders of the book at a significantly discounted price.
Over the coming weeks, I'll be sharing more details on the book.
But in the meantime, I wanted to give you a high-level overview, and address some common questions I know I will receive.
""",
           """
           Lavisha, Bachelor of Technology (B.Tech) students have a chance to earn while studying in NIT Hamirpur by working with the world's largest student program - Internshala Student Partner (ISP). Develop your marketing and communication skills while you earn huge rewards!

Rewards - We are giving away financial rewards worth INR 25 lacs+ to our student partners. Apart from this, you can also win super reward iPhone 11 and many more

Eligibility - Any student from any degree, stream, and year of study can apply

Deadline - 19th August 2020. Click here or on the button below to apply
           """,
           """Hello CodeCheffer,

We hope you are hale, hearty, and coding. In case you forgot to mark your calendars, we just wanted to remind you that the August Long Challenge is almost here.

Our setters have gone great lengths to come up with innovative problems, and this contest promises a vibrant start to the month.

Contest details for the fanatics:
Start date: 7th August 15:00 hrs IST(check your timezone here)
Contest duration: 10 mind-boggling days.
Compete Now

Keep a close watch on our Twitter page for teasers to the Long Challenge problems. Don’t miss out on the fun!

More exciting News

We are launching a brand-new YouTube channel called "Learn Competitive Programming with CodeChef." The channel will be filled with content from educators across the globe, explaining how to solve popular CodeChef problems. So go and hit the subscribe button, because this is a ride you definitely want to be a part of.
           """]

In [35]:
def prepare(messages):
    """Clean raw messages and encode them with the already-fitted vectorizer.

    Parameters
    ----------
    messages : iterable of str
        Raw message texts.

    Returns
    -------
    The result of ``cv.transform`` on the cleaned texts, one row per message.
    """
    cleaned = getDoc(messages)
    # Only transform() here: fit_transform() would build a new vocabulary, and the
    # columns would no longer line up with the features the model was trained on.
    return cv.transform(cleaned)

In [36]:
messages = prepare(messages)

In [37]:
y_pred = model.predict(messages)

In [38]:
# Predicted labels for the sample messages, in input order.
y_pred

array(['ham', 'spam', 'ham'], dtype='<U4')