In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB 
import os

# Data

In [10]:
# Parse the tab-separated corpus: every line has the form "<label>\t<message>".
messages = []
categories = []
# The context manager guarantees the file handle is closed, even if a
# malformed line makes the unpacking below raise.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm the file's encoding if this notebook must run on several systems.
with open("SMSSpamCollection.csv") as corpus:
    for line in corpus:
        # Ignore empty lines (e.g. a stray blank line at the end of the file),
        # which cannot be unpacked into a label and a message.
        if not line.rstrip('\n'):
            continue
        # Split on the first tab only, so a tab inside the message body does
        # not break the two-way unpacking. The message keeps its trailing
        # newline, exactly as read from the file.
        category, message = line.split('\t', 1)
        messages.append(message)
        categories.append(category)

# One row per message: the class label and the raw message text.
df = pd.DataFrame({'Label':categories, 'Message':messages})
df.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...\n
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Show the first ten rows whose Label column equals "ham".
df.loc[df["Label"].eq("ham")].head(10)

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...\n
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
10,ham,I'm gonna be home soon and i don't want to tal...
13,ham,I've been searching for the right words to tha...
14,ham,I HAVE A DATE ON SUNDAY WITH WILL!!\n
16,ham,Oh k...i'm watching here:)\n


# Splitting the Data into Train and Test Sets

In [13]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
train_set, test_set, train_label, test_label = train_test_split(df, df['Label'], test_size = 0.25, random_state = 42)
print("\nThe Trainset consists of {} records and {} columns ".format(*train_set.shape))
# Report the test frame's own shape rather than borrowing the column count
# from train_set.
print("\nThe Testset consists of {} records and {} columns ".format(*test_set.shape))


The Trainset consists of 4180 records and 2 columns 

The Testset consists of 1394 records and 2 columns 


# Vectorizing the Text Data

In [15]:
# Fit the vectorizer on the training messages only; the test messages are
# never passed to fit().
countvect = CountVectorizer()
# fit() returns the fitted vectorizer, so x_counts refers to the same object
# as countvect.
x_counts = countvect.fit(train_set["Message"])

# Encode both partitions with the fitted vectorizer.
x_train_df = countvect.transform(train_set["Message"])
x_test_df = countvect.transform(test_set["Message"])

# Training

# Naive Bayes classifier

In [17]:
# Fit a multinomial Naive Bayes model on the vectorized training messages.
clf = MultinomialNB()
# Left as the cell's last expression so the fitted estimator is displayed.
clf.fit(x_train_df, train_set["Label"])

MultinomialNB()

# Testing

In [18]:
# Predict a label for every message in the held-out test set.
predicted_values_NB = clf.predict(x_test_df)

# Accuracy score scaled to a percentage and rounded to two decimals.
accuracy = round(accuracy_score(test_set["Label"], predicted_values_NB) * 100, 2)
print("The accuracy of Naive Bayes clasifier is {}%".format(accuracy))


The accuracy of Naive Bayes clasifier is 98.78%


In [19]:
def classify_spam(text):
  """Return the label the trained classifier predicts for a single message.

  The text is wrapped in a one-element list before being passed to the
  module-level vectorizer ``countvect``; the module-level classifier ``clf``
  then predicts on the result and the first (only) prediction is returned.
  """
  features = countvect.transform([text])
  predictions = clf.predict(features)
  return predictions[0]

In [20]:
# Try the helper on a hand-written message and print the predicted label.
sample_text = "Free entry for a workshop at Chennai. Order now !!"
print(classify_spam(sample_text))

spam
