In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier

In [4]:
# Location of the SMS spam dataset (a CSV with `Category` and `Message` columns).
# Set the SPAM_CSV_PATH environment variable to point at your own copy of the
# file; the original hardcoded location is kept as the fallback so existing
# setups keep working unchanged.
SPAM_CSV_PATH = os.environ.get("SPAM_CSV_PATH", "D:/py/SPAM.csv")
spam_data = pd.read_csv(SPAM_CSV_PATH)

In [5]:
# Preview the first five rows of the dataset.
# (Ending the cell with a bare `spam_data.head()` renders the same table.)
display(spam_data.head(n=5))

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Summary statistics per column: count, number of unique values, most
# frequent value and how often it occurs.
display(spam_data.describe())

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [7]:
# Class balance: number of messages per category. A single print call with a
# newline separator emits the heading, a blank line, then the counts.
print('How many of each label do we have:\n', spam_data['Category'].value_counts(), sep='\n')

How many of each label do we have:

ham     4825
spam     747
Name: Category, dtype: int64


In [8]:
# The model needs numeric targets, so encode the text categories as integers.
# The fitted encoder is kept (instead of being thrown away) so the mapping can
# be inspected via `label_encoder.classes_` and predictions can be turned back
# into category names with `label_encoder.inverse_transform(...)`.
label_encoder = LabelEncoder()

# spam_data['labels'] : new column holding the integer code for each category
# (in the printed preview, ham rows get 0 and the spam row gets 1)
spam_data['labels'] = label_encoder.fit_transform(spam_data['Category'])
print(spam_data.head())

  Category                                            Message  labels
0      ham  Go until jurong point, crazy.. Available only ...       0
1      ham                      Ok lar... Joking wif u oni...       0
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...       1
3      ham  U dun say so early hor... U c already then say...       0
4      ham  Nah I don't think he goes to usf, he lives aro...       0


In [21]:
# Split the data into training and test sets (10% of the messages held out).
# NOTE: after re-running this cell, re-run the model-fitting cell as well so
# the classifiers and predictions correspond to this split.
X = spam_data['Message']
y = spam_data['labels']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    # The classes are heavily imbalanced (4825 ham vs 747 spam). Stratifying on
    # the labels keeps the same ham/spam ratio in both the training and test
    # sets; with stratify=None the small test set can drift from that ratio.
    stratify=y,
    # Fixed seed so the split (and every downstream result) is reproducible
    # on "Restart Kernel -> Run All".
    random_state=42,
)

In [19]:
# Initialize the vectorizer and build the bag-of-words matrix ONCE.
# The vocabulary is learned from the training messages only; the test
# messages are later mapped onto that same vocabulary with `transform`.
vectorizer = CountVectorizer()
X_train_vector = vectorizer.fit_transform(X_train)

# Initialize the Naive Bayes model and the Dummy baseline classifier.
# Both are trained on the same training matrix, so the vectorizer is not
# refitted a second time just to feed the baseline.
NB = MultinomialNB()
NB.fit(X_train_vector, y_train)

# The 'stratified' strategy draws random predictions following the training
# class distribution; seed it so the baseline is reproducible between runs.
Dummy = DummyClassifier(strategy='stratified', random_state=42)
Dummy.fit(X_train_vector, y_train);

# Vectorize the held-out messages with the already-fitted vocabulary and
# predict their labels with the Naive Bayes model.
X_test_vector = vectorizer.transform(X_test)
y_pred = NB.predict(X_test_vector)

In [20]:
# Show the Naive Bayes predictions for the test messages
# (integer label codes produced by NB.predict on the vectorized test set).
y_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,