## Read in text

In [4]:
# import libraries
import pandas as pd
import re
import string 
import nltk
pd.set_option('display.max_colwidth',100)

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMSSpamCollection.tsv",sep='\t')
data.columns=['label','body_text']

In [8]:
import sklearn
print(sklearn.__version__)

1.3.0


###  Create a function to remove punctuation, tokenize, remove stopwords, and stem.

In [5]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

### Apply CountVectorizer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer = clean_text)
X_counts = count_vect.fit_transform(data['body_text'])
print(X_counts.shape)
print(count_vect.get_feature_names_out())
print(X_counts)

(5567, 8104)
['' '0' '008704050406' ... 'ü' 'üll' '〨ud']
  (0, 3134)	1
  (0, 2790)	2
  (0, 436)	1
  (0, 7816)	1
  (0, 2120)	1
  (0, 7782)	1
  (0, 2909)	2
  (0, 2288)	1
  (0, 3011)	1
  (0, 7168)	1
  (0, 456)	1
  (0, 4640)	1
  (0, 443)	1
  (0, 7027)	1
  (0, 879)	1
  (0, 5917)	1
  (0, 5829)	1
  (0, 7350)	1
  (0, 5876)	1
  (0, 1228)	1
  (0, 73)	1
  (1, 4931)	1
  (1, 2586)	1
  (1, 7095)	1
  (1, 3332)	1
  :	:
  (5563, 3320)	1
  (5563, 8101)	1
  (5563, 3123)	1
  (5563, 2818)	1
  (5564, 6830)	1
  (5564, 4833)	1
  (5564, 5528)	1
  (5564, 6528)	1
  (5565, 3134)	1
  (5565, 4369)	1
  (5565, 7693)	1
  (5565, 5015)	1
  (5565, 7473)	1
  (5565, 6550)	1
  (5565, 1776)	1
  (5565, 2748)	1
  (5565, 3239)	1
  (5565, 3462)	1
  (5565, 3801)	1
  (5565, 3916)	1
  (5565, 997)	1
  (5565, 1564)	1
  (5566, 4937)	1
  (5566, 7306)	1
  (5566, 6070)	1


### Seperating Dependent and Independent Variable.

In [9]:
X = X_counts.toarray()
y = data.iloc[:,0].values

In [10]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
y

array(['spam', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

### Encoding the dependent variable(y)

In [12]:
from sklearn.preprocessing import LabelEncoder
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

### Splitting the dataset

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.20, random_state = 0)

### Using Naive Bayes Classifier

In [16]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train,y_train)

### Predicting Results

In [17]:
y_pred = classifier.predict(X_test)

## Confusion matrix

In [18]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)

In [19]:
cm

array([[841, 118],
       [ 12, 143]], dtype=int64)

Here principal diagonal elements (841,143) are the correct outputs. and (12,118) are incorrect outputs.

(841+143)/(841+143+118+12) is the accuracy

In [20]:
print((841+143)/(841+143+118+12))

0.8833034111310593


88% accurate.