In [38]:
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

In [2]:
# Data loading
dados = pd.read_csv('sms_spam.csv')

In [3]:
dados.describe()

Unnamed: 0,type,text
count,5559,5559
unique,2,5156
top,ham,"Sorry, I'll call later"
freq,4812,30


In [4]:
dados.head()

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or £10,000 ..."
4,spam,okmail: Dear Dave this is your final notice to...


In [7]:
dados.type.value_counts(normalize = True)

ham     0.865623
spam    0.134377
Name: type, dtype: float64

In [13]:
random_data = dados.sample(frac = 1, random_state = 1)
X = random_data.text
y = random_data.type

In [32]:
X_rand_train, X_rand_test, y_rand_train, y_rand_test = train_test_split(X,y)

In [33]:
y_rand_train.value_counts(normalize=True)

ham     0.867594
spam    0.132406
Name: type, dtype: float64

In [34]:
y_rand_test.value_counts(normalize=True)

ham     0.859712
spam    0.140288
Name: type, dtype: float64

# Text Prep

In [56]:
# Without stepwords
vect_teste = CountVectorizer(ngram_range = (1,2 ), stop_words = 'english').fit(X_rand_train)
x_teste = vect_teste.transform(X_rand_train)

In [55]:
vect_teste.get_feature_names_out()

array(['document', 'document second', 'second', 'second document'],
      dtype=object)

In [35]:
# With stepwords
vectorizer = CountVectorizer(ngram_range = (1,2 )).fit(X_rand_train)
X_train_vec= vectorizer.transform(X_rand_train)
X_train_vec.toarray().shape

(4169, 40524)

In [62]:
vectorizer.get_feature_names_out()

array(['00', '00 in', '00 per', ..., 'éˆ ud', 'œharry', 'œharry potter'],
      dtype=object)

In [36]:
X_rand_train

4333    URGENT We are trying to contact you Last weeke...
1896                               Where you. What happen
1661    WINNER!! As a valued network customer you have...
2252    Huh so fast... Dat means u havent finished pai...
2823    Yes princess! I want to catch you with my big ...
                              ...                        
2265    Your account has been credited with 500 FREE T...
2548    Sounds gd... Haha... Can... Wah, u yan jiu so ...
4544           Yep then is fine 7.30 or 8.30 for ice age.
808                                Waiting for your call.
5364    Eh u remember how 2 spell his name... Yes i di...
Name: text, Length: 4169, dtype: object

In [37]:
X_train_vec

<4169x40524 sparse matrix of type '<class 'numpy.int64'>'
	with 109106 stored elements in Compressed Sparse Row format>

In [57]:
model = MultinomialNB(alpha = 0.1)
model.fit(X_train_vec, y_rand_train)#X_train_vec, y_rand_train)

MultinomialNB(alpha=0.1)

In [59]:
predictions = model.predict(vectorizer.transform(X_rand_test))
print("Accuracy:", 100 * sum(predictions == y_rand_test) / len(predictions), '%')

Accuracy: 98.48920863309353 %


In [61]:
model.predict(vect_teste.transform(
    [
        "How are you? I need to know the results from last month, please.",
        "Do you need any help? Call me",
        "1-month unlimited calls offer Activate now",
        "LAST CHANCE! This is a chance of a lifetime!!",
    ])
            )

array(['ham', 'ham', 'spam', 'spam'], dtype='<U4')