In [28]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud

In [1]:
# data from:
# https://www.kaggle.com/uciml/sms-spam-collection-dataset
!wget https://lazyprogrammer.me/course_files/spam.csv

--2024-07-05 00:20:24--  https://lazyprogrammer.me/course_files/spam.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 104.21.23.210, 172.67.213.166, 2606:4700:3031::6815:17d2, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|104.21.23.210|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 503663 (492K) [text/csv]
Saving to: ‘spam.csv’


2024-07-05 00:20:26 (1.39 MB/s) - ‘spam.csv’ saved [503663/503663]



In [4]:
# The file contains some invalid (non-UTF-8) characters; depending on the
# pandas version, reading it without an explicit encoding may raise.
df = pd.read_csv(
    'spam.csv',
    encoding='ISO-8859-1',
)

In [5]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
# Discard the three "Unnamed" columns; only v1 and v2 are kept.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])

In [7]:
# Check that only the v1 / v2 columns remain.
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Rename the columns by name rather than by position: v1 -> labels, v2 -> data.
# A name-based rename stays valid when this cell is re-run after further
# columns have been added to df, where a positional assignment would fail
# with a length mismatch.
df = df.rename(columns={'v1': 'labels', 'v2': 'data'})

In [9]:
# Check that the columns are now called labels and data.
df.head()

Unnamed: 0,labels,data
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Class balance: histogram of the labels column.
df['labels'].hist()

In [11]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
df['b_labels'] = df['labels'].map({
    'ham': 0,
    'spam': 1,
})

In [12]:
# Target vector: the integer labels as a NumPy array.
Y = df['b_labels'].to_numpy()

In [13]:
# Inspect the target array.
Y

array([0, 0, 1, ..., 0, 0, 0])

In [14]:
# Split the raw messages and targets into train and test sets (33% held out).
# random_state is fixed so the split, and every score computed from it,
# is reproducible across kernel restarts.
df_train, df_test, Ytrain, Ytest = train_test_split(
    df['data'], Y, test_size=0.33, random_state=42)

In [16]:
# Try multiple ways of calculating features.
# Alternative (TF-IDF weights instead of raw counts):
#featurizer = TfidfVectorizer(decode_error='ignore')
#Xtrain = featurizer.fit_transform(df_train)
#Xtest = featurizer.transform(df_test)

# Bag-of-words counts. The featurizer is fitted on the training messages only;
# the test messages are transformed with that already-fitted featurizer.
featurizer = CountVectorizer(decode_error='ignore')
Xtrain = featurizer.fit_transform(df_train)
Xtest = featurizer.transform(df_test)

In [17]:
# Inspect the training feature matrix (shape, dtype, sparsity).
Xtrain

<3733x6973 sparse matrix of type '<class 'numpy.int64'>'
	with 49268 stored elements in Compressed Sparse Row format>

In [19]:
# Create the model, train it on the training split, then report the mean
# accuracy on both splits.
model = MultinomialNB().fit(Xtrain, Ytrain)
print("train acc:", model.score(Xtrain, Ytrain))
print("test acc:", model.score(Xtest, Ytest))

train acc: 0.993838735601393
test acc: 0.988036976617727


In [20]:
# Hard class predictions for both splits, then the F1 score of each.
# Ptrain is reused further down to build the confusion matrix.
Ptrain = model.predict(Xtrain)
Ptest = model.predict(Xtest)
print("train F1 : ", f1_score(Ytrain, Ptrain))
print("test F1 : ", f1_score(Ytest, Ptest))

train F1 :  0.977205153617443
test F1 :  0.9527896995708154


In [21]:
# AUC is computed from scores rather than hard labels: take column 1 of
# predict_proba (presumably P(spam), since labels are encoded 0 = ham, 1 = spam).
Prob_train = model.predict_proba(Xtrain)[:,1]
Prob_test = model.predict_proba(Xtest)[:,1]
print("train AUC:", roc_auc_score(Ytrain, Prob_train))
print("test AUC:", roc_auc_score(Ytest, Prob_test))

train AUC: 0.9946257662410078
test AUC: 0.9778311965811965


In [22]:
# Confusion matrix of true vs. predicted labels on the training split.
cm = confusion_matrix(Ytrain, Ptrain)

In [23]:
# Show the raw counts.
cm

array([[3217,    9],
       [  14,  493]])

In [24]:
def plot_cm(cm):
  """Draw a confusion matrix as an annotated heatmap.

  Args:
    cm: 2x2 array-like of counts. Both rows and columns are labelled
      'ham' then 'spam'; rows go on the "Target" axis and columns on
      the "Predicted" axis.
  """
  classes = ['ham', 'spam']
  df_cm = pd.DataFrame(cm, index=classes, columns=classes)
  # fmt='g' prints the cell counts as plain numbers
  ax = sn.heatmap(df_cm, annot=True, fmt='g')
  ax.set_xlabel("Predicted")
  ax.set_ylabel("Target")

# Call at cell level: indented into the function body, this line made
# plot_cm recurse without end and never drew anything.
plot_cm(cm)

In [29]:
# visualize the data
def visualize(label):
  """Show one word cloud built from every message carrying `label`.

  Args:
    label: value of df['labels'] to select ('spam' or 'ham' in this notebook).
  """
  messages = df[df['labels'] == label]['data']
  if messages.empty:
    # Nothing to draw for an unknown label; skip rather than hand
    # WordCloud an empty string.
    print(f"no messages with label {label!r}")
    return

  # Lower-case and join all matching messages once. The cloud is rendered a
  # single time after the text is assembled, not once per message.
  words = ' '.join(msg.lower() for msg in messages)
  wordcloud = WordCloud(width=600, height=400).generate(words)
  plt.imshow(wordcloud)
  plt.axis('off')
  plt.show()

In [None]:
# Word cloud of the messages labelled spam.
visualize('spam')

In [None]:
# Word cloud of the messages labelled ham.
visualize('ham')

In [33]:
# What does the model get wrong? Score the whole dataset (train and test rows
# together) with the fitted featurizer and model, and store the predictions
# next to the true labels so misclassified messages can be inspected.
X = featurizer.transform(df['data'])
df['predictions'] = model.predict(X)

In [None]:
# Spam that slipped through: true label is spam (1) but the model said ham (0).
sneaky_spam = df.loc[(df['b_labels'] == 1) & (df['predictions'] == 0), 'data']
for msg in sneaky_spam:
  print(msg)

In [None]:
# False alarms: true label is ham (0) but the model said spam (1).
not_actually_spam = df.loc[(df['b_labels'] == 0) & (df['predictions'] == 1), 'data']
for msg in not_actually_spam:
  print(msg)