In [2]:
#Import library
import pandas as pd

In [3]:
#Import data
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
#Pre-processing data - changing spam into binarary integers
df["Spam"] = df["Category"].apply(lambda x: 1 if x=="spam" else 0)
df = df.drop(columns=["Category"])
df.head()

Unnamed: 0,Message,Spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [20]:
#Split train and test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.Spam, test_size=0.2)

X_train.shape

(4457,)

In [26]:
#Vectorize using bag of words
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()
X_train_cv = v.fit_transform(X_train.values) #fit and transform the vocabulary
X_test_cv = v.transform(X_test) #transform X_test into bag of words

X_train_cv.toarray().shape

(4457, 7645)

In [21]:
v.vocabulary_ #print the vocabulary

{'mum': 4636,
 've': 7273,
 'sent': 6030,
 'you': 7721,
 'many': 4347,
 'messages': 4455,
 'since': 6181,
 'got': 3203,
 'here': 3413,
 'just': 3874,
 'want': 7387,
 'to': 6948,
 'know': 3961,
 'that': 6820,
 'are': 1040,
 'actually': 788,
 'getting': 3126,
 'them': 6831,
 'do': 2368,
 'enjoy': 2598,
 'the': 6823,
 'rest': 5758,
 'of': 4886,
 'your': 7726,
 'day': 2172,
 'haha': 3298,
 'yeah': 7693,
 'see': 5993,
 'now': 4835,
 'be': 1258,
 'there': 6841,
 'in': 3637,
 'sec': 5981,
 'today': 6955,
 'offer': 4892,
 'claim': 1811,
 'ur': 7215,
 '150': 304,
 'worth': 7619,
 'discount': 2341,
 'vouchers': 7337,
 'text': 6798,
 'yes': 7705,
 '85023': 662,
 'savamob': 5933,
 'member': 4436,
 'offers': 4895,
 'mobile': 4538,
 'cs': 2079,
 '08717898035': 136,
 '00': 0,
 'sub': 6564,
 '16': 314,
 'unsub': 7193,
 'reply': 5725,
 'life': 4101,
 'spend': 6384,
 'with': 7560,
 'someone': 6307,
 'for': 2940,
 'lifetime': 4104,
 'may': 4397,
 'meaningless': 4411,
 'but': 1561,
 'few': 2816,
 'moments

In [24]:
#Training naive bayes model
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [27]:
#Test results
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.98      0.93      0.95       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [28]:
#Alternate approach
#Creating a pipeline to vectorize and train the naive bayes model

from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [29]:
clf.fit(X_train, y_train)

In [30]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.98      0.93      0.95       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



Limitations:
1. High sparsity, waste huge computational space as there are a lot of redundant word.
2. Unable to derive any meaning from words since they are only individuals.
3. Out of Vocabulary problem, unable to derive meaning for any words not in vocabulary

Possible solutions:
1. Stop words, but not for all cases, only useful if we need to identify key words.
2. Possibly use n-gram to add additional meaning to the order of words used.