# Bag Of Words (BOW)

In [311]:
import pandas as pd
import numpy as np

In [312]:
df = pd.read_csv("spam.csv")
df.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [313]:
df.shape

(5572, 2)

In [314]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [315]:
df['spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

 - Spam = 1
 - Ham = 0

In [316]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


# Train test split

In [317]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [318]:
X_train.shape

(4457,)

In [319]:
X_test.shape

(1115,)

In [320]:
X_train[:4]

2557    Fuck babe ... What happened to you ? How come ...
3649    As per your request 'Maangalyam (Alaipayuthe)'...
3422    Welcome! Please reply with your AGE and GENDER...
1782                               ;-( oh well, c u later
Name: Message, dtype: object

In [321]:

y_train[:4]

2557    0
3649    0
3422    1
1782    0
Name: spam, dtype: int64

In [322]:
type(X_train.values)

numpy.ndarray

# Create bag of words representation using CountVectorizer

In [323]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()
X_train_cv = v.fit_transform(X_train.values)
X_train_cv


<4457x7746 sparse matrix of type '<class 'numpy.int64'>'
	with 59341 stored elements in Compressed Sparse Row format>

In [324]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [325]:
X_train_cv.shape

(4457, 7746)

In [326]:
v.get_feature_names_out()[1771]

'checkboxes'

In [327]:
v.get_feature_names_out().shape

(7746,)

In [328]:
v.vocabulary_

{'fuck': 3036,
 'babe': 1195,
 'what': 7477,
 'happened': 3339,
 'to': 6936,
 'you': 7708,
 'how': 3527,
 'come': 1922,
 'never': 4760,
 'came': 1643,
 'back': 1204,
 'as': 1097,
 'per': 5156,
 'your': 7714,
 'request': 5733,
 'maangalyam': 4307,
 'alaipayuthe': 918,
 'has': 3356,
 'been': 1301,
 'set': 6043,
 'callertune': 1633,
 'for': 2950,
 'all': 934,
 'callers': 1632,
 'press': 5403,
 'copy': 2020,
 'friends': 3013,
 'welcome': 7453,
 'please': 5259,
 'reply': 5727,
 'with': 7552,
 'age': 882,
 'and': 980,
 'gender': 3112,
 'begin': 1310,
 '24m': 378,
 'oh': 4919,
 'well': 7455,
 'later': 4044,
 'does': 2403,
 'daddy': 2159,
 'have': 3370,
 'bb': 1265,
 'now': 4852,
 'so': 6288,
 'li': 4112,
 'hai': 3308,
 'me': 4420,
 'bored': 1451,
 'da': 2154,
 'lecturer': 4085,
 'repeating': 5720,
 'last': 4038,
 'weeks': 7443,
 'stuff': 6547,
 'waste': 7392,
 'time': 6901,
 'yo': 7704,
 'around': 1083,
 'friend': 3012,
 'of': 4901,
 'mine': 4503,
 'lookin': 4217,
 'pick': 5210,
 'up': 7185,


In [329]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [330]:
np.where(X_train_np[0]!=0)

(array([1195, 1204, 1643, 1922, 3036, 3339, 3527, 4760, 6936, 7477, 7708],
       dtype=int64),)

In [333]:
X_train[:4][3422]

'Welcome! Please reply with your AGE and GENDER to begin. e.g 24M'

In [336]:
X_train_np[0][1771]

0

# Train the naive bayes model

In [337]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [338]:
X_test_cv = v.transform(X_test)

In [340]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       988
           1       0.98      0.92      0.95       127

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



# testing ...

In [342]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_cnt = v.transform(emails)
model.predict(emails_cnt)

array([0, 1], dtype=int64)

# Using Pipeline(sklearn)

In [343]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [347]:
clf.fit(X_train, y_train)

In [349]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred ))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       988
           1       0.98      0.92      0.95       127

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

