<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [5]:
df.shape

(5572, 3)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [8]:
X_train.shape

(4457,)

In [9]:
X_test.shape

(1115,)

In [10]:
type(X_train)

In [11]:
X_train[:4]

5112    December only! Had your mobile 11mths+? You ar...
4330    1Apple/Day=No Doctor. 1Tulsi Leaf/Day=No Cance...
1020                      Don know..wait i will check it.
108     How would my ip address test that considering ...
Name: Message, dtype: object

In [12]:
type(y_train)

In [13]:
y_train[:4]

5112    1
4330    0
1020    0
108     0
Name: spam, dtype: int64

In [14]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7777 sparse matrix of type '<class 'numpy.int64'>'
	with 59595 stored elements in Compressed Sparse Row format>

In [16]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [17]:
X_train_cv.shape

(4457, 7777)

In [18]:
v.get_feature_names_out()[1771]

'cheetos'

In [19]:
v.vocabulary_

{'december': 2198,
 'only': 4962,
 'had': 3294,
 'your': 7744,
 'mobile': 4554,
 '11mths': 272,
 'you': 7738,
 'are': 1062,
 'entitled': 2613,
 'to': 6961,
 'update': 7218,
 'the': 6846,
 'latest': 4039,
 'colour': 1895,
 'camera': 1637,
 'for': 2943,
 'free': 2990,
 'call': 1617,
 'vco': 7288,
 'on': 4950,
 '08002986906': 54,
 '1apple': 322,
 'day': 2179,
 'no': 4814,
 'doctor': 2373,
 '1tulsi': 335,
 'leaf': 4065,
 'cancer': 1648,
 '1lemon': 327,
 'fat': 2787,
 '1cup': 324,
 'milk': 4492,
 'bone': 1432,
 'problms': 5453,
 'litres': 4162,
 'watr': 7434,
 'diseases': 2345,
 'snd': 6305,
 'ths': 6907,
 'whom': 7528,
 'care': 1665,
 'don': 2399,
 'know': 3970,
 'wait': 7381,
 'will': 7548,
 'check': 1758,
 'it': 3764,
 'how': 3520,
 'would': 7643,
 'my': 4680,
 'ip': 3735,
 'address': 823,
 'test': 6816,
 'that': 6843,
 'considering': 1973,
 'computer': 1938,
 'isn': 3759,
 'minecraft': 4503,
 'server': 6059,
 'think': 6873,
 'should': 6154,
 'go': 3163,
 'honesty': 3479,
 'road': 5831,


In [20]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [21]:
np.where(X_train_np[0]!=0)

(array([  54,  272, 1062, 1617, 1637, 1895, 2198, 2613, 2943, 2990, 3294,
        4039, 4554, 4950, 4962, 6846, 6961, 7218, 7288, 7738, 7744]),)

In [38]:
#X_train[:4][1579]
X_train[1062]

'I might come to kerala for 2 days.so you can be prepared to take a leave once i finalise .dont plan any travel during my visit.need to finish urgent works.'

In [37]:
X_train_np[0][1062]

1

<h3>Train the naive bayes model</h3>

In [25]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [26]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [27]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       964
           1       0.98      0.92      0.95       151

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [35]:
emails = [
    'Hey Ali, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [29]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [30]:
clf.fit(X_train, y_train)

In [31]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       964
           1       0.98      0.92      0.95       151

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

