<h2 align='center'>NLP Tutorial: Text Representation - Bag of Words (BoW)</h2>

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the SMS dataset; the preview shows a Category label (ham/spam) and the Message text.
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Class balance of the labels (bracket access avoids clashes with DataFrame attributes).
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [6]:
# (rows, columns) of the loaded frame.
df.shape

(5572, 2)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): labels are imbalanced (4825 ham vs 747 spam) - consider stratify=df['Category'].
X_train, X_test, y_train, y_test = train_test_split(
    df['Message'],
    df['Category'],
    test_size=0.2,
    random_state=3,
)

In [10]:
# Training messages; the index still carries the original (shuffled) row labels from df.
X_train

3075                  Don know. I did't msg him recently.
1787    Do you know why god created gap between your f...
1614                         Thnx dude. u guys out 2nite?
4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 4457, dtype: object

In [12]:
# Skip the first three rows by position. The index holds shuffled integer labels, so use
# .iloc to make it explicit that this is a positional slice, not a label lookup.
X_train.iloc[3:]

4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
2413    I don't know u and u don't know me. Send CHAT ...
4539     Dare i ask... Any luck with sorting out the car?
3000    Oh, then your phone phoned me but it disconnected
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 4454, dtype: object

In [13]:
# Raw message strings as an array, without the pandas index.
X_train.values

array(["Don know. I did't msg him recently.",
       'Do you know why god created gap between your fingers..? So that, One who is made for you comes &amp; fills those gaps by holding your hand with LOVE..!',
       'Thnx dude. u guys out 2nite?', ...,
       'Guess who spent all last night phasing in and out of the fourth dimension',
       'Eh sorry leh... I din c ur msg. Not sad already lar. Me watching tv now. U still in office?',
       'Free Top ringtone -sub to weekly ringtone-get 1st week free-send SUBPOLY to 81618-?3 per week-stop sms-08718727870'],
      dtype=object)

In [14]:
# One-dimensional: a single string per training message.
X_train.values.shape

(4457,)

In [15]:
# Check which container type .values hands back.
type(X_train.values)

numpy.ndarray

<h3>Create a bag-of-words representation using CountVectorizer</h3>

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer with default settings.
vector = CountVectorizer()

In [20]:
# Two toy messages to show what a bag-of-words matrix looks like.
emails = [
    'Upto 20% discount reward!',
    'Hey mohan, discount Upto',
]

# Learn the vocabulary and count the tokens in one step; the result is a sparse matrix.
emails_cv = vector.fit_transform(emails)
emails_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 8 stored elements and shape (2, 6)>

In [21]:
# Dense view: one row per message, one column per vocabulary token, cells are counts.
emails_cv.toarray()

array([[1, 1, 0, 0, 1, 1],
       [0, 1, 1, 1, 0, 1]], dtype=int64)

In [26]:
# Tokens in the same order as the matrix columns.
vector.get_feature_names_out()

array(['20', 'discount', 'hey', 'mohan', 'reward', 'upto'], dtype=object)

In [28]:
# Label each count column with its token so the matrix is readable.
pd.DataFrame(
    emails_cv.toarray(),
    columns=vector.get_feature_names_out(),
)

Unnamed: 0,20,discount,hey,mohan,reward,upto
0,1,1,0,0,1,1
1,0,1,1,1,0,1


In [29]:
# Separate vectorizer instance for the real dataset.
v = CountVectorizer()

In [31]:
# Learn the vocabulary from the training messages and build their sparse count matrix.
X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59126 stored elements and shape (4457, 7694)>

In [35]:
# Dense copy for inspection only.
# NOTE(review): at 4457 x 7694 int64 this is roughly 270 MB - keep the sparse matrix for real work.
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [36]:
# Tokens in the same order as the columns of X_train_cv.
v.get_feature_names_out()

array(['00', '000', '000pes', ..., 'èn', 'ú1', '〨ud'], dtype=object)

In [33]:
# Build the labelled view straight from the sparse matrix instead of densifying it first
# (a dense 4457 x 7694 int64 array is roughly 270 MB); the displayed counts are the same.
pd.DataFrame.sparse.from_spmatrix(X_train_cv, columns=v.get_feature_names_out())

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zeros,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Mapping from each token to its column index in the count matrix.
v.vocabulary_

{'don': 2390,
 'know': 3934,
 'did': 2284,
 'msg': 4594,
 'him': 3403,
 'recently': 5591,
 'do': 2356,
 'you': 7654,
 'why': 7454,
 'god': 3146,
 'created': 2047,
 'gap': 3053,
 'between': 1316,
 'your': 7660,
 'fingers': 2845,
 'so': 6239,
 'that': 6769,
 'one': 4909,
 'who': 7447,
 'is': 3715,
 'made': 4283,
 'for': 2921,
 'comes': 1894,
 'amp': 937,
 'fills': 2832,
 'those': 6817,
 'gaps': 3054,
 'by': 1576,
 'holding': 3434,
 'hand': 3286,
 'with': 7500,
 'love': 4206,
 'thnx': 6815,
 'dude': 2467,
 'guys': 3256,
 'out': 4978,
 '2nite': 407,
 'yup': 7677,
 'free': 2963,
 '44': 487,
 '7732584351': 621,
 'want': 7325,
 'new': 4727,
 'nokia': 4777,
 '3510i': 445,
 'colour': 1883,
 'phone': 5160,
 'deliveredtomorrow': 2217,
 '300': 425,
 'minutes': 4476,
 'to': 6892,
 'any': 979,
 'mobile': 4520,
 '100': 251,
 'texts': 6754,
 'camcorder': 1610,
 'reply': 5675,
 'or': 4944,
 'call': 1592,
 '08000930705': 50,
 'and': 948,
 'me': 4383,
 'send': 5961,
 'chat': 1727,
 '86688': 682,
 'now': 

In [38]:
# Keep a dense copy so individual rows can be inspected with plain NumPy indexing.
X_train_np = X_train_cv.toarray()
# Count vector of the first training message.
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [39]:
# Column indices of the tokens present in the first training message.
# np.nonzero is the documented, preferred form of single-argument np.where.
np.nonzero(X_train_np[0])

(array([2284, 2390, 3403, 3934, 4594, 5591], dtype=int64),)

In [42]:
# Count stored for column 5591 in the first training message.
X_train_np[0, 5591]

1

<h3>Train the Naive Bayes model</h3>

In [44]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the raw token counts; fit() returns the fitted estimator.
nb = MultinomialNB().fit(X_train_cv, y_train)
nb

In [45]:
# transform only (no fit): test messages are encoded with the vocabulary learned from training.
X_test_cv = v.transform(X_test)

In [46]:
# The sparse matrix exposes its shape directly; no need to materialize a dense copy.
X_test_cv.shape

(1115, 7694)

In [47]:
# Mean accuracy on the held-out messages.
# NOTE(review): only 747 of 5572 messages are spam, so accuracy alone can flatter the model -
# consider reporting precision/recall as well.
nb.score(X_test_cv,y_test)

0.9901345291479821