## 🔷 Concept of Bag of Words
The core idea of BoW is:

Treat each document (or sentence) as a collection of words, ignoring grammar and word order.

Build a vocabulary (dictionary) of all unique words in your corpus.

Represent each document as a vector: each position in the vector corresponds to a word in the vocabulary, and the value is usually the frequency of that word in the document.

In [10]:
import pandas as pd
messages=pd.read_csv('Datasets/spam.csv', sep=',', names=["label","message"])

In [11]:
messages

Unnamed: 0,label,message
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,Even my brother is not like to speak with me. ...
4,ham,As per your request 'Melle Melle (Oru Minnamin...
...,...,...
66,ham,Smile in Pleasure Smile in Pain Smile when tro...
67,spam,Please call our customer service representativ...
68,ham,Havent planning to buy later. I check already ...
69,ham,Watching telugu movie..wat abt u?


In [12]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
ps=PorterStemmer()

In [13]:
corpus=[]
for i in range(0,len(messages)):
    review=re.sub('[^a-zA-z]',' ',messages['message'][i])
    review=review.lower()
    review=review.split()
    review=[ps.stem(word) for word in review if not word in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

In [14]:
corpus

['ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'search right word thank breather promis wont take help grant fulfil promis wonder bless time',
 'date sunday',
 'oh k watch',
 'eh u rememb spell name ye v naughti make v wet',
 'fine way u feel way gota b',
 'serious spell name',
 'go tri month ha ha joke',
 '_ pay first lar da stock comin',
 'aft finish lunch go str lor ard smth lor u finish ur lunch alreadi',
 'ffffffffff alright way meet sooner',
 'forc eat slice realli hungri tho suck mark get worri know sick turn pizza lol

## Create Bag Of Words

In [18]:
## Create the Bag OF Words model
from sklearn.feature_extraction.text import CountVectorizer
## for Binary BOW enable binary=True
cv=CountVectorizer(max_features=100)

In [19]:
X=cv.fit_transform(corpus).toarray()

In [20]:
import numpy as np
np.set_printoptions(edgeitems=30, linewidth=100000, 
    formatter=dict(float=lambda x: "%.3g" % x))
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 1, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0

In [24]:
# using lemmitizer, part of speech and stop words
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag, word_tokenize

# 1. Helper function to convert Treebank tags to WordNet tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default to noun

lemmatizer = WordNetLemmatizer()

In [25]:
corpus=[]
for i in range(0,len(messages)):
    review=re.sub('[^a-zA-z]',' ',messages['message'][i])
    review=review.lower()
    review=word_tokenize(review)
    review=[word for word in review if not word in stopwords.words('english')]
    review=pos_tag(review)
    review=[lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in review]
    review=' '.join(review)
    corpus.append(review)

In [26]:
corpus

['ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner value network customer select receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitle update late colour mobile camera free call mobile update co free',
 'search right word thank breather promise wont take help grant fulfil promise wonderful blessing time',
 'date sunday',
 'oh k watch',
 'eh u remember spell name yes v naughty make v wet',
 'fine way u feel way gota b',
 'seriously spell name',
 'go try month ha ha joking',
 '_ pay first lar da stock comin',
 'aft finish lunch go str lor ard smth lor u finish ur lunch already',
 'ffffffffff alright way meet sooner',
 'force eat slice really hungry tho suck mark ge

In [27]:
X=cv.fit_transform(corpus).toarray()

In [28]:
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 1, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0

In [29]:
cv.vocabulary_

{'ok': np.int64(60),
 'lar': np.int64(39),
 'joking': np.int64(37),
 'wif': np.int64(92),
 'free': np.int64(28),
 'entry': np.int64(22),
 'fa': np.int64(24),
 'may': np.int64(51),
 'say': np.int64(71),
 'early': np.int64(19),
 'already': np.int64(1),
 'even': np.int64(23),
 'like': np.int64(41),
 'treat': np.int64(86),
 'request': np.int64(70),
 'melle': np.int64(53),
 'callertune': np.int64(6),
 'customer': np.int64(13),
 'prize': np.int64(65),
 'claim': np.int64(11),
 'call': np.int64(5),
 'mobile': np.int64(54),
 'month': np.int64(55),
 'update': np.int64(88),
 'late': np.int64(40),
 'promise': np.int64(66),
 'time': np.int64(83),
 'watch': np.int64(89),
 'remember': np.int64(68),
 'spell': np.int64(76),
 'name': np.int64(59),
 'yes': np.int64(97),
 'make': np.int64(48),
 'way': np.int64(90),
 'feel': np.int64(25),
 'go': np.int64(30),
 'try': np.int64(87),
 'ha': np.int64(33),
 'pay': np.int64(62),
 'first': np.int64(27),
 'da': np.int64(15),
 'finish': np.int64(26),
 'lunch': np.i

## Ngrams

In [30]:
## Create the Bag OF Words model with ngram
from sklearn.feature_extraction.text import CountVectorizer
## for Binary BOW enable binary=True
cv=CountVectorizer(max_features=100,binary=True,ngram_range=(2,3))
X=cv.fit_transform(corpus).toarray()

In [31]:
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0

In [32]:
cv.vocabulary_

{'ok lar': np.int64(79),
 'already say': np.int64(15),
 'brother like': np.int64(32),
 'aid patent': np.int64(13),
 'brother like speak': np.int64(33),
 'callertune caller': np.int64(58),
 'caller press': np.int64(56),
 'callertune caller press': np.int64(59),
 'caller press copy': np.int64(57),
 'call claim': np.int64(43),
 'call claim code': np.int64(44),
 'camera free': np.int64(62),
 'call mobile': np.int64(52),
 'camera free call': np.int64(63),
 'call mobile update': np.int64(53),
 'breather promise': np.int64(30),
 'blessing time': np.int64(29),
 'breather promise wont': np.int64(31),
 'spell name': np.int64(81),
 'yes naughty': np.int64(96),
 'yes naughty make': np.int64(97),
 'ha ha': np.int64(77),
 'aft finish': np.int64(3),
 'ard smth': np.int64(16),
 'aft finish lunch': np.int64(4),
 'ard smth lor': np.int64(17),
 'worried know': np.int64(84),
 'worried know sick': np.int64(85),
 'catch bus': np.int64(65),
 'bus fry': np.int64(38),
 'catch bus fry': np.int64(66),
 'bus fry 