In [1]:
import pandas as pd
# Load the tab-separated SMS dataset; `names` supplies the two column labels.
messages=pd.read_csv(
    'SpamClassifier-master/smsspamcollection/SMSSpamCollection',
    sep='\t',
    names=["label","message"],
)

In [2]:
# Preview the first 20 rows of the loaded frame (label and message columns)
messages.head(20)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
## Data Cleaning And Preprocessing
import re
import nltk
# Uncomment and run once to download the NLTK stopword list used by the cleaning step
# nltk.download('stopwords')

In [4]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance, reused for every word during cleaning
ps=PorterStemmer()

In [5]:
# Build the cleaned corpus: one lower-cased, stopword-free, stemmed string per message.
# stopwords.words('english') is evaluated once here (instead of once per word)
# and stored as a set so each membership test is a hash lookup.
english_stopwords=set(stopwords.words('english'))
corpus=[]
for message in messages['message']:
    # replace every character that is not a-z or A-Z with a blank space
    # (the range must be 'A-Z'; 'A-z' would also keep [ \ ] ^ _ and `)
    review=re.sub('[^a-zA-Z]',' ',message)
    review=review.lower() # lower case each character
    review=review.split() # convert each review into list of words
    # remove stopwords, stem what is left and join the words back into a sentence
    review=' '.join(ps.stem(word) for word in review if word not in english_stopwords)
    # append the cleaned sentence to the corpus list
    corpus.append(review)

The above preprocessing approach can be reused in many other problem statements.

In [6]:
# Inspect the first 20 cleaned messages
corpus[:20]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather

The corpus created above will be our new input.

#### Create Bag Of Words

Use [sklearn's CountVectorizer](https://scikit-learn.org/1.5/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) to create a Bag of words model.

![](images/28.png)

`CountVectorizer` also has parameters for lowercasing and for removing stopwords, but since we have already done both during preprocessing, our focus will be on the `ngram_range` and `max_features` parameters.

The `max_features` parameter tells the vectorizer to keep only the x words (2500 here) that have the highest frequency.

In [7]:
## Create the Bag OF Words model
from sklearn.feature_extraction.text import CountVectorizer
## binary=True enables Binary BOW; max_features caps the vocabulary at 2500 terms
cv=CountVectorizer(
    max_features=2500,
    binary=True,
)

In [8]:
# Fit the vectorizer on the cleaned corpus and convert the result to a dense array
X=cv.fit_transform(corpus).toarray()

As we have chosen 2500 words, we will have 2500 features (columns).

In [9]:
# (number of messages, number of features)
X.shape

(5572, 2500)

Now change `max_features` to 100.

In [10]:
# Rebuild the vectorizer with a 100-term vocabulary (binary is left at its default)
cv=CountVectorizer(
    max_features=100,
)
X=(
    cv
    .fit_transform(corpus)
    .toarray()
)
X.shape

(5572, 100)

In [11]:
import numpy as np
# Adjust numpy's print settings (more edge items, very long line width,
# compact float format) before showing the first 10 rows of X
np.set_printoptions(
    edgeitems=30,
    linewidth=100000,
    formatter={"float": lambda value: "%.3g" % value},
)
X[:10]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

So the vector can contain 2's as well as 1's (the number of times a word occurs in the message). By default, `CountVectorizer` gives a normal (count-based) BOW.

So to get a Binary BOW, we only have to set the parameter `binary` to _True_ (by default it is _False_).  
> `binary=True`

In [12]:
# Same 100-term vocabulary, now as a Binary BOW
cv=CountVectorizer(
    max_features=100,
    binary=True,
)
X=(
    cv
    .fit_transform(corpus)
    .toarray()
)
X.shape

(5572, 100)

In [13]:
# Re-check the shape of the feature matrix
X.shape

(5572, 100)

To see the top 100 words, run the code below.

The values are the column indexes. So, at a quick look, the 4th column will be __back__, as can be seen from the key-value pair in the dictionary - `'back': np.int64(3)`. 

And the 1st column will be `'alreadi': np.int64(0)`.

So it will have indexes from 0 to 99.

In [14]:
# Dictionary of each retained term and its column index
cv.vocabulary_

{'go': np.int64(22),
 'great': np.int64(25),
 'got': np.int64(24),
 'wat': np.int64(90),
 'ok': np.int64(56),
 'free': np.int64(18),
 'win': np.int64(94),
 'text': np.int64(77),
 'txt': np.int64(85),
 'say': np.int64(67),
 'alreadi': np.int64(0),
 'think': np.int64(80),
 'hey': np.int64(28),
 'week': np.int64(92),
 'back': np.int64(3),
 'like': np.int64(38),
 'still': np.int64(73),
 'send': np.int64(69),
 'even': np.int64(15),
 'friend': np.int64(19),
 'prize': np.int64(62),
 'claim': np.int64(7),
 'call': np.int64(4),
 'mobil': np.int64(47),
 'co': np.int64(8),
 'home': np.int64(30),
 'want': np.int64(89),
 'today': np.int64(82),
 'cash': np.int64(6),
 'day': np.int64(12),
 'repli': np.int64(64),
 'www': np.int64(96),
 'right': np.int64(65),
 'thank': np.int64(78),
 'take': np.int64(75),
 'time': np.int64(81),
 'use': np.int64(87),
 'messag': np.int64(44),
 'oh': np.int64(55),
 'ye': np.int64(97),
 'make': np.int64(42),
 'way': np.int64(91),
 'feel': np.int64(16),
 'dont': np.int64(14

### N-Grams

If we use the parameter `ngram_range=(1,1)`, then the result will be the same as above.

All features will be unigrams (single words).

If we use the parameter `ngram_range=(1,2)`, the features will be a combination of unigrams and bigrams.
And if we use the parameter `ngram_range=(2,3)`, as in the code below, the features will be a combination of bigrams and trigrams.

In [17]:
## Create the Bag OF Words model with ngram
from sklearn.feature_extraction.text import CountVectorizer
## binary=True enables Binary BOW; ngram_range=(2,3) requests bigrams and trigrams
cv=CountVectorizer(
    max_features=100,
    binary=True,
    ngram_range=(2,3),
)
X=(
    cv
    .fit_transform(corpus)
    .toarray()
)

So below we can see the most frequently occurring bigram and trigram combinations, for example `'call claim': np.int64(3)` (the value is the column index, not the count).

To get combinations of 3 words only (trigrams), use `ngram_range=(3,3)`.

To improve our accuracy we can go on trying (1,1) -> (1,2) -> (1,3) -> (2,2) -> (2,3) -> (3,3) and so on.

In [18]:
# Dictionary of each retained bigram/trigram and its column index
cv.vocabulary_

{'free entri': np.int64(34),
 'claim call': np.int64(18),
 'call claim': np.int64(3),
 'free call': np.int64(33),
 'chanc win': np.int64(17),
 'txt word': np.int64(91),
 'let know': np.int64(55),
 'go home': np.int64(37),
 'pleas call': np.int64(70),
 'lt gt': np.int64(61),
 'want go': np.int64(97),
 'like lt': np.int64(56),
 'like lt gt': np.int64(57),
 'sorri call': np.int64(83),
 'call later': np.int64(11),
 'sorri call later': np.int64(84),
 'ur award': np.int64(92),
 'call custom': np.int64(4),
 'custom servic': np.int64(25),
 'cash prize': np.int64(16),
 'call custom servic': np.int64(5),
 'po box': np.int64(71),
 'tri contact': np.int64(89),
 'draw show': np.int64(30),
 'show prize': np.int64(81),
 'prize guarante': np.int64(75),
 'guarante call': np.int64(43),
 'valid hr': np.int64(95),
 'draw show prize': np.int64(31),
 'show prize guarante': np.int64(82),
 'prize guarante call': np.int64(76),
 'select receiv': np.int64(78),
 'privat account': np.int64(72),
 'account statement

So we can get X below.

In [19]:
# Display the n-gram feature matrix
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0