### Bag of words

In [12]:
import numpy as np
import pandas as pd


In [15]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
# Check class distribution
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

## preprocessing the dataset

In [17]:
# transforming the spam and ham labels to 0 and 1 using lambda function
df['spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)



In [18]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [20]:
df.shape

(5572, 3)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)


(4457,)
(1115,)


In [21]:
X_train.shape

(4457,)

In [22]:
X_test.shape

(1115,)

In [23]:
type(X_train)

pandas.core.series.Series

In [25]:
X_train[:4]

2838                              1's reach home call me.
5371               Lol, oh you got a friend for the dog ?
2545    She is our sister.. She belongs 2 our family.....
2112    Yar he quite clever but aft many guesses lor. ...
Name: Message, dtype: object

In [24]:
type(y_train)


pandas.core.series.Series

In [26]:
y_train[:4]

2838    0
5371    0
2545    0
2112    0
Name: spam, dtype: int64

In [31]:
type(y_train.values)

numpy.ndarray

# Importing CountVectorizer from sklearn

In [32]:
# import CountVectorizer to convert text to a matrix of token counts or a bag of words model
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()
X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59653 stored elements and shape (4457, 7739)>

In [33]:
X_train_cv.toarray()[:4]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [34]:
X_train_cv.shape

(4457, 7739)

In [38]:
v.get_feature_names_out()[1000:1050]

array(['anyone', 'anyones', 'anyplaces', 'anythin', 'anything',
       'anythingtomorrow', 'anytime', 'anyway', 'anyways', 'anywhere',
       'aom', 'apart', 'apartment', 'apes', 'apeshit', 'aphex', 'apnt',
       'apo', 'apologetic', 'apologise', 'apologize', 'apology', 'app',
       'apparently', 'appeal', 'appear', 'appendix', 'applausestore',
       'applebees', 'apples', 'application', 'apply', 'applyed',
       'applying', 'appointment', 'appreciate', 'appreciated',
       'approaches', 'approaching', 'approve', 'approx', 'apps', 'appt',
       'appy', 'april', 'aproach', 'apt', 'aptitude', 'aquarius', 'ar'],
      dtype=object)

In [39]:
v.get_feature_names_out().shape

(7739,)

In [40]:
dir(v)

['_CountVectorizer__metadata_request__fit',
 '_CountVectorizer__metadata_request__transform',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__sklearn_tags__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_params_html',
 '_html_repr',
 '_limit_features',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr

In [42]:
v.vocabulary_

{'reach': 5568,
 'home': 3466,
 'call': 1609,
 'me': 4404,
 'lol': 4185,
 'oh': 4900,
 'you': 7699,
 'got': 3196,
 'friend': 3003,
 'for': 2939,
 'the': 6800,
 'dog': 2386,
 'she': 6056,
 'is': 3744,
 'our': 4993,
 'sister': 6171,
 'belongs': 1324,
 'family': 2766,
 'hope': 3484,
 'of': 4879,
 'tomorrow': 6949,
 'pray': 5355,
 'her': 3401,
 'who': 7486,
 'was': 7381,
 'fated': 2785,
 'shoranur': 6093,
 'train': 6999,
 'incident': 3641,
 'lets': 4086,
 'hold': 3457,
 'hands': 3310,
 'together': 6932,
 'amp': 951,
 'fuelled': 3031,
 'by': 1593,
 'love': 4226,
 'concern': 1941,
 'prior': 5401,
 'grief': 3243,
 'pain': 5037,
 'pls': 5245,
 'join': 3838,
 'in': 3636,
 'dis': 2335,
 'chain': 1716,
 'pass': 5085,
 'it': 3755,
 'stop': 6479,
 'violence': 7288,
 'against': 862,
 'women': 7568,
 'yar': 7663,
 'he': 3360,
 'quite': 5513,
 'clever': 1834,
 'but': 1578,
 'aft': 854,
 'many': 4350,
 'guesses': 3267,
 'lor': 4205,
 'ask': 1096,
 'bring': 1514,
 'thk': 6841,
 'darren': 2160,
 'not': 4

In [47]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [46]:
np.where(X_train_np[0]!=0)

(array([1609, 3466, 4404, 5568]),)

### Training a model using Naive Bayes classifier

In [48]:
# training a model using Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_cv, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [49]:
X_test_cv = v.transform(X_test)


In [53]:
from sklearn.metrics import classification_report
y_pred =  model.predict(X_test_cv)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       967
           1       0.99      0.91      0.95       148

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [54]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

Train the model using sklearn pipeline and reduce number of lines of code

In [60]:
from sklearn.pipeline import Pipeline
clf = Pipeline([('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])


In [61]:
clf.fit(X_train, y_train)

0,1,2
,steps,"[('vectorizer', ...), ('nb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [62]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       967
           1       0.99      0.91      0.95       148

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

