# NLP Tutorial: Text Representation - Bag Of Words (BOW)


In [47]:
import pandas as pd
import numpy as np

In [48]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [49]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [50]:
df['spam'] = df['Category'].apply(lambda x: 1 if x=='spam'  else 0)

In [51]:
df.shape

(5572, 3)

In [52]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [53]:
# !pip3 install scikit-learn

In [54]:
# Train test split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2,)


In [55]:
X_train.shape

(4457,)

In [56]:
X_test.shape

(1115,)

In [57]:
type(X_train)


pandas.core.series.Series

In [58]:
X_train[:4]


3778    Mila, age23, blonde, new in UK. I look sex wit...
192     I'm sorry. I've joined the league of people th...
635     Dear Voucher Holder, 2 claim this weeks offer,...
3019         I didn't get the second half of that message
Name: Message, dtype: object

In [59]:
type(X_train.values)


numpy.ndarray

# Create bag of words representation using CountVectorizer


In [60]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7687 sparse matrix of type '<class 'numpy.int64'>'
	with 59185 stored elements in Compressed Sparse Row format>

In [61]:
X_train_cv.toarray()[:2][0]


array([0, 0, 0, ..., 0, 0, 0])

In [62]:
X_train_cv.shape


(4457, 7687)

In [63]:
v.get_feature_names_out()[1000:1050]

array(['anyone', 'anyones', 'anyplaces', 'anythiing', 'anythin',
       'anything', 'anythingtomorrow', 'anytime', 'anyway', 'anyways',
       'anywhere', 'aom', 'apart', 'apartment', 'aphex', 'apnt', 'apo',
       'apologetic', 'apologise', 'apologize', 'apology', 'app',
       'apparently', 'appeal', 'appear', 'appendix', 'applebees',
       'apples', 'application', 'apply', 'applying', 'appointment',
       'appointments', 'appreciate', 'appreciated', 'approaches',
       'approaching', 'appropriate', 'approved', 'approx', 'apps', 'appt',
       'april', 'aproach', 'apt', 'aptitude', 'aquarius', 'ar', 'arab',
       'arabian'], dtype=object)

In [64]:
v.get_feature_names_out().shape

(7687,)

In [65]:
dir(v)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_validate_ngram_range',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',

In [66]:
v.vocabulary_

{'mila': 4433,
 'age23': 860,
 'blonde': 1395,
 'new': 4713,
 'in': 3622,
 'uk': 7061,
 'look': 4157,
 'sex': 5986,
 'with': 7493,
 'guys': 3272,
 'if': 3582,
 'like': 4074,
 'fun': 3025,
 'me': 4366,
 'text': 6723,
 'mtalk': 4580,
 'to': 6872,
 '69866': 583,
 '18': 310,
 '30pp': 426,
 'txt': 7034,
 '1st': 327,
 '5free': 552,
 '50': 530,
 'increments': 3639,
 'help08718728876': 3386,
 'sorry': 6272,
 've': 7203,
 'joined': 3808,
 'the': 6750,
 'league': 4018,
 'of': 4847,
 'people': 5102,
 'that': 6746,
 'dont': 2395,
 'keep': 3878,
 'touch': 6935,
 'you': 7651,
 'mean': 4369,
 'great': 3220,
 'deal': 2180,
 'have': 3343,
 'been': 1296,
 'friend': 2985,
 'at': 1122,
 'all': 910,
 'times': 6839,
 'even': 2636,
 'personal': 5121,
 'cost': 2011,
 'do': 2359,
 'week': 7382,
 'dear': 2184,
 'voucher': 7269,
 'holder': 3448,
 'claim': 1812,
 'this': 6792,
 'weeks': 7386,
 'offer': 4853,
 'your': 7657,
 'pc': 5085,
 'go': 3146,
 'http': 3517,
 'www': 7586,
 'tlp': 6863,
 'co': 1858,
 'express

In [67]:
X_train_np = X_train_cv.toarray()
X_train_np[:4]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [68]:
np.where(X_train_np[0]!=0)

(array([ 310,  327,  426,  530,  552,  583,  860, 1395, 3025, 3272, 3386,
        3582, 3622, 3639, 4074, 4157, 4366, 4433, 4580, 4713, 5986, 6723,
        6872, 7034, 7061, 7493]),)

In [69]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

model.fit(X_train_cv, y_train)


In [70]:
X_test_cv =v.transform(X_test)
print(X_test_cv)

  (0, 2474)	1
  (0, 3651)	1
  (0, 6872)	1
  (0, 7316)	1
  (0, 7340)	1
  (1, 2800)	1
  (1, 3159)	1
  (1, 3343)	1
  (1, 3622)	2
  (1, 4439)	1
  (1, 4800)	1
  (1, 7537)	1
  (2, 961)	1
  (2, 983)	1
  (2, 1122)	2
  (2, 1200)	1
  (2, 1273)	1
  (2, 1289)	1
  (2, 2647)	1
  (2, 3101)	1
  (2, 3187)	1
  (2, 3343)	1
  (2, 3493)	1
  (2, 3622)	1
  (2, 3734)	1
  :	:
  (1112, 2835)	1
  (1112, 3346)	1
  (1112, 3393)	2
  (1112, 3422)	1
  (1112, 4168)	1
  (1112, 4366)	1
  (1112, 4621)	1
  (1112, 4894)	1
  (1112, 6473)	1
  (1112, 6547)	1
  (1112, 6692)	1
  (1112, 6793)	1
  (1112, 7638)	1
  (1113, 1639)	1
  (1113, 4889)	1
  (1113, 5751)	1
  (1113, 6226)	1
  (1113, 6750)	1
  (1113, 7034)	1
  (1114, 4873)	3
  (1114, 5186)	1
  (1114, 6763)	1
  (1114, 6879)	1
  (1114, 7141)	1
  (1114, 7419)	1


In [71]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_cv)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       0.99      0.91      0.94       160

    accuracy                           0.98      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115

