<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [5]:
import pandas as pd
import numpy as np

In [6]:
df = pd.read_csv("C:/Users/FPTSHOP/OneDrive/Documents/JVB_Training/NLP/data/spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [8]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [9]:
df.shape

(5572, 3)

In [10]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [12]:
X_train.shape

(4457,)

In [13]:
X_test.shape

(1115,)

In [14]:
type(X_train)

pandas.core.series.Series

In [15]:
X_train[:4]

3085    Ok lor. I ned 2 go toa payoh 4 a while 2 retur...
3798    For The First Time In The History 'Need' 'Comf...
3841    HEY MATE! HOWS U HONEY?DID U AVE GOOD HOLIDAY?...
847     My stomach has been thru so much trauma I swea...
Name: Message, dtype: object

In [16]:
type(y_train)

pandas.core.series.Series

In [17]:
y_train[:4]

3085    0
3798    0
3841    0
847     0
Name: spam, dtype: int64

In [18]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7796 sparse matrix of type '<class 'numpy.int64'>'
	with 59134 stored elements in Compressed Sparse Row format>

In [20]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [21]:
X_train_cv.shape

(4457, 7796)

In [22]:
v.get_feature_names_out()[1771]

'chat80155'

In [23]:
v.vocabulary_

{'ok': 4961,
 'lor': 4241,
 'ned': 4766,
 'go': 3193,
 'toa': 6987,
 'payoh': 5177,
 'while': 7548,
 'return': 5820,
 'smth': 6327,
 'wan': 7427,
 'send': 6068,
 'me': 4443,
 'there': 6884,
 'or': 5024,
 'wat': 7452,
 'for': 2969,
 'the': 6866,
 'first': 2899,
 'time': 6952,
 'in': 3669,
 'history': 3469,
 'need': 4767,
 'comfort': 1940,
 'and': 984,
 'luxury': 4306,
 'are': 1072,
 'sold': 6351,
 'at': 1145,
 'same': 5953,
 'price': 5463,
 'india': 3692,
 'onion': 4990,
 'rs': 5895,
 'lt': 4285,
 'gt': 3291,
 'petrol': 5227,
 'beer': 1324,
 'shesil': 6138,
 'hey': 3447,
 'mate': 4416,
 'hows': 3554,
 'honey': 3508,
 'did': 2332,
 'ave': 1183,
 'good': 3216,
 'holiday': 3498,
 'gimmi': 3169,
 'de': 2218,
 'goss': 3229,
 'my': 4706,
 'stomach': 6543,
 'has': 3371,
 'been': 1322,
 'thru': 6930,
 'so': 6342,
 'much': 4674,
 'trauma': 7079,
 'swear': 6700,
 'just': 3905,
 'can': 1662,
 'eat': 2541,
 'better': 1362,
 'lose': 4243,
 'weight': 7504,
 'yeah': 7734,
 'we': 7473,
 'got': 3231,
 '

In [24]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [25]:
np.where(X_train_np[0]!=0)

(array([3193, 4241, 4443, 4766, 4961, 5024, 5177, 5820, 6068, 6327, 6884,
        6987, 7427, 7452, 7548], dtype=int64),)

In [None]:
X_train_np[0][1771]

1

<h3>Train the naive bayes model</h3>

In [None]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [None]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [None]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       976
           1       0.98      0.93      0.95       139

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [None]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [None]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [None]:
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       976
           1       0.98      0.93      0.95       139

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

