### Text Representation : Bag of Words

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv(r"D:\nlp-text files\spam.csv")

In [3]:
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [8]:
df['spam'] = df['Category'].apply(lambda x:1 if x == 'spam' else 0)

In [9]:
df['spam'].value_counts()

0    4825
1     747
Name: spam, dtype: int64

In [10]:
df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


### Train Test Split

In [11]:
from sklearn.model_selection import train_test_split

In [13]:
X_train,X_test,y_train,y_test = train_test_split(df.Message,df.spam,test_size = 0.2)

In [14]:
X_train.shape

(4457,)

In [15]:
X_test.shape

(1115,)

### Creating a bag of words representation using count vectorizers

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
v = CountVectorizer()

In [18]:
X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7793 sparse matrix of type '<class 'numpy.int64'>'
	with 59691 stored elements in Compressed Sparse Row format>

In [23]:
X_train_np = X_train_cv.toarray()

In [26]:
np.where(X_train_np[0]!=0)

(array([   0,  306,  317,  685, 1743, 1841, 2105, 2361, 2875, 4325, 4479,
        4580, 4878, 4929, 4935, 5315, 5964, 6605, 6839, 6983, 6990, 7253,
        7384, 7658, 7739], dtype=int64),)

###  Train the Naive Bayes Model

In [27]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [28]:
X_test_cv = v.transform(X_test)

###  Evaluating the Model

In [29]:
#When we are dealing with imbalanced dataset we have to go with classification report to know about other metrics
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       981
           1       0.96      0.90      0.93       134

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [30]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)