In [1]:
import pandas as pd
import numpy as np

In [3]:
## Load the spam/ham text-message dataset from a local CSV and preview the first rows.
## NOTE(review): this is an absolute, machine-specific Windows path, so the notebook
## cannot be re-run on another machine without editing it; consider a configurable
## data directory instead.
df = pd.read_csv(r'D:\NLP - Self\Dataset\SPAM text message.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
## (number of rows, number of columns) of the loaded frame.
df.shape

(5572, 2)

In [5]:
## data cleaning and preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

In [7]:
## Raw message text as a Series (the 'Category' column holds the ham/spam label).
messages = df['Message']
messages

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

### Model using PorterStemmer

In [6]:
## Stemmer applied to every token when the stemmed corpus is built below.
porter = PorterStemmer()

In [8]:
## Build the stemmed corpus: one cleaned, space-joined string of stems per message.

## Materialise the English stop-word set once. Rebuilding it inside the list
## comprehension re-reads the stop-word list for every single token.
stop_words = set(stopwords.words('english'))

corpus = []

## Iterate over the values directly rather than messages[i]: integer indexing on a
## Series is a label lookup and only works while the index is exactly 0..n-1.
for message in messages:
    review = re.sub('[^a-zA-Z]', ' ', message)  ## replace everything but letters with a space
    review = review.lower().split()
    review = [porter.stem(word) for word in review if word not in stop_words]
    corpus.append(' '.join(review))

In [10]:
## creating a Bag of words Model
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
## Bag-of-words vectorizer with default settings (no cap on the vocabulary size).
cv = CountVectorizer()

In [12]:
## Learn the vocabulary from the stemmed corpus and build the document-term count
## matrix (one row per message, one column per vocabulary term).
## .toarray() converts the sparse result into a dense NumPy array.
X = cv.fit_transform(corpus).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
## (number of messages, vocabulary size) for the uncapped vectorizer.
X.shape

(5572, 6296)

In [14]:
## The uncapped vocabulary built from `corpus` has 6296 distinct terms (see X.shape above).
## Not all of them are needed as features: the rationale here is that many of these
## terms appear only rarely in the messages.
## CountVectorizer(max_features=2500) therefore keeps only the 2500 terms with the
## highest frequency across the corpus.

cv = CountVectorizer(max_features=2500)

In [16]:
## Re-fit on the stemmed corpus with the capped vocabulary.
## NOTE: this rebinds X, replacing the uncapped matrix built earlier.

X = cv.fit_transform(corpus).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
## Confirm the feature count is now capped at max_features.
X.shape

(5572, 2500)

In [19]:
## Encode the target as integers: ham -> 0, spam -> 1.
## An explicit mapping plus an int64 cast replaces the two chained replace() calls.
## replace() only yielded an integer column through pandas' silent object -> int
## downcasting, which is deprecated; without it the column stays object dtype and
## `df['Category'].values` becomes an object array that scikit-learn classifiers reject.
## The identity entries (0 -> 0, 1 -> 1) keep this cell safe to re-run once the column
## is already encoded, as the original replace() calls were.
## Any unexpected label maps to NaN, so the cast fails loudly instead of passing
## a bad label on to the model.
label_codes = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
df['Category'] = df['Category'].map(label_codes).astype('int64')

In [20]:
## Check that the labels are now numeric (ham = 0, spam = 1).
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [23]:
## Target vector: the encoded 0/1 labels as a NumPy array.
y = df['Category'].values

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
## Hold out 30% of the messages for testing; random_state fixes the shuffle so the
## split is reproducible.
## NOTE(review): the CountVectorizer vocabulary was fitted on the full corpus before
## this split, so the test messages influenced which terms became features.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [27]:
## Report the shape of each split, in the order train/test features then train/test labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(3900, 2500)
(1672, 2500)
(3900,)
(1672,)


In [29]:
from sklearn.naive_bayes import MultinomialNB

In [30]:
## Fit a multinomial Naive Bayes classifier on the stemmed training counts.
model = MultinomialNB().fit(X_train,y_train)

In [31]:
## Predicted 0/1 labels for the held-out messages.
y_pred = model.predict(X_test)

#### Accuracy Score

In [32]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [33]:
## Rows are the true class, columns the predicted class (label order 0 = ham, 1 = spam).
confusion_matrix(y_test,y_pred)

array([[1437,   14],
       [  12,  209]], dtype=int64)

In [34]:
## Fraction of held-out messages classified correctly by the stemmed model.
accuracy_score(y_test,y_pred)

0.9844497607655502

### Model using WordNetLemmatizer

In [35]:
## Lemmatizer used in place of the Porter stemmer for the second model.
lemmatizer = WordNetLemmatizer()

In [36]:
## Build the lemmatized corpus: one cleaned, space-joined string of lemmas per message.

## Materialise the English stop-word set once. Calling stopwords.words('english')
## inside the comprehension re-reads the list for every token and then scans it
## linearly; a set built up front gives constant-time membership tests.
stop_words = set(stopwords.words('english'))

corpus_lem = []

## Iterate over the values directly rather than messages[i]: integer indexing on a
## Series is a label lookup and only works while the index is exactly 0..n-1.
for message in messages:
    review = re.sub('[^a-zA-Z]', ' ', message)  ## replace everything but letters with a space
    review = review.lower().split()
    ## lemmatize() is called without a part-of-speech tag, so every token is
    ## treated with NLTK's default POS.
    review = [lemmatizer.lemmatize(word) for word in review if word not in stop_words]
    corpus_lem.append(' '.join(review))

In [38]:
## creating Bag of Words model
cv = CountVectorizer(max_features=2500)

In [39]:
## Document-term count matrix for the lemmatized corpus.
## NOTE: this rebinds X, replacing the stemmed feature matrix.
X = cv.fit_transform(corpus_lem).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [40]:
## `y` was already created above and does not depend on the text preprocessing.
## Split the lemmatized features into train and test sets, using the same test_size
## and random_state as for the stemmed model.

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [41]:
## Report the shape of each split, in the order train/test features then train/test labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(3900, 2500)
(1672, 2500)
(3900,)
(1672,)


In [44]:
## Fit a second multinomial Naive Bayes classifier, this time on the lemmatized counts.
model_lem = MultinomialNB().fit(X_train,y_train)

In [45]:
## Predicted 0/1 labels from the lemmatized model.
## NOTE: this rebinds y_pred, so the stemmed model's predictions are no longer available.
y_pred = model_lem.predict(X_test)

In [46]:
## Evaluate the lemmatized model.
## Rows are the true class, columns the predicted class (label order 0 = ham, 1 = spam).

confusion_matrix(y_test,y_pred)

array([[1439,   12],
       [  14,  207]], dtype=int64)

In [47]:
## Fraction of held-out messages classified correctly by the lemmatized model.
accuracy_score(y_test,y_pred)

0.9844497607655502