In [1]:
import pandas as pd
import numpy as np

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

import pickle 
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('spam_ham_dataset.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [4]:
df.drop(['Unnamed: 0', 'label'], axis=1, inplace=True)

In [5]:
df.rename({'label_num': 'spam'}, inplace=True, axis=1)

In [6]:
df.head()

Unnamed: 0,text,spam
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


In [7]:
df.shape

(5171, 2)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 2 columns):
text    5171 non-null object
spam    5171 non-null int64
dtypes: int64(1), object(1)
memory usage: 80.9+ KB


## Text Cleaning

In [9]:
corpus = []

for review in df['text']:
    review = review[9:]  # Remove Subject: 
    
    # Punctuation
    review = re.sub(pattern='[^A-Za-z]', 
                    repl=' ', 
                    string=review)
    review = review.lower()
    review = review.split()
    
    # StopWords
    all_stopwords = set(stopwords.words('english'))
    all_stopwords.remove('not')
    
    # Stemming
    ps = PorterStemmer()
    
    review = [ps.stem(word) for word in review if word not in all_stopwords]
    review = ' '.join(review)
    
    corpus.append(review)

In [10]:
corpus[0]

'enron methanol meter follow note gave monday preliminari flow data provid daren pleas overrid pop daili volum present zero reflect daili activ obtain ga control chang need asap econom purpos'

## Model Making

### Bag of Model

In [11]:
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus)

In [12]:
X

<5171x1500 sparse matrix of type '<class 'numpy.int64'>'
	with 201161 stored elements in Compressed Sparse Row format>

In [13]:
X = X.toarray()

In [14]:
X

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
Y = df['spam']

### Train Test Split

In [16]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8, test_size=0.2, random_state=0)

### Naive Bayes 

In [17]:
classifier = MultinomialNB()
classifier.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
y_predictions = classifier.predict(x_test)

### Model Evaluaion

In [19]:
print(f'Accuracy: {accuracy_score(y_test, y_predictions)*100}%\n')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_predictions)}\n')
print(classification_report(y_test, y_predictions))

Accuracy: 93.04347826086956%

Confusion Matrix: 
[[688  44]
 [ 28 275]]

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       732
           1       0.86      0.91      0.88       303

   micro avg       0.93      0.93      0.93      1035
   macro avg       0.91      0.92      0.92      1035
weighted avg       0.93      0.93      0.93      1035



In [20]:
# <<--- K fold cross validation --->>

accuracies = cross_val_score(estimator = classifier, X = x_train, y = y_train, cv = 10) 

print("Average Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Average Accuracy: 93.96 %
Standard Deviation: 0.96 %


## Save Model

In [22]:
with open('Model.txt','wb') as f:
    pickle.dump(classifier, f)