In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

  from pandas.core import (


## Step 1 : Load the data

In [2]:
# Link = https://www.kaggle.com/search?q=spam+classification+in%3Adatasets
messages = pd.read_csv("Data/email.csv")

In [3]:
messages.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
messages = messages[:5571]

In [5]:
messages.isnull().sum()

Category    0
Message     0
dtype: int64

## Step 2 : Text Preprocessing 

In [6]:
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stemming = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hardiksharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
corpus = []
for i in range(0,len(messages)):
    # REMOVING ANY SPECIAL CHARACTERS
    review = re.sub('[^a-zA-Z]',' ', messages['Message'][i]) 
    
    # CONVERTING TO A LOWERCASE AND SPLITTING
    review = review.lower()                                  
    review = review.split()
    
    # STEMMING/LEMMATIZATION 
    review = [stemming.stem(word) for word in review if word not in set(stopwords.words('english'))] 
    review = " ".join(review)
    corpus.append(review)

In [8]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather

## Step 3 : Train/Test Split

In [9]:
y = messages["Category"].map({'ham': 0, 'spam': 1}).astype(int)

In [10]:
len(corpus),y.shape

(5571, (5571,))

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(corpus,y,test_size=0.25)

In [12]:
X_train, X_test, y_train, y_test

(['well done costa del sol holiday await collect call toclaim sae tc pobox stockport sk xh cost pm max min',
  'ee msg na poortiyagi odalebeku hanumanji name hanuman bajarangabali maruti pavanaputra sankatmochan ramaduth mahav ee name lt gt janarig ivatt kalisidar next saturday olag ondu good news keluviri maretar ind dodda problum nalli siguviri idu matra lt gt true neglet',
  'still attend da talk',
  'manag puzzel',
  'rightio well arent bright earli morn',
  'world famamu',
  'ye fine',
  'want new nokia colour phone deliveredtomorrow free minut mobil free text free camcord repli call',
  'attend noth',
  'cool come like lt gt ish',
  'good even',
  'thk em find wtc far weiyi goin e rest dunno yet r ur goin dinner den might b abl join',
  'realli get hang around',
  'tuesday night r u real',
  'alex say ok ok',
  'freemsg award free mini digit camera repli snap collect prize quizclub opt stop p wk sp rwm ph',
  'blanket suffici thx',
  'talk',
  'creepi crazi',
  'six chanc win cas

## Step 4 : Apply the N-gram Bag of Words / TF-IDF

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 2500, ngram_range = (1,3)) # MAX FEATURES = MAXIMUM VOCABULARY SIZE 

# from sklearn.feature_extraction.text import TfidfVectorizer
# tf_idf = TfidfVectorizer(max_features=2500, ngram_range=(1,2))

In [14]:
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

# X_train = tf_idf.fit_transform(X_train).toarray()
# X_test = tf_idf.transform(X_test).toarray()

In [15]:
print(cv.vocabulary_)
#print(tf_idf.vocabulary_)

{'well': 2377, 'done': 554, 'costa': 442, 'del': 515, 'sol': 1932, 'holiday': 945, 'await': 108, 'collect': 377, 'call': 230, 'toclaim': 2144, 'sae': 1785, 'tc': 2070, 'pobox': 1613, 'stockport': 1997, 'sk': 1908, 'xh': 2458, 'cost': 438, 'pm': 1604, 'max': 1275, 'min': 1305, 'well done': 2378, 'costa del': 443, 'del sol': 516, 'sol holiday': 1933, 'holiday await': 946, 'await collect': 109, 'collect call': 378, 'call toclaim': 258, 'toclaim sae': 2145, 'sae tc': 1787, 'tc pobox': 2071, 'pobox stockport': 1614, 'stockport sk': 1998, 'sk xh': 1909, 'xh cost': 2459, 'cost pm': 439, 'pm max': 1608, 'max min': 1276, 'costa del sol': 444, 'del sol holiday': 517, 'sol holiday await': 1934, 'holiday await collect': 947, 'await collect call': 110, 'collect call toclaim': 379, 'call toclaim sae': 259, 'toclaim sae tc': 2146, 'sae tc pobox': 1788, 'tc pobox stockport': 2072, 'pobox stockport sk': 1615, 'stockport sk xh': 1999, 'sk xh cost': 1910, 'xh cost pm': 2460, 'cost pm max': 440, 'pm max m

## Step 5 : Load the Naive Bayes Model

In [16]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)

## Step 6 : Performance Metrics

In [17]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(score)
print(cm)
print(f1)
print(classification_report(y_test, y_pred))

0.9827709978463748
[[1198    7]
 [  17  171]]
0.9344262295081968
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1205
           1       0.96      0.91      0.93       188

    accuracy                           0.98      1393
   macro avg       0.97      0.95      0.96      1393
weighted avg       0.98      0.98      0.98      1393

