### Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Importing Dataset

In [3]:
df = pd.read_csv("C:/Users/kjay1/Downloads/SPAM Classification/SMSSpamCollection.txt", sep='\t',names=['Result','Message'])
df

Unnamed: 0,Result,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Exploratory Data Analysis

In [4]:
df.isnull().sum()

Result     0
Message    0
dtype: int64

In [5]:
df.duplicated().sum()

403

In [6]:
df = df.drop_duplicates(keep = 'first', ignore_index = True)
df

Unnamed: 0,Result,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5164,spam,This is the 2nd time we have tried 2 contact u...
5165,ham,Will ü b going to esplanade fr home?
5166,ham,"Pity, * was in mood for that. So...any other s..."
5167,ham,The guy did some bitching but I acted like i'd...


In [7]:
ps = PorterStemmer()
corpus = []

In [8]:
for i in range(len(df)):
    review = re.sub('[^a-zA-Z]',' ',df['Message'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [9]:
cv = CountVectorizer()
x = cv.fit_transform(corpus).toarray()

In [10]:
y = pd.get_dummies(df['Result'],drop_first=True)
y

Unnamed: 0,spam
0,0
1,0
2,1
3,0
4,0
...,...
5164,1
5165,0
5166,0
5167,0


In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42, stratify=y)

In [12]:
len(x_train), len(x_test), len(y_train), len(y_test)

(3876, 1293, 3876, 1293)

### Model training & Fitting

In [13]:
model = MultinomialNB().fit(x_train, y_train.values.ravel())

In [14]:
y_pred = model.predict(x_test)
y_pred

array([1, 0, 0, ..., 0, 0, 0], dtype=uint8)

### Evaluating Performance Metrics

In [15]:
score = accuracy_score(y_test, y_pred)*100
score

97.06109822119103

In [16]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1130
           1       0.84      0.94      0.89       163

    accuracy                           0.97      1293
   macro avg       0.92      0.96      0.94      1293
weighted avg       0.97      0.97      0.97      1293



In [17]:
print(confusion_matrix(y_test, y_pred))

[[1101   29]
 [   9  154]]
