In [70]:
# Importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [71]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [72]:
# Fetch every NLTK resource this notebook uses so it also runs on a fresh
# environment: 'stopwords' backs stopwords.words('english'), 'wordnet' backs
# WordNetLemmatizer, and 'omw-1.4' was already being downloaded here.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\conne\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [73]:
# Load the tab-separated dataset. Column names are supplied explicitly, so
# pandas treats the first line of the file as data rather than as a header.
data = pd.read_csv(
    'SpamDataset',
    sep='\t',
    names=['result', 'text'],
)

In [74]:
# Preview the first five rows to confirm both columns parsed as expected.
data.head(n=5)

Unnamed: 0,result,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [75]:
# (rows, columns) of the loaded frame.
data.shape

(5572, 2)

In [76]:
# List the distinct class labels present in the 'result' column.
print(data['result'].unique())

['ham' 'spam']


In [77]:
# True if any cell is missing: the first .any() reduces per column,
# the second reduces across the per-column results.
data.isna().any().any()

False

In [78]:
# Start with an empty corpus of cleaned messages.
corpus = list()
# One lemmatizer instance, shared by every token that gets processed.
lemmatizer = WordNetLemmatizer()

In [79]:
# Clean every message: replace non-letters with spaces, lowercase, split into
# tokens, drop English stopwords, lemmatize what remains, and re-join the
# tokens into a single space-separated string.
#
# The stopword set is built once up front instead of once per token, and the
# corpus is rebuilt from scratch so re-running this cell does not append
# duplicate entries. Iterating over the column values (rather than looking
# rows up by a positional counter) also keeps this correct if the frame's
# index is not 0..n-1.
stop_words = set(stopwords.words('english'))
corpus = []
for message in data['text']:
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    kept = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(kept))

In [80]:
# Spot-check the first five cleaned messages.
print(corpus[:5])

['go jurong point crazy available bugis n great world la e buffet cine got amore wat', 'ok lar joking wif u oni', 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply', 'u dun say early hor u c already say', 'nah think go usf life around though']


In [81]:
# Show the first five raw messages for comparison with the cleaned text.
print(data['text'][:5])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: text, dtype: object


In [82]:
# Build TF-IDF features from the cleaned corpus

from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()
# Learn the vocabulary and weights, then vectorize the same corpus.
tfidf_matrix = cv.fit_transform(corpus)
# .toarray() materializes the full dense (n_messages, n_terms) array.
X = tfidf_matrix.toarray()

In [83]:
# (number of messages, number of TF-IDF features).
X.shape

(5572, 7098)

In [84]:
# Peek at the first five feature rows.
print(X[:5])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [85]:
# Encode the target as integers: 1 for 'spam', 0 for 'ham'.
# Comparing against the label directly avoids depending on the column order
# produced by pd.get_dummies and on its output dtype (boolean in pandas >= 2.0),
# so y is always a 0/1 integer array.
y = (data['result'] == 'spam').astype(int).values

In [86]:
print(y[:5])

[0 0 1 0 0]


In [87]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [88]:
# Training model using Random Forest classifier

from sklearn.ensemble import RandomForestClassifier

# Seed the forest (same seed as the train/test split) so that repeated runs
# grow the same trees and report the same metrics.
model = RandomForestClassifier(criterion='entropy', random_state=0)
model.fit(X_train, y_train)

In [89]:
# Predict labels for the held-out test split.
y_pred = model.predict(X_test)

In [90]:
# First five predicted labels.
print(y_pred[:5])

[0 1 0 0 0]


In [91]:
# First five true labels of the test split, for comparison with the predictions.
print(y_test[:5])

[0 1 0 0 0]


In [92]:
from sklearn.metrics import confusion_matrix

# Tabulate the true test labels against the model's predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[955   0]
 [ 20 140]]


In [93]:
from sklearn.metrics import accuracy_score

# Share of test messages whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

0.9820627802690582
