In [14]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [15]:
df = pd.read_csv('spam.csv',encoding='ISO-8859-1')

In [16]:
print(df.columns)

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [17]:
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [18]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [30]:
df.shape

(5572, 5)

In [19]:
# stopwords and punkt for tokenization
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jagadeesh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jagadeesh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
stop_words = set(stopwords.words('english'))

In [21]:
def preprocess_text(text):
    words = word_tokenize(text.lower())
    return ' '.join([word for word in words if word.isalpha() and word not in stop_words])


df['v2'] = df['v2'].apply(preprocess_text)

In [22]:
# feature extraction
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['v2'])
y = df['v1']

In [23]:
# splitting and training the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
# Train the spam detector using Naive Bayes (you can try other algorithms as well)
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Test the spam detector on the testing set
y_pred = classifier.predict(X_test)



In [25]:
# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)*100
report = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}",'%')
print("Classification Report:\n", report)
print("Confusion Matrix:\n", confusion)


Accuracy: 96.50 %
Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.74      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Confusion Matrix:
 [[965   0]
 [ 39 111]]
