In [14]:
# Import necessary libraries
import numpy as np  # For numerical operations (e.g., array manipulations)
import pandas as pd  # For data manipulation and analysis (e.g., reading CSV files, handling DataFrames)
from sklearn.model_selection import train_test_split  # To split the dataset into training and testing sets
from sklearn.feature_extraction.text import CountVectorizer  # To convert text data into numerical feature vectors
from sklearn.naive_bayes import MultinomialNB  # A Naive Bayes classifier suitable for text classification
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # For evaluating model performance
import string  # To handle punctuation removal
from nltk.corpus import stopwords  # To remove stopwords from text

In [22]:

# Read the SMS spam dataset ('spam.csv'): one row per message, each labeled
# 'ham' (not spam) or 'spam'. The file is decoded as latin-1.
SPAM_CSV_PATH = r"C:\Users\Jagadeep\Downloads\spam.csv"
data = pd.read_csv(SPAM_CSV_PATH, encoding="latin-1")

# Preview the raw frame before any column is dropped or renamed
print("First few rows of the dataset:")
print(data.head(5))

# The raw file carries extra 'Unnamed: N' columns after the first two.
# Keep only the first two columns and give them descriptive names.
data = data.iloc[:, :2]
data.columns = ['label', 'message']


First few rows of the dataset:
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [24]:
# Encode the labels numerically: 'ham' -> 0, 'spam' -> 1.
# Series.map yields NaN for any value that is not a key of the mapping, so a
# plain {'ham': 0, 'spam': 1} mapping would wipe out every label if this cell
# were executed a second time (the column would already hold 0/1). The identity
# entries for 0 and 1 make the cell safe to re-run; the first run is unaffected.
label_codes = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
data['label'] = data['label'].map(label_codes)

In [26]:
# Text preprocessing for the SMS messages:
# 1. Converts text to lowercase
# 2. Removes punctuation
# 3. Removes stopwords (common words like 'the', 'is', etc.)

# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Holds the English stopword set once it has been loaded, so that
# stopwords.words('english') is called a single time instead of once per word.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text, stop_words=None):
    """Normalize one message for bag-of-words vectorization.

    Steps, in order: lowercase the text, delete every character found in
    ``string.punctuation``, split on whitespace, drop stopwords, and re-join
    the remaining words with single spaces.

    Punctuation is removed *before* the stopword check, so a contraction such
    as "don't" is compared against the stopword collection as "dont".

    Parameters
    ----------
    text : str
        The raw message. ``None`` and NaN are treated as a missing message and
        produce an empty string; any other non-string value is converted with
        ``str()`` first.
    stop_words : collection of str, optional
        Words to remove. Defaults to the NLTK English stopword list (loaded
        once and cached). Pass a ``set``/``frozenset`` for fast lookups.

    Returns
    -------
    str
        The cleaned message; empty if nothing is left.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stop_words is None:
        stop_words = _english_stopwords()

    text = str(text).lower()  # Convert text to lowercase
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation
    return ' '.join(word for word in text.split() if word not in stop_words)  # Remove stopwords
# Clean every SMS body: each entry of the 'message' column is replaced by the
# result of running it through preprocess_text.
cleaned_messages = data['message'].apply(preprocess_text)
data['message'] = cleaned_messages

In [28]:
# Build bag-of-words features: CountVectorizer learns a vocabulary from the
# cleaned messages and encodes each message as a vector of word counts.
# Note: the vocabulary is fitted on all messages here, i.e. before the
# train/test split performed later in the notebook.
y = data['label']  # Target variable (0/1 labels)

vectorizer = CountVectorizer()
messages = data['message']
x = vectorizer.fit_transform(messages)  # Feature matrix, one row per message

In [30]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# A fixed random_state makes the shuffle, and therefore the split, repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Train a Multinomial Naive Bayes classifier on the training split.
#
# Rows whose label is missing (NaN) are left out of training rather than being
# filled with 0: filling would silently teach the model that every unlabeled
# message is 'ham' (0). When no label is missing the mask keeps every row, so
# the fitted model is exactly the same as before. Features and labels are
# filtered together so they stay aligned, and filtering an already-filtered
# split is a no-op, so this cell can safely be re-run.
has_label = y_train.notna().to_numpy()
x_train, y_train = x_train[has_label], y_train[has_label]

model = MultinomialNB()
model.fit(x_train, y_train)

In [40]:
# Score the trained model on the held-out test split
y_pred = model.predict(x_test)

# Accuracy = share of test messages classified correctly, shown as a percentage
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("Accuracy:", accuracy_pct)

Accuracy: 97.57847533632287


In [42]:
# Confusion matrix for the test split: counts of true negatives, false
# positives, false negatives and true positives. Each row is a true class
# (0 = ham, 1 = spam) and each column a predicted class.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[951  14]
 [ 13 137]]


In [44]:
# Per-class summary for the test split: precision, recall, F1-score and
# support (number of test rows) for each class, plus overall averages.
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.91      0.91      0.91       150

    accuracy                           0.98      1115
   macro avg       0.95      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115

