# Using the scikit-learn package

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Read the labelled e-mail corpus from disk
data = pd.read_csv('C:/PERSONAL/Studies/DUK/sem 3/AIML/Lab_drive/Spam_filter_NaiveBayes/Dataset/spam_emails.csv')

# Keep only rows that have both a message body and a label
data = data.dropna(subset=['text', 'label'])

# Separate the message text (input) from the class label (target)
X, y = data['text'], data['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Bag-of-words counts: the vocabulary is learned from the training split only,
# and the test split is merely projected onto it
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Fit multinomial Naive Bayes on the training counts (fit() returns the estimator)
model = MultinomialNB().fit(X_train_vectorized, y_train)

# Predict labels for the held-out messages
y_pred = model.predict(X_test_vectorized)

# Report overall accuracy followed by the per-class precision / recall / F1 table
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

Accuracy: 0.99
              precision    recall  f1-score   support

    not spam       0.98      1.00      0.99        49
        spam       1.00      0.98      0.99        51

    accuracy                           0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100



# Naive Bayes from scratch

In [None]:
import csv
import math
from collections import defaultdict

def load_data(filename):
    """Read a two-column CSV of e-mails into a list of ``{'text', 'label'}`` dicts.

    The first column of each row is taken as the message text and the second
    as its label; any further columns are ignored.

    Rows that cannot be used as training examples are skipped instead of
    crashing or polluting the counts:

    * a leading header row whose first two cells are ``text`` / ``label``;
    * blank lines and rows with fewer than two columns;
    * rows whose text or label cell is empty.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        list[dict]: One ``{'text': str, 'label': str}`` dict per usable row,
        in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    data = []
    # newline='' is what the csv module requires so that quoted fields
    # containing line breaks are parsed correctly; utf-8-sig also strips a
    # byte-order mark so the header check below still matches.
    with open(filename, 'r', newline='', encoding='utf-8-sig') as f:
        reader = csv.reader(f)
        for row_number, row in enumerate(reader):
            if len(row) < 2:
                # Blank line or malformed row: there is no label column to read.
                continue
            text, label = row[0], row[1]
            if row_number == 0 and (text.strip().lower(), label.strip().lower()) == ('text', 'label'):
                # Column-name header, not an e-mail.
                continue
            if text == '' or label == '':
                # Missing value: nothing to learn from this row.
                continue
            data.append({'text': text, 'label': label})
    return data

def calculate_priors(data):
    """Estimate the class priors from a list of labelled items.

    An item counts as spam when its ``'label'`` is exactly ``'spam'``; every
    other item counts as not spam.

    Args:
        data: Sequence of dicts that each have a ``'label'`` key.

    Returns:
        tuple[float, float]: ``(prior_spam, prior_not_spam)``, the fraction of
        items in each class.

    Raises:
        ZeroDivisionError: If ``data`` is empty.
    """
    total = len(data)
    n_spam = 0
    for entry in data:
        if entry['label'] == 'spam':
            n_spam += 1
    n_not_spam = total - n_spam
    return n_spam / total, n_not_spam / total

def calculate_likelihoods(data, prior_spam, prior_not_spam):
    """Compute per-class word scores: occurrences divided by documents in the class.

    Each item's text is split on whitespace. Words from items labelled
    ``'spam'`` are tallied for the spam class; words from every other item are
    tallied for the not-spam class. Each tally is then divided by the number
    of items in that class, so a word that repeats within messages can score
    above 1.

    The not-spam denominator is ``len(data) - spam_count``, i.e. it covers the
    same items whose words were tallied (and the same items that
    ``calculate_priors`` treats as not spam). This avoids a division by zero
    when no label is literally ``'not spam'``.

    Args:
        data: Sequence of dicts with ``'text'`` and ``'label'`` keys.
        prior_spam: Unused; kept so existing callers keep working.
        prior_not_spam: Unused; kept so existing callers keep working.

    Returns:
        tuple[defaultdict, defaultdict]: ``(likelihood_spam,
        likelihood_not_spam)``, each mapping word -> score.
    """
    likelihood_spam = defaultdict(int)
    likelihood_not_spam = defaultdict(int)

    # Single pass: tally words and count spam documents at the same time.
    spam_docs = 0
    for item in data:
        is_spam = item['label'] == 'spam'
        if is_spam:
            spam_docs += 1
        counts = likelihood_spam if is_spam else likelihood_not_spam
        for word in item['text'].split():
            counts[word] += 1
    not_spam_docs = len(data) - spam_docs

    # A class with zero documents has an empty tally, so these loops never
    # divide by zero.
    for word in likelihood_spam:
        likelihood_spam[word] /= spam_docs
    for word in likelihood_not_spam:
        likelihood_not_spam[word] /= not_spam_docs
    return likelihood_spam, likelihood_not_spam


def _log_or_neg_inf(value):
    """Return log(value), or -inf when value is not positive (log is undefined there)."""
    return math.log(value) if value > 0 else float('-inf')


def _unseen_word_penalty(doc_count, label):
    """Return the amount subtracted from a class score for one unseen word.

    The penalty is log(number of training documents carrying ``label``). When
    ``doc_count`` is None the documents are counted in the module-level
    ``data`` list, which must then still be the list of dicts produced by
    ``load_data``. A class with no documents yields +inf, which rules the
    class out.
    """
    if doc_count is None:
        doc_count = sum(1 for item in data if item['label'] == label)
    return math.log(doc_count) if doc_count > 0 else float('inf')


def classify(text, prior_spam, prior_not_spam, likelihood_spam, likelihood_not_spam,
             spam_doc_count=None, not_spam_doc_count=None):
    """Label ``text`` as ``'spam'`` or ``'not spam'`` by comparing log scores.

    Each class score starts at the log of its prior. For every
    whitespace-separated word, the log of the word's class score is added when
    the word is known to that class; otherwise log(document count of the
    class) is subtracted.

    Args:
        text: Message to classify.
        prior_spam: Prior probability of the spam class.
        prior_not_spam: Prior probability of the not-spam class.
        likelihood_spam: Mapping word -> score for the spam class.
        likelihood_not_spam: Mapping word -> score for the not-spam class.
        spam_doc_count: Number of spam training documents. When omitted it is
            counted from the module-level ``data`` the first time an unseen
            word is met.
        not_spam_doc_count: Same, for documents labelled ``'not spam'``.

    Returns:
        str: ``'spam'`` if the spam score is strictly greater, else
        ``'not spam'`` (ties go to ``'not spam'``).
    """
    words = text.split()
    # A zero prior or zero score maps to -inf rather than raising a math
    # domain error, so the class is simply never chosen.
    p_spam = _log_or_neg_inf(prior_spam)
    p_not_spam = _log_or_neg_inf(prior_not_spam)

    # The penalties do not depend on the word, so compute each at most once,
    # and only if an unseen word actually occurs.
    spam_penalty = None
    not_spam_penalty = None

    for word in words:
        if word in likelihood_spam:
            p_spam += _log_or_neg_inf(likelihood_spam[word])
        else:
            if spam_penalty is None:
                spam_penalty = _unseen_word_penalty(spam_doc_count, 'spam')
            p_spam -= spam_penalty
        if word in likelihood_not_spam:
            p_not_spam += _log_or_neg_inf(likelihood_not_spam[word])
        else:
            if not_spam_penalty is None:
                not_spam_penalty = _unseen_word_penalty(not_spam_doc_count, 'not spam')
            p_not_spam -= not_spam_penalty
    return 'spam' if p_spam > p_not_spam else 'not spam'

# Parse the labelled e-mails. The list must stay bound to the module-level
# name `data`, because classify() reads that name when it meets an unseen word.
data = load_data('/content/drive/MyDrive/Sem 3/AI ML/Lab/Spam_filter_NaiveBayes/Dataset/spam_emails - spam_emails.csv')

# Fraction of spam / not-spam messages in the corpus
prior_spam, prior_not_spam = calculate_priors(data)

# Per-class word scores
likelihood_spam, likelihood_not_spam = calculate_likelihoods(
    data,
    prior_spam,
    prior_not_spam,
)

# Classify one hand-written message and show the predicted label
test_text = 'This is a test email'
print(
    classify(
        test_text,
        prior_spam,
        prior_not_spam,
        likelihood_spam,
        likelihood_not_spam,
    )
)

spam


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Read the labelled e-mail corpus
data = pd.read_csv('/content/drive/MyDrive/Sem 3/AI ML/Lab/Spam_filter_NaiveBayes/Dataset/spam_emails - spam_emails.csv')

# Report how many values are missing in each column before any cleaning
print("Checking for NaN values in the dataset:")
print(data.isnull().sum())

# Keep only rows that have both a message body and a label.
# (Alternative not used here: keep such rows and substitute '' for a missing
# body via fillna.)
data = data.dropna(subset=['text', 'label'])

# Separate the message text (input) from the class label (target)
X, y = data['text'], data['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Bag-of-words counts: the vocabulary is learned from the training split only,
# and the test split is merely projected onto it
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Fit multinomial Naive Bayes on the training counts (fit() returns the estimator)
model = MultinomialNB().fit(X_train_vectorized, y_train)

# Predict labels for the held-out messages
y_pred = model.predict(X_test_vectorized)

# Report overall accuracy followed by the per-class precision / recall / F1 table
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

Checking for NaN values in the dataset:
text     1
label    0
dtype: int64
Accuracy: 0.99
              precision    recall  f1-score   support

    not spam       0.98      1.00      0.99        49
        spam       1.00      0.98      0.99        51

    accuracy                           0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100

