In [3]:
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import pickle

# Read the corpus and tidy it in one chain: drop the 'Unnamed: 0' column
# (presumably an index written out by an earlier to_csv — confirm), then
# remove duplicate rows and any row that contains a missing value.
df = (
    pd.read_csv('clean.csv')
    .drop(columns=['Unnamed: 0'])
    .drop_duplicates()
    .dropna()
)

# Features are the message texts; targets are the category strings.
X, y = df['Message'], df['Category']

# LabelEncoder numbers classes in sorted order, so with exactly the labels
# 'ham' and 'spam' this yields ham -> 0 and spam -> 1.
# NOTE(review): the label values stored in clean.csv are not visible here — confirm.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# The preprocessing below only reads lemma_, is_stop and is_alpha from each
# token, so the dependency parser and entity recogniser are switched off.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Function for tokenization and lemmatization
def tokenize_lemmatize(text):
    """Reduce ``text`` to a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words and tokens that are not purely alphabetic are
    skipped; the ``lemma_`` of every remaining token is kept in order.

    Args:
        text: The message to normalise.

    Returns:
        The kept lemmas joined by single spaces (an empty string when no
        token survives the filter).
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Skip stop words and anything that is not purely alphabetic.
        if tok.is_stop or not tok.is_alpha:
            continue
        kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)

# Normalise every message once, outside the pipeline, so the vectoriser
# receives strings of space-joined lemmas.
X_processed = X.apply(tokenize_lemmatize)

# Bag-of-words counts feeding a multinomial Naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Score with 5-fold cross-validation and report the mean accuracy over the folds.
cv_scores = cross_val_score(pipeline, X_processed, y, scoring='accuracy', cv=5)
print(f'Cross-validated Accuracy: {cv_scores.mean():.4f}')

# Refit on all of the data; this fitted pipeline is the object saved below.
pipeline.fit(X_processed, y)

# Persist the fitted pipeline. Only the vectoriser and classifier are stored,
# so text must go through tokenize_lemmatize before being passed to predict().
with open('naive_bayes_model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

# Function to evaluate metrics after cross-validation
def evaluate_metrics(model, X, y):
    """Score ``model`` on ``X`` against the true labels ``y``.

    Args:
        model: A fitted estimator exposing ``predict``.
        X: Inputs passed unchanged to ``model.predict``.
        y: True labels to compare the predictions with.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)``. Precision, recall
        and F1 are computed with label ``1`` as the positive class (assumed
        to be 'spam').
    """
    predictions = model.predict(X)
    overall_accuracy = accuracy_score(y, predictions)
    # The three positive-class metrics share one call shape, so apply them in turn.
    spam_precision, spam_recall, spam_f1 = (
        scorer(y, predictions, pos_label=1)
        for scorer in (precision_score, recall_score, f1_score)
    )
    return overall_accuracy, spam_precision, spam_recall, spam_f1

# Score the fitted pipeline on the same data it was trained on. These are
# in-sample figures, so they are expected to look more favourable than the
# cross-validated accuracy.
accuracy, precision, recall, f1 = evaluate_metrics(pipeline, X_processed, y)
for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1-score', f1),
):
    print(f'{metric_name}: {metric_value:.4f}')


Cross-validated Accuracy: 0.9318
Accuracy: 0.9762
Precision: 0.9710
Recall: 0.9760
F1-score: 0.9735


In [4]:
import pandas as pd
import numpy as np
import spacy
import pickle

# Restore the fitted pipeline from disk.
# NOTE: unpickling can execute arbitrary code — only load pickle files you
# created yourself or otherwise trust.
with open('naive_bayes_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# spaCy pipeline used for preprocessing, with the parser and NER disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Integer class codes -> readable category names.
# NOTE(review): hard-coded; presumably mirrors the encoding used when the
# model was trained — confirm.
label_map = {
    0: 'ham',
    1: 'spam',
}

# Everyday, non-promotional messages used as a quick sanity check.
normal_messages = [
    "Hey! How are you?",
    "Reminder: Tomorrow is the deadline for submitting the report.",
    "I'll be there by 5 PM.",
    "What time are we meeting today?",
    "Did you get my email?",
    "Let's catch up for coffee this weekend.",
    "Please review the document and let me know your feedback.",
    "Have a great day!",
    "See you soon.",
]

def tokenize_lemmatize(text):
    """Reduce ``text`` to a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words and tokens that are not purely alphabetic are
    skipped; the ``lemma_`` of every remaining token is kept in order.

    Args:
        text: The message to normalise.

    Returns:
        The kept lemmas joined by single spaces (an empty string when no
        token survives the filter).
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Skip stop words and anything that is not purely alphabetic.
        if tok.is_stop or not tok.is_alpha:
            continue
        kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)

# Preprocess the sample texts with tokenize_lemmatize before handing them to the model.
normal_messages_processed = pd.Series(normal_messages).apply(tokenize_lemmatize)

# The model returns integer class codes; translate them to category names.
predicted_labels = model.predict(normal_messages_processed)
predicted_categories = [label_map[code] for code in predicted_labels]

# Show each original (unprocessed) message next to its predicted category.
for idx, message in enumerate(normal_messages):
    print(f"Message: {message} ==> Predicted Category: {predicted_categories[idx]}")


Message: Hey! How are you? ==> Predicted Category: ham
Message: Reminder: Tomorrow is the deadline for submitting the report. ==> Predicted Category: ham
Message: I'll be there by 5 PM. ==> Predicted Category: spam
Message: What time are we meeting today? ==> Predicted Category: ham
Message: Did you get my email? ==> Predicted Category: ham
Message: Let's catch up for coffee this weekend. ==> Predicted Category: ham
Message: Please review the document and let me know your feedback. ==> Predicted Category: ham
Message: Have a great day! ==> Predicted Category: ham
Message: See you soon. ==> Predicted Category: ham
