In [7]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import joblib


# Words removed by process(). Built once at definition time rather than on
# every call, since process() is applied to each row of the dataset.
_STOP_WORDS = frozenset(['a', 'an', 'the', 'in', 'on', 'at', 'to',
                         'from', 'by', 'for', 'of', 'was', 'were', 'is', 'am'])


def process(text):
    """Normalise raw e-mail text into a space-separated string of tokens.

    The text is lower-cased, every character that is neither alphanumeric
    nor whitespace is dropped (so "don't" becomes "dont"), the result is
    split on whitespace, and tokens found in ``_STOP_WORDS`` are removed.

    Args:
        text: The raw message. ``None`` and float NaN are treated as an
            empty message; any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        str: The cleaned tokens joined by single spaces ('' when nothing
        is left).
    """
    # Missing CSV cells typically arrive from pandas as float NaN (NaN is the
    # only float that is not equal to itself); calling .lower() on it would
    # raise AttributeError, so map missing values to an empty message.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Strip punctuation/symbols but keep whitespace so words stay separated.
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    tokens = [token for token in text.split() if token not in _STOP_WORDS]
    return ' '.join(tokens)


# Load the labelled e-mails; the code below reads the 'text' and 'spam' columns.
df = pd.read_csv('../dataset/emails.csv')

# Clean every message with process() before it is vectorised.
df['text'] = df['text'].apply(process)

# Hold out a test set. No test_size is given, so scikit-learn's default split
# is used; random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['spam'], random_state=0)

# Bag-of-words features. The vocabulary is fitted on the training split only,
# and the test split is merely transformed with that vocabulary.
vectorizer = CountVectorizer()
X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)

# Train a Multinomial Naive Bayes classifier on the training counts.
classifier = MultinomialNB()
classifier.fit(X_train_vector, y_train)

# Evaluate on the held-out split.
y_pred = classifier.predict(X_test_vector)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Persist the fitted vectorizer as well as the classifier: the classifier's
# feature columns are defined by this vectorizer's vocabulary, so the model
# alone cannot score new text in a fresh session.
joblib.dump(vectorizer, "spam_vectorizer.joblib")

# Save the trained model to a file (kept as the last expression so the cell's
# displayed output is unchanged).
joblib.dump(classifier, "spam_classifier_model.joblib")


Accuracy: 0.9888268156424581


['spam_classifier_model.joblib']

In [11]:
# Inference demo: reload the persisted classifier and score one hand-written
# message.
#
# NOTE(review): `process` and `vectorizer` are not defined in this cell. They
# must already exist in the kernel (from the training cell), so this cell
# fails with NameError on a fresh kernel unless that cell has been run first.
import joblib

# Load the saved model from file
model = joblib.load("spam_classifier_model.joblib")

# Clean the new message with the same process() function and turn it into
# counts with the already-fitted vectorizer. transform() is given a
# one-element list, i.e. a batch containing a single document.
new_email = "Are you sure this it the correct way? Dog!!"
processed_text = process(new_email)
new_email_vector = vectorizer.transform([processed_text])

# predict() is called on that single-document batch; [0] takes the label for
# our one message. Presumably 1 means spam, matching the 'spam' column used
# as the training target -- TODO confirm against the dataset.
prediction = model.predict(new_email_vector)[0]
print("Prediction:", prediction)


Prediction: 1
