### Importing required libraries 

In [40]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from joblib import dump, load
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE



ModuleNotFoundError: No module named 'imblearn'

### Loading dataset and Using Preprocessing Technique to Clean it 

In [None]:
mail_data = pd.read_csv("Emails.csv")  

#### replacing the missing values with empty strings..

In [None]:
mail_data = mail_data.fillna("")

#### Checking Duplicate emails from the dataset 

In [41]:
mail_data = mail_data.drop_duplicates()

print("Number of duplicate rows:", mail_data.duplicated().sum())

Number of duplicate rows: 0


In [42]:
mail_data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


#### Preparing Data for Training

In [43]:
X = mail_data['text']
Y = mail_data['spam']  

#### Spliting data into Training and Testing Sets

In [44]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

#### Text to Numerical Features using TF-IDF

In [45]:
vectorizer = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)

# Transform text data into feature vectors
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

In [46]:
dump(vectorizer, "tfidf_vectorizer.joblib")
print("Vectorizer saved as tfidf_vectorizer.joblib")

Vectorizer saved as tfidf_vectorizer.joblib


#### Label Encoding (Spam = 1, Ham = 0)

In [47]:
label_encoder = LabelEncoder()
Y_train_encoded = label_encoder.fit_transform(Y_train)
Y_test_encoded = label_encoder.transform(Y_test)

In [48]:
smote = SMOTE(random_state=3)
X_train_balanced, Y_train_balanced = smote.fit_resample(X_train_features, Y_train_encoded)

NameError: name 'SMOTE' is not defined

#### Train Naive Bayes Model

In [None]:
# Training Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_features, Y_train_encoded)

#### Evaluating Models Performance

In [None]:
# Make predictions
Y_pred_nb = nb_model.predict(X_test_features)

# Calculate Accuracy
accuracy_nb = accuracy_score(Y_test_encoded, Y_pred_nb)
print(f"Naive Bayes Accuracy: {accuracy_nb:.4f}")


#### Saved Model

In [49]:
dump(nb_model, "naive_bayes_model.joblib")
print("Model saved as naive_bayes_model.joblib")

Model saved as naive_bayes_model.joblib


#### Function to Predict Spam Emails


In [54]:
def predict_spam_nb(input_text, model_filename="naive_bayes_model.joblib", vectorizer_filename="tfidf_vectorizer.joblib"):
    # Load the trained model and vectorizer
    loaded_model = load(model_filename)
    loaded_vectorizer = load(vectorizer_filename)

    # Convert input text into a feature vector
    input_features = loaded_vectorizer.transform([input_text])

    # Predict
    prediction = loaded_model.predict(input_features)[0]

    # Output result
    if prediction == 1:
        print(f"The input  is classified as: Ham Mail")
    else:
        print(f"The input is classified as: Spam Mail")


In [55]:
new_email = """Congratulations! You won a free iPhone! Click the link to claim now.
"""

In [56]:
predict_spam_nb(new_email)

The input is classified as: Spam Mail


In [53]:
import numpy as np
print(np.unique(Y_train_encoded, return_counts=True))


(array([0, 1]), array([3468, 1088]))
