In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,classification_report ,confusion_matrix,ConfusionMatrixDisplay

Spam is 1 Not Spam is 2

In [2]:
email_data = pd.read_csv("data/emails.csv")

# separating the text and category
X = email_data["text"]
y = email_data["spam"]

# splitting the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)

# Turning the data into numerical format using TfidfVectorizer
vectorizer = TfidfVectorizer(min_df = 1, stop_words="english", lowercase=True)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Changing the y variable to type int
y_train = y_train.astype("int")
y_test = y_test.astype("int")
email_data.shape

(5728, 2)

In [3]:
from sklearn.svm import SVC

# Train the SVC model
svc = SVC(kernel='sigmoid', probability=True)  # Enable probability estimates
svc.fit(X_train_vec, y_train)

# Predict probabilities for each class
predicted_probabilities = svc.predict_proba(X_test_vec)

# Define your custom threshold
new_threshold = 0.525  

# Create adjusted predictions based on the threshold
y_pred_adjusted = np.array([1 if prob[1] > new_threshold else 0 for prob in predicted_probabilities])

# Calculate accuracy and print the classification report
accuracy = accuracy_score(y_test, y_pred_adjusted)
print(f"Accuracy (Adjusted): {accuracy:.4f}")

target_names = ['Spam', 'Not Spam']
print(classification_report(y_test, y_pred_adjusted, target_names=target_names))


Accuracy (Adjusted): 0.9965
              precision    recall  f1-score   support

        Spam       1.00      1.00      1.00       429
    Not Spam       0.99      1.00      0.99       144

    accuracy                           1.00       573
   macro avg       0.99      1.00      1.00       573
weighted avg       1.00      1.00      1.00       573



Takes a List of strings as input and categories them

In [14]:
import pandas as pd
combined_data = email_data
def test_and_update(email_headers, correct_label):
    combined_data = email_data  # Initialize the combined_data DataFrame

    for text in email_headers:
        test = [text]
        test_vec = vectorizer.transform(test)
        prediction = svc.predict(test_vec)

        if prediction[0] == 1:
            print("Category: Spam Mail")
        else:
            print("Category: Not Spam Mail")

        # If the prediction does not match the correct label, update the dataset and retrain the model
        if prediction[0] != correct_label:
            print("Retraining the model...")

            # Create a new DataFrame for the corrected label
            new_data_point = pd.DataFrame({"text": [text], "spam": [correct_label]})

            # Concatenate the new data point with the original data
            combined_data = pd.concat([combined_data, new_data_point], ignore_index=True)

            # Update the model with the combined dataset
            X_train_vec = vectorizer.transform(combined_data["text"])
            y_train = combined_data["spam"]
            svc.fit(X_train_vec, y_train)

    return combined_data

# Example usage:
correct_label = 1  # Set the label (1 for spam or 0 for not spam)
non_spam_email_subjects = [
    "❗🤑 9111isaac_𝗬𝗼𝘂 𝗿𝗲𝗰𝗲𝗶𝘃𝗲𝗱 𝗮 𝗱𝗶𝗿𝗲𝗰𝘁 𝗱𝗲𝗽𝗼𝘀𝗶𝘁𝗲𝗱 𝗼𝗳 💲2,500.00.🥳 -----#Mon, 16 Oct 2023 17:16:04 +0200"
]

# Call the function to test and update the model
test_and_update(non_spam_email_subjects, correct_label)


Category: Not Spam Mail
Retraining the model...


Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0
5727,Subject: news : aurora 5 . 2 update aurora ve...,0


Second data set to test the accuracy of the previously trained data

In [5]:
# Load the new data and extract text
email_data2 = pd.read_csv("data/spam_ham_dataset.csv")
X_2 = email_data2["text"]
y_2 = email_data2["label_num"]

# Apply the same vectorizer used for the original data
X_test_vec2 = vectorizer.transform(X_2)

# Ensure that y_2 is in the appropriate data type (e.g., integer)
y_2 = y_2.astype(int)

# Predict probabilities for each class
predicted_probabilities2 = svc.predict_proba(X_test_vec2)

# Define your custom threshold (you can use the same threshold)
new_threshold = 0.525

# Create adjusted predictions based on the threshold
y_pred_adjusted2 = np.array([1 if prob[1] > new_threshold else 0 for prob in predicted_probabilities2])

# Calculate accuracy and print the classification report for the new data
accuracy2 = accuracy_score(y_2, y_pred_adjusted2)
print(f"Accuracy on new data: {accuracy2:.4f}")

target_names = ['Spam', 'Not Spam']
print(classification_report(y_2, y_pred_adjusted2, target_names=target_names))


Accuracy on new data: 0.9453
              precision    recall  f1-score   support

        Spam       0.97      0.96      0.96      3672
    Not Spam       0.90      0.92      0.91      1499

    accuracy                           0.95      5171
   macro avg       0.93      0.94      0.93      5171
weighted avg       0.95      0.95      0.95      5171



In [12]:
# Save the combined_data DataFrame to the emails.csv file
combined_data.to_csv("data/emails.csv", index=False)
test= pd.read_csv("data/emails.csv")
test.shape

(5728, 2)