In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report ,confusion_matrix,ConfusionMatrixDisplay

Spam is 1 Not Spam is 2

In [2]:
email_data = pd.read_csv("data/emails.csv")

# separating the text and category
X = email_data["text"]
y = email_data["spam"]

# splitting the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=12)

# Turning the data into numerical format using TfidfVectorizer
vectorizer = TfidfVectorizer(min_df = 1, stop_words="english", lowercase=True)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

email_data.shape
# Changing the y variable to type int
y_train = y_train.astype("int")
y_test = y_test.astype("int")
email_data.shape




(5728, 2)

In [3]:
from sklearn.linear_model import LogisticRegression
# Create an instance of the Logistic Regression classifier
logistic_reg = LogisticRegression(max_iter=1000) 

# Train the Logistic Regression model
logistic_reg.fit(X_train_vec, y_train)

In [4]:
# Predict probabilities for each class
predicted_probabilities = logistic_reg.predict_proba(X_test_vec)

# Define your custom threshold
new_threshold = 0.39

# Create adjusted predictions based on the threshold
y_pred_adjusted = np.array([1 if prob[1] > new_threshold else 0 for prob in predicted_probabilities])

# Calculate accuracy and print the classification report
accuracy = accuracy_score(y_test, y_pred_adjusted)
print(f"Accuracy: {accuracy:.4f}")

print(classification_report(y_test, y_pred_adjusted, target_names=['Spam', 'Not Spam']))

Accuracy: 0.9913
              precision    recall  f1-score   support

        Spam       1.00      0.99      0.99       429
    Not Spam       0.97      0.99      0.98       144

    accuracy                           0.99       573
   macro avg       0.99      0.99      0.99       573
weighted avg       0.99      0.99      0.99       573



In [5]:
email_data2 = pd.read_csv("data/spam_ham_dataset.csv")
X_2 = email_data2["text"]
y_2 = email_data2["label_num"]
X_test_vec2 = vectorizer.transform(X_2)

# Assuming you have already loaded the new data and transformed it into X_test_vec2 as shown in the previous response.

# Predict probabilities for each class
predicted_probabilities2 = logistic_reg.predict_proba(X_test_vec2)

# Define your custom threshold (you can use the same threshold)
new_threshold = 0.39

# Create adjusted predictions based on the threshold
y_pred_adjusted2 = np.array([1 if prob[1] > new_threshold else 0 for prob in predicted_probabilities2])

# Calculate accuracy and print the classification report for the new data
accuracy2 = accuracy_score(y_2, y_pred_adjusted2)
print(f"Accuracy on new data: {accuracy2:.4f}")

target_names = ['Spam', 'Not Spam']
print(classification_report(y_2, y_pred_adjusted2, target_names=target_names))



Accuracy on new data: 0.9430
              precision    recall  f1-score   support

        Spam       0.97      0.95      0.96      3672
    Not Spam       0.89      0.92      0.90      1499

    accuracy                           0.94      5171
   macro avg       0.93      0.94      0.93      5171
weighted avg       0.94      0.94      0.94      5171



In [14]:
# Initialize combined_data outside the function
combined_data = email_data

def test_and_update_logistic_regression(email_headers, correct_label, combined_data):
    # Remove this line --> combined_data = email_data2.copy()

    for text in email_headers:
        test = [text]
        test_vec = vectorizer.transform(test)
        predicted_probabilities = logistic_reg.predict_proba(test_vec)

        # Define your custom threshold
        new_threshold = 0.465

        # Create adjusted predictions based on the threshold
        y_pred_adjusted = np.array([1 if prob[1] > new_threshold else 0 for prob in predicted_probabilities])

        if y_pred_adjusted[0] == 1:
            print("Category: Spam Mail")
        else:
            print("Category: Not Spam Mail")

        # If the prediction does not match the correct label, update the dataset
        if y_pred_adjusted[0] != correct_label:
            print("Updating the model...")

            # Create a new DataFrame for the corrected label
            new_data_point = pd.DataFrame({"text": [text], "label_num": [correct_label]})

            # Concatenate the new data point with the original data
            combined_data = pd.concat([combined_data, new_data_point], ignore_index=True)

    # Assuming you have already trained the logistic_reg model, you can skip the training part.

    return combined_data

# Example usage:
correct_label = 1  # Set the label (1 for spam or 0 for not spam)
non_spam_email_subjects = [
    "❗🤑 9111isaac_𝗬𝗼𝘂 𝗿𝗲𝗰𝗲𝗶𝘃𝗲𝗱 𝗮 𝗱𝗶𝗿𝗲𝗰𝘁 𝗱𝗲𝗽𝗼𝘀𝗶𝘁𝗲𝗱 𝗼𝗳 💲2,500.00.🥳 -----#Mon, 16 Oct 2023 17:16:04 +0200"
]

# Call the function to test and update the model
combined_data = test_and_update_logistic_regression(non_spam_email_subjects, correct_label, combined_data)


Category: Not Spam Mail
Updating the model...


In [15]:
# Save the combined_data DataFrame to the emails.csv file
combined_data.to_csv("data/emails.csv", index=False)
test= pd.read_csv("data/emails.csv")
test.shape

(5729, 3)