In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Toy corpus: every entry pairs a message text with its "spam"/"ham" tag,
# so a message and its label are always edited together.
labelled_messages = [
    ("Win $1000 now by clicking this link!", "spam"),
    ("Congratulations, you are selected to receive a free gift card!", "spam"),
    ("Please call this number to claim your prize.", "spam"),
    ("Hey, are we still meeting for coffee tomorrow?", "ham"),
    ("Don't forget the team meeting at 10:00 AM.", "ham"),
    ("Can you send me the report by EOD?", "ham"),
    ("This is not a spam message, just wanted to say hi.", "ham"),
    ("Your subscription will expire soon. Renew now to avoid interruption.", "spam"),
    ("Free tickets to the concert! Claim now!", "spam"),
    ("Let's catch up later this week.", "ham"),
]

# Column-oriented view of the same rows (two parallel lists keyed by column name)
data = {
    "message": [text for text, _ in labelled_messages],
    "label": [tag for _, tag in labelled_messages],
}

# Tabular form used by the rest of the notebook
df = pd.DataFrame(data)

In [2]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df["message"],
    df["label"],
    test_size=0.3,
    random_state=42,
)

In [3]:
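# Alternative features (disabled): Bag of Words via CountVectorizer.
# To use these instead of the sentence embeddings, uncomment the three lines
# below, skip the embedding cell, and switch the prediction cell to its
# commented-out `vectorizer.transform` line.
#vectorizer = CountVectorizer(stop_words='english')
#X_train_vec = vectorizer.fit_transform(X_train)
#X_test_vec = vectorizer.transform(X_test)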

In [8]:
from sentence_transformers import SentenceTransformer

# Embedding model shared by the train, test and new-message encodings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the training messages first, then the test messages, with the same model;
# each split is passed to the encoder as a plain Python list of strings
X_train_vec, X_test_vec = (
    model.encode(split.tolist()) for split in (X_train, X_test)
)

In [9]:
# Logistic-regression classifier fitted on the training features;
# the solver is capped at 200 iterations
classifier = LogisticRegression(max_iter=200)
classifier.fit(X=X_train_vec, y=y_train)

In [10]:
# Evaluate the model on the held-out test messages
y_pred = classifier.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0: when a class is never predicted its precision is undefined;
# report it as 0.0 explicitly instead of letting scikit-learn emit an
# UndefinedMetricWarning for each affected average (the numbers are unchanged).
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred, zero_division=0),
)

Accuracy: 0.3333333333333333
Classification Report:
               precision    recall  f1-score   support

         ham       0.33      1.00      0.50         1
        spam       0.00      0.00      0.00         2

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Unseen messages to classify with the trained model
new_messages = [
    "Win a free iPhone by clicking this link!",
    "Let's meet for dinner tonight.",
    "Claim your $500 voucher before it's too late!",
    "Don't forget to update your project status.",
]

# Featurise the new messages the same way as the training data, then classify
new_messages_vec = model.encode(new_messages)
#new_messages_vec = vectorizer.transform(new_messages)
new_predictions = classifier.predict(new_messages_vec)

# Report each message with its predicted label, followed by a divider line
divider = "-" * 50
for message, prediction in zip(new_messages, new_predictions):
    print(f"Message: {message}", f"Prediction: {prediction}", divider, sep="\n")

Message: Win a free iPhone by clicking this link!
Prediction: ham
--------------------------------------------------
Message: Let's meet for dinner tonight.
Prediction: ham
--------------------------------------------------
Message: Claim your $500 voucher before it's too late!
Prediction: spam
--------------------------------------------------
Message: Don't forget to update your project status.
Prediction: ham
--------------------------------------------------
