In [None]:
# Persist the trained pipeline to a local directory
output_dir = "./custom_ner_model"
nlp.to_disk(path=output_dir)
print(f"Model saved to {output_dir}")

# Read the pipeline back from that same directory to confirm it can be restored
loaded_nlp = spacy.load(output_dir)
print("Custom Model Loaded Successfully!")


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [None]:
import csv

# Load the SMS Spam Collection dataset (tab-separated; column names are
# supplied here, so the first line of the file is read as data)
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv"

# quoting=csv.QUOTE_NONE: message text contains stray double-quote characters.
# With the default quoting, the parser treats them as field quotes and can
# merge several physical lines into a single record, silently dropping
# messages. Treating quotes as ordinary characters keeps one row per line.
df = pd.read_csv(
    url,
    sep="\t",
    names=["label", "message"],
    quoting=csv.QUOTE_NONE,
)

# Display dataset sample
print("Dataset Sample:")
print(df.head())

# Dataset overview
print("\nDataset Shape:", df.shape)
print("Label Distribution:\n", df["label"].value_counts())


In [None]:
import re

def preprocess_text(text):
    """Lowercase ``text`` and drop every character that is not an ASCII letter or whitespace.

    Removed characters are deleted, not replaced by a space, so tokens that
    were separated only by punctuation are joined (``"you've"`` -> ``"youve"``).

    Args:
        text: Raw message. ``None`` and float NaN (a missing cell in a pandas
            column) are treated as an empty message; any other non-string
            value is converted with ``str`` first.

    Returns:
        str: The lowercased text containing only ``a-z`` and whitespace.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids calling .lower() on a missing value, which would raise.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()  # Lowercasing
    return re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation and numbers

# Add a normalized copy of every message next to the raw text
df["cleaned_message"] = df["message"].map(preprocess_text)

# Show raw vs. cleaned text side by side for the first few rows
print("\nCleaned Messages:")
print(df.head()[["message", "cleaned_message"]])


In [None]:
# TF-IDF vectorizer configured with the built-in English stop-word list and max_features=3000
tfidf_vectorizer = TfidfVectorizer(max_features=3000, stop_words="english")

# Binary targets derived from the label column (ham -> 0, spam -> 1)
y = df["label"].map({"ham": 0, "spam": 1})

# Fit the vectorizer on the cleaned messages and vectorize them in the same call
X = tfidf_vectorizer.fit_transform(df["cleaned_message"])

print("\nTF-IDF Features Shape:", X.shape)


In [None]:
# Split data into training and test sets (80% train, 20% test).
# The cleaned text is split rather than a matrix vectorized on the whole
# corpus, so the TF-IDF vocabulary and IDF weights are learned from the
# training portion only; fitting on all rows would leak test-set statistics
# into the features. stratify=y keeps the ham/spam ratio the same in both
# partitions.
text_train, text_test, y_train, y_test = train_test_split(
    df["cleaned_message"],
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Refit the vectorizer on training text only, then project the held-out text
# onto that fitted vocabulary. tfidf_vectorizer therefore ends up fitted on
# training data alone.
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

print(f"Training Samples: {X_train.shape[0]}, Test Samples: {X_test.shape[0]}")


In [None]:
# Create a Multinomial Naive Bayes classifier and fit it on the training split
# in a single expression (fit returns the estimator itself)
nb_model = MultinomialNB().fit(X_train, y_train)

print("\nNaive Bayes Model Trained Successfully!")


In [None]:
# Predict labels for the held-out messages
y_pred = nb_model.predict(X_test)

# Overall fraction of correct predictions
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Counts of true vs. predicted classes
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

# Per-class precision / recall / F1 summary
class_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(class_report)


In [None]:
def predict_spam(message, vectorizer, model):
    """Classify a single raw message as spam or not.

    The message is cleaned with ``preprocess_text``, turned into a one-row
    feature matrix by ``vectorizer.transform``, and passed to ``model.predict``.

    Args:
        message: Raw message text.
        vectorizer: Object exposing ``transform`` that accepts a list of strings.
        model: Object exposing ``predict`` that accepts the transformed features.

    Returns:
        str: ``"Spam"`` when the first predicted label equals 1, otherwise
        ``"Not Spam"``.
    """
    features = vectorizer.transform([preprocess_text(message)])
    label = model.predict(features)[0]
    if label == 1:
        return "Spam"
    return "Not Spam"

# A few hand-written examples to sanity-check the trained classifier
test_messages = [
    "Congratulations! You've won a free iPhone. Click here to claim your prize.",
    "Hi John, are we still on for the meeting tomorrow?",
    "Urgent! Your account has been compromised. Update your password now.",
]

# Classify each example with the fitted vectorizer and model, then report it
for message in test_messages:
    result = predict_spam(message, vectorizer=tfidf_vectorizer, model=nb_model)
    print(f"\nMessage: {message}\nPrediction: {result}")


In [None]:
import joblib

# Persist both artifacts: the classifier and the vectorizer that produced its
# input features are written to separate pickle files in the working directory
joblib.dump(value=nb_model, filename="spam_classifier.pkl")
joblib.dump(value=tfidf_vectorizer, filename="tfidf_vectorizer.pkl")
print("\nModel and Vectorizer Saved Successfully!")
