<a href="https://colab.research.google.com/github/harshadmehta6786/spam-sms-/blob/main/Untitled8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from joblib import dump, load
import os

# Make sure the NLTK stop-word corpus is available before any preprocessing
nltk.download('stopwords')

# Locate the dataset and fail fast with a clear message when it is absent
file_path = "/content/spam.csv"  # Update this path if needed
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Dataset not found at {file_path}. Please upload it.")

# Read the CSV as latin-1 and keep only its first two columns,
# which are then named "label" and "message"
df = pd.read_csv(file_path, encoding='latin-1').iloc[:, :2]
df.columns = ["label", "message"]

# Encode the target numerically: 'ham' -> 0, 'spam' -> 1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Text preprocessing function
def _english_stop_words():
    """Return the English stop words as a frozenset, loading the corpus once."""
    try:
        return _english_stop_words._cache
    except AttributeError:
        # Cache on the function object so repeated calls (one per message)
        # do not rebuild the word list from the corpus each time.
        _english_stop_words._cache = frozenset(stopwords.words('english'))
        return _english_stop_words._cache


def preprocess_text(text):
    """Normalise a raw SMS message into a space-separated string of stems.

    Every non-alphabetic character is replaced by a space, the text is
    lower-cased and split on whitespace, English stop words are removed and
    the remaining tokens are reduced to their Porter stems.

    Args:
        text: The raw message. Any non-string value (for example the NaN
            pandas produces for an empty CSV cell) is treated as an empty
            message instead of raising ``TypeError``.

    Returns:
        The surviving stems joined by single spaces; ``''`` when no token
        survives.
    """
    if not isinstance(text, str):
        return ''
    # Set membership is O(1); the original rebuilt the stop-word list and
    # scanned it linearly for every single token.
    stop_words = _english_stop_words()
    stemmer = PorterStemmer()
    tokens = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

# Clean every message in place so the model is fitted on normalised text
df['message'] = df['message'].apply(preprocess_text)

# Two-stage pipeline: TF-IDF features feeding a multinomial Naive Bayes model
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Fit both pipeline stages on the training portion
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline to disk so it can be reloaded for inference
model_path = "spam_sms_detector.joblib"
dump(pipeline, model_path)
print(f"Model saved as {model_path}")

# Score the held-out messages
y_pred = pipeline.predict(X_test)

# Summarise the results: overall accuracy, per-class metrics, confusion matrix
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)

# Function to predict spam or ham
def predict_sms(text):
    """Classify a single SMS message as spam or ham using the saved model.

    The message is passed through ``preprocess_text`` before prediction
    because the pipeline was fitted on preprocessed messages. Feeding it raw
    text leaves stop words in place and tokens unstemmed, so they fail to
    match the vocabulary learned during training and spam is under-detected.

    Args:
        text: The raw message to classify.

    Returns:
        ``"Spam"`` when the predicted label is 1, otherwise ``"Ham"``.

    Raises:
        FileNotFoundError: If no model file exists at ``model_path``.
    """
    if not os.path.exists(model_path):
        raise FileNotFoundError("Trained model not found. Please train and save the model first.")
    # NOTE: joblib.load unpickles the file, so only load model files you trust.
    model = load(model_path)
    # Apply the same normalisation that was applied to the training data.
    prediction = model.predict([preprocess_text(text)])
    return "Spam" if prediction[0] == 1 else "Ham"

# Demo: classify one sample message and show it next to the verdict
sms = "Congratulations! You have won a lottery of $1000. Call now!"
sms_verdict = predict_sms(sms)
print(f"Message: {sms}\nPrediction: {sms_verdict}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model saved as spam_sms_detector.joblib
Accuracy: 0.97
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Confusion Matrix:
 [[965   0]
 [ 37 113]]
Message: Congratulations! You have won a lottery of $1000. Call now!
Prediction: Ham
