<a href="https://colab.research.google.com/github/hemasundar784/sms-spam-detection/blob/main/code_Sms_spam_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import nltk
import pickle
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on: the stop-word lists used
# for filtering tokens and the WordNet data used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

# Load the raw SMS corpus and, in the same step, discard every column that
# contains at least one missing value.
sms_data = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/spam1.csv",
    encoding="latin-1",
).dropna(axis=1, how="any")

# Name the surviving columns; this expects exactly two columns to remain.
sms_data.columns = ["label", "message"]

# Preprocessing the data
_STOP_WORDS_BY_LANGUAGE = {}


def _stop_words(language='english'):
    """Return the NLTK stop words for `language` as a frozenset, loading them once."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def change(data):
    """Normalise one raw SMS message into a cleaned, space-joined token string.

    Steps, in order:
      1. every character that is not an ASCII letter is replaced by a space;
      2. the text is lower-cased and split on whitespace;
      3. English stop words are dropped;
      4. each remaining token is lemmatized with the module-level
         ``lemmatizing`` object (it must exist by the time this is called).

    Parameters
    ----------
    data : str
        The raw message text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", data)
    tokens = letters_only.lower().split()
    # Look the stop words up once per call (cached across calls) instead of
    # rebuilding the list for every single token.
    stop_words = _stop_words('english')
    kept = [lemmatizing.lemmatize(token) for token in tokens if token not in stop_words]
    return " ".join(kept)

# Lemmatizer instance used by change(); it has to exist before change() runs.
lemmatizing = WordNetLemmatizer()

# Clean every message in row order. Iterating over the column directly avoids
# the per-row label lookup sms_data["message"][i], which only works while the
# frame still carries the default 0..n-1 integer index.
message_data = [change(text) for text in sms_data["message"]]

# Change the data into numerical format
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): the vectorizer is fitted on every message before the data is
# split into train and test sets, so what it learns also reflects the held-out
# messages. Left as-is to keep the reported numbers comparable; consider
# fitting on the training messages only.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(message_data)

# Dump the TfidfVectorizer; the context manager closes (and flushes) the file
# even if pickling fails.
with open('transform.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X.toarray(),  # dense array: the classifier below indexes rows with boolean masks
    sms_data['label'],
    test_size=0.3,
    random_state=0,
)
print(f"{len(X_train)} {len(X_test)}")

# Multinomial Naive Bayes implementation
class MultinomialNaiveBayes:
    """Small Naive Bayes text classifier with add-one (Laplace) smoothing.

    Attributes
    ----------
    class_prob : dict
        Maps each label to its prior, i.e. the fraction of training rows
        carrying that label.
    word_prob : dict
        Maps each label to a 1-D array of smoothed per-feature probabilities.
    classes : array-like
        The distinct labels seen by ``fit``, in ``np.unique`` order.

    Note that ``predict`` only looks at *which* features are positive in a
    sample; the magnitude of a feature value does not enter the score.
    """

    def __init__(self):
        self.class_prob = {}
        self.word_prob = {}
        self.classes = []

    def fit(self, X, y):
        """Estimate class priors and smoothed per-feature probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Non-negative feature values (counts or weights). Nested lists are
            accepted.
        y : array-like of length n_samples
            The label of each row of ``X``. Lists and pandas Series are
            accepted.
        """
        # Coerce to arrays so that `y == label` is an element-wise mask and
        # `X[mask]` selects rows, whatever sequence type the caller passed in.
        X = np.asarray(X)
        y = np.asarray(y)

        # Start from a clean slate so a refit never keeps labels from an
        # earlier call.
        self.class_prob = {}
        self.word_prob = {}
        self.classes = np.unique(y)

        for label in self.classes:
            mask = y == label
            self.class_prob[label] = np.sum(mask) / len(y)
            word_count = np.sum(X[mask], axis=0)
            # Add-one smoothing: every feature gets a pseudo-count of 1, so no
            # probability is ever zero (and log() in predict stays finite).
            self.word_prob[label] = (word_count + 1) / (np.sum(word_count) + len(word_count))

    def predict(self, X):
        """Return the most likely label for each row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature rows laid out like the training data.

        Returns
        -------
        list
            One label per row, taken from ``self.classes``.
        """
        predictions = []
        for sample in np.asarray(X):
            # Only the features present in the sample contribute to the score.
            present = sample > 0
            probs = [
                np.log(self.class_prob[label]) + np.sum(np.log(self.word_prob[label][present]))
                for label in self.classes
            ]
            predictions.append(self.classes[np.argmax(probs)])
        return predictions

# Create and fit the Multinomial Naive Bayes classifier
mnb = MultinomialNaiveBayes()
mnb.fit(X_train, y_train)

# Make predictions using the same classifier instance
y_pred = mnb.predict(X_test)

# Dump the Multinomial Naive Bayes model; the context manager closes (and
# flushes) the file even if pickling fails.
with open('spam_model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)

# Score the held-out predictions: per-class report first, then overall
# accuracy expressed as a percentage.
from sklearn.metrics import accuracy_score, classification_report

report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Classification Report is:", report, sep="\n")
print("Accuracy is:", accuracy * 100)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


3990 1711
Classification Report is:
              precision    recall  f1-score   support

         ham       0.98      0.99      0.99      1454
        spam       0.93      0.90      0.92       257

    accuracy                           0.98      1711
   macro avg       0.96      0.95      0.95      1711
weighted avg       0.98      0.98      0.98      1711

Accuracy is: 97.54529514903565


In [None]:
tem = "your mobile number has own 1,000,000,00 pound from coca cola london to claim winning amount send your name/number/address/age to email@.com"
sample_message = change(tem)
print("tem", sample_message)
l = tfidf.transform([sample_message]).toarray()

# Score every class by hand, mirroring what the classifier's predict() does:
# log prior plus the log-probabilities of the features present in the message.
present = l.flatten() > 0
class_probs = [
    np.log(mnb.class_prob[label]) + np.sum(np.log(mnb.word_prob[label][present]))
    for label in mnb.classes
]

# Turn the log scores into probabilities; subtracting the maximum before
# exponentiating keeps exp() from underflowing.
class_probs = np.exp(class_probs - np.max(class_probs))
class_probs /= np.sum(class_probs)

# Report each class as a percentage
for label, share in zip(mnb.classes, class_probs):
    print(f"Percentage of {label}: {share * 100:.2f}%")

print("Predicted Probabilities:", class_probs)
predicted_class = mnb.classes[np.argmax(class_probs)]
print("Predicted Class:", predicted_class)

tem mobile number pound coca cola london claim winning amount send name number address age email com
Percentage of ham: 0.01%
Percentage of spam: 99.99%
Predicted Probabilities: [1.40008579e-04 9.99859991e-01]
Predicted Class: spam


In [None]:
# Mount Google Drive at /content/drive. The dataset is read from a path under
# /content/drive/MyDrive, so on a fresh runtime this cell has to be run before
# the cell that loads spam1.csv.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
