**MAIL SPAM DETECTION USING TENSORFLOW**

In [None]:
#LOADING THE REQUIRED MODULES
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
#Read the labelled email corpus from a CSV file in the working directory
DATASET_CSV = 'spam_ham_dataset.csv'
df = pd.read_csv(DATASET_CSV)

In [None]:
#Display the dataset; pandas truncates the view to the first and last rows
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [None]:
#Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0.1,Unnamed: 0,label_num
count,5171.0,5171.0
mean,2585.0,0.289886
std,1492.883452,0.453753
min,0.0,0.0
25%,1292.5,0.0
50%,2585.0,0.0
75%,3877.5,1.0
max,5170.0,1.0


In [None]:
#Count the missing values in each column
df.isna().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [None]:
#Column names, non-null counts, dtypes and memory usage of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [None]:
#Class balance: number of ham vs. spam messages
df['label'].value_counts()

ham     3672
spam    1499
Name: label, dtype: int64

# Separating the dataset into features and target labels to train the model

In [None]:
# Target: the numeric label column. Features: every remaining column.
# NOTE(review): x still contains the string `label` column, and the later cells
# train on `padded_sequences` / `labels` rather than on x / y - confirm these are needed.
y = df['label_num']
x = df.drop(columns='label_num')

In [None]:
#Turn every email into a sequence of integer word ids (tokens)
email_texts = df['text']

tokenizer = Tokenizer()
# NOTE(review): the vocabulary is fitted on the whole dataset, i.e. before the
# train/test split performed in the next cell.
tokenizer.fit_on_texts(email_texts)
sequences = tokenizer.texts_to_sequences(email_texts)

# No maxlen is given, so every sequence is right-padded to the longest email.
padded_sequences = pad_sequences(sequences, padding='post')

# Numeric targets as a NumPy array, aligned row-for-row with padded_sequences.
labels = df['label_num'].to_numpy()

In [None]:
# Hold out 20% of the emails for testing.
# random_state pins the shuffle so the split itself is the same after a kernel
# restart; stratify keeps the ham/spam ratio equal in both partitions, which
# matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
#Build and compile the Keras classifier
layers = tf.keras.layers

# One embedding row per tokenizer index, plus one for index 0 (the padding value).
vocab_size = len(tokenizer.word_index) + 1
sequence_length = X_train.shape[1]

model = tf.keras.Sequential([
    # Map each word id to a 128-dimensional vector.
    layers.Embedding(input_dim=vocab_size,
                     output_dim=128,
                     input_length=sequence_length),
    # Average the word vectors of an email into a single vector.
    layers.GlobalAveragePooling1D(),
    layers.Dense(24, activation='relu'),
    # Single sigmoid unit: output is read as the probability of label 1.
    layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
#Train the model for 10 epochs
# The test split is passed as validation data, so the per-epoch val_* metrics are
# computed on the same emails that the evaluation cells use afterwards.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#Measure loss and accuracy on the test split
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics  # order follows compile(): loss first, then the 'accuracy' metric
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 92.08%


In [None]:
# Score the test emails and convert the sigmoid outputs into hard 0/1 labels.
# confusion_matrix and classification_report are imported in the import cell at
# the top of the notebook, so no import is needed here.
y_true = y_test  # the true labels from the test set
threshold = 0.5  # probabilities strictly above this are classified as spam (label 1)
y_prob = model.predict(X_test)  # the predicted probabilities from the model
# Kept separate from y_prob so the name y_pred always means binary labels.
y_pred = (y_prob > threshold).astype(int)



In [None]:
# Rows are the true classes and columns the predicted classes, both in sorted
# label order: index 0 = ham, index 1 = spam.
cm = confusion_matrix(y_true=y_true, y_pred=y_pred)
print('confusion matrix: \n',cm)

confusion matrix: 
 [[716   3]
 [ 79 237]]


* True Positives (237): Spam emails correctly identified as spam.
* True Negatives (716): Ham emails correctly identified as ham.
* False Positives (3): Ham emails incorrectly classified as spam.
* False Negatives (79): Spam emails incorrectly classified as ham.

**REPORT**

In [None]:
#Build the per-class precision / recall / F1 report
# y_pred already holds integer 0/1 labels, so it is passed through as-is.
class_names = ['ham', 'spam']  # display names for labels 0 and 1
report = classification_report(y_test, y_pred, target_names=class_names)

In [None]:
#Show the per-class precision, recall, F1-score and support table
print(report)

              precision    recall  f1-score   support

         ham       0.90      1.00      0.95       719
        spam       0.99      0.75      0.85       316

    accuracy                           0.92      1035
   macro avg       0.94      0.87      0.90      1035
weighted avg       0.93      0.92      0.92      1035





* The overall accuracy of the model is 92%, indicating that it correctly classifies emails as spam or ham 92% of the time.
*   Precision is high for both classes (0.90 for ham, 0.99 for spam). Recall is 1.00 for ham but only 0.75 for spam, so the model almost never flags a legitimate email as spam, yet it misses about a quarter of the spam emails.

* The precision for ham emails is 0.90, meaning that about 90% of the emails predicted as ham were actually ham; the remaining 10% were spam emails that slipped through. The precision for spam emails is 0.99, meaning that out of every 100 emails the model predicts as spam, about 99 are actually spam and about 1 is a ham email mistakenly labeled as spam (a false positive).

* F1 Score: 0.85 (spam) - This is the harmonic mean of the spam precision (0.99) and recall (0.75). The model very rarely labels ham emails as spam, but the lower recall means it still lets roughly one in four spam emails through as ham.





In [None]:
# Hand-written sample email used in the next cell to try the trained model on unseen text.
input_mail = '''Subject: Exclusive Offer: Claim Your Prize Now!

Congratulations! You have been selected as the lucky winner of our exclusive prize giveaway. Claim your prize now by clicking the link below. Don't miss out on this once-in-a-lifetime opportunity!

Claim Your Prize Now: [link]

Hurry, this offer won't last long! Act now to secure your reward.

Unsubscribe | Privacy Policy'''

In [None]:
#Preprocess the input email text the same way as the training data
input_text = input_mail
input_sequence = tokenizer.texts_to_sequences([input_text])
# Pad to the sequence length the model was built with (input_length=X_train.shape[1])
# rather than a hard-coded number, so this cell stays correct if the dataset,
# and therefore the longest email, changes.
input_padded = pad_sequences(input_sequence, maxlen=X_train.shape[1], padding='post')

#Predict the probability of spam
# predict() returns one row per input with a single sigmoid output; take that scalar.
probability = model.predict(input_padded)[0][0]
print(f"Probability of spam: {probability}")

#Classify the email as spam or ham
threshold = 0.5
if probability > threshold:
    print("The email is spam.")
else:
    print("The email is ham.")

Probability of spam: 0.6911893486976624
The email is spam.
