In [222]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [223]:
data = pd.read_csv('spam.csv', encoding='ISO-8859-1')

In [224]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [225]:
data.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [226]:
print(data.isnull().any())

v1            False
v2            False
Unnamed: 2     True
Unnamed: 3     True
Unnamed: 4     True
dtype: bool


In [227]:
print(data[['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']].head()) 

  Unnamed: 2 Unnamed: 3 Unnamed: 4
0        NaN        NaN        NaN
1        NaN        NaN        NaN
2        NaN        NaN        NaN
3        NaN        NaN        NaN
4        NaN        NaN        NaN


In [228]:
data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [229]:
print(data.isnull().any())

v1    False
v2    False
dtype: bool


In [230]:
print(data['v1'].value_counts())
#imbalanced dataset :)

v1
ham     4825
spam     747
Name: count, dtype: int64


In [231]:
data['special_char_count'] = data['v2'].apply(lambda x: sum(not c.isalnum() for c in x))

In [232]:
spam_keywords = ['win', 'prize', 'free', 'urgent', 'offer']
data['spam_keyword_count'] = data['v2'].apply(lambda x: sum(1 for word in spam_keywords if word in x.lower()))

In [233]:
import re
data['cleaned_text'] = data['v2'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

In [234]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  #
X = vectorizer.fit_transform(data['v2']).toarray()  # Convert text into numeric vectors
y = data['v1']

In [235]:
#spling the data train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Check the shape of the data
print(X_train.shape)

(4457, 5000)


In [236]:
#solving the imbalance problem (using over sampling)
from imblearn.over_sampling import SMOTE

# Apply SMOTE only to the training data
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Class distribution after SMOTE:")
print(y_train_res.value_counts())

Class distribution after SMOTE:
v1
ham     3859
spam    3859
Name: count, dtype: int64


In [237]:
#random forst classifier model
from sklearn.ensemble import RandomForestClassifier

# Initialize RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the resampled training data
rf_model.fit(X_train_res, y_train_res)


In [238]:
# Predict on the test data
y_pred = rf_model.predict(X_test)

In [239]:
#evaluating the model by calculating the accuracy
from sklearn.metrics import classification_report
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print detailed classification report (Precision, Recall, F1-score, etc.)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9811659192825112

Classification Report:
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       1.00      0.86      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [240]:
#confusion matric
from sklearn.metrics import confusion_matrix

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[966   0]
 [ 21 128]]


In [241]:
#evaluating by calculating f1 score
from sklearn.metrics import f1_score

# Calculate the F1 score, explicitly specifying the positive label ('spam' as positive)
f1 = f1_score(y_test, y_pred, average='binary', pos_label='spam')  # 'spam' is the positive class
print("F1 Score:", f1)

F1 Score: 0.924187725631769


In [242]:
from sklearn.metrics import precision_score

# Calculate the precision score, specifying 'spam' as the positive class
precision1 = precision_score(y_test, y_pred, average='binary', pos_label='spam')  # 'spam' is the positive class
print("Precision:", precision1)

# Calculate the precision score, specifying 'ham' as the positive class
precision2 = precision_score(y_test, y_pred, average='binary', pos_label='ham')  # 'spam' is the positive class
print("Precision:", precision2)

Precision: 1.0
Precision: 0.9787234042553191


In [243]:
#check
print("True labels:", y_test[:10])
print("Predicted labels:", y_pred[:10])

True labels: 2826     ham
3695     ham
3906     ham
575     spam
2899     ham
3456     ham
5128     ham
919      ham
2505     ham
17       ham
Name: v1, dtype: object
Predicted labels: ['ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham']


In [244]:
#calculating cross validation score
from sklearn.metrics import make_scorer, f1_score

# Use make_scorer to specify how to handle non-numeric labels
f1_scorer = make_scorer(f1_score, average='binary', pos_label='spam')

# Now use this scorer in cross-validation
from sklearn.model_selection import cross_val_score

# Perform cross-validation with the custom scorer
scores = cross_val_score(rf_model, X, y, cv=5, scoring=f1_scorer)
print(f"F1 scores for each fold: {scores}")
print(f"Mean F1 score: {scores.mean()}")

F1 scores for each fold: [0.93286219 0.92907801 0.91240876 0.89454545 0.90181818]
Mean F1 score: 0.9141425200969685


In [295]:
#testing 
message = input("enter a message to test if it is spam or ham please : ")
test_messages_tfidf =vectorizer.transform([message]).toarray()
pediction = rf_model.predict(test_messages_tfidf)
print(f"message : {message}\npredected: {pediction}")

enter a message to test if it is spam or ham please :  Congratulations! You've won a $1000 gift card. Call now to claim.


message : Congratulations! You've won a $1000 gift card. Call now to claim.
predected: ['spam']


In [287]:
!pip install gradio



In [None]:
#this is for the interface
import gradio as gr

# Define the function to predict spam or ham
def predict_email_spam(user_input):
    # Debugging: Check the input received
    print(f"Received input: {user_input}")
    
    # Transform the input using the trained vectorizer
    input_vectorized = vectorizer.transform([user_input]).toarray()  # Same transformation as in console
    
    # Debugging: Check the vectorized input
    print(f"Vectorized input: {input_vectorized}")  # Show the array of features
    
    # Predict using the trained rf_model
    prediction = rf_model.predict(input_vectorized)
    
    # Debugging: Check prediction result
    print(f"Prediction output: {prediction}")  # Print out the raw prediction result
    
    # Check if the output is a single value (spam or ham)
    if prediction[0] == 1:
        print("Prediction is Spam")
        return "Spam"
    elif prediction[0] == 0:
        print("Prediction is Ham")
        return "Ham"
    else:
        # Handle cases where prediction is not 0 or 1
        print(f"Unexpected prediction value: {prediction}")
        return "Error: Unexpected prediction output"

# Create the Gradio interface
interface = gr.Interface(
    fn=predict_email_spam,  # Function to process input
    inputs=gr.Textbox(lines=5, placeholder="Type your email content here..."),  # Input box for user to type the email
    outputs="text",  # Output will be text (either "Spam" or "Ham")
    title="Spam Email Classifier",  # Interface title
    description="Enter an email's content to check if it's Spam or Ham."  # Short description of what the interface does
)

# Launch the interface
interface.launch(share=True)  # share=True will generate a public URL that others can access


* Running on local URL:  http://127.0.0.1:7865
* Running on public URL: https://0ebcb37d7cc0e8f8cc.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


