In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np
import re
import string
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [4]:
try:
   
    df = pd.read_csv('spam.csv', encoding='latin1') 
    df = df.iloc[:, :2]
    df.columns = ['label', 'text']
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Error: 'spam.csv' not found. Please ensure the file is uploaded.")
    print("For demonstration, creating a dummy dataset if 'spam.csv' was not provided.")
    data = {
        'label': ['ham', 'spam', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham'],
        'text': [
            "Hey, how are you?",
            "WINNER! U have won $1000! Claim now!",
            "Call me back later.",
            "Meeting at 3 PM.",
            "FREE entry to exclusive casino for VIPs! Text 'PLAY' to 1234.",
            "Did you get the document?",
            "URGENT! Your account has been compromised. Click this link: bit.ly/malicious",
            "See you tomorrow."
        ]
    }
    df = pd.DataFrame(data)
    print("Dummy dataset created for demonstration purposes.")


Dataset loaded successfully!


In [5]:
print("\n--- Original Dataset Head ---")
print(df.head())
print("\n--- Dataset Info ---")
df.info()
print("\n--- Missing Values Before Preprocessing ---")
print(df.isnull().sum())
print(f"\nTotal rows before dropping NaNs: {df.shape[0]}")


--- Original Dataset Head ---
  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB

--- Missing Values Before Preprocessing ---
label    0
text     0
dtype: int64

Total rows before dropping NaNs: 5572


In [6]:
df.dropna(inplace=True)
print(f"\nTotal rows after dropping NaNs: {df.shape[0]}")
print("\n--- Missing Values After Preprocessing ---")
print(df.isnull().sum())


Total rows after dropping NaNs: 5572

--- Missing Values After Preprocessing ---
label    0
text     0
dtype: int64


In [7]:
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])
print("\n--- Label Encoding Mapping ---")
print(dict(zip(le.classes_, le.transform(le.classes_))))



--- Label Encoding Mapping ---
{'ham': np.int64(0), 'spam': np.int64(1)}


In [9]:
def preprocess_text(text):
    text = text.lower() 
    text = ''.join([char for char in text if char not in string.punctuation]) 
    text = text.strip() 
    return text

df['cleaned_text'] = df['text'].apply(preprocess_text)

print("\n--- Dataset Head with Cleaned Text ---")
print(df.head())


--- Dataset Head with Cleaned Text ---
  label                                               text  label_encoded  \
0   ham  Go until jurong point, crazy.. Available only ...              0   
1   ham                      Ok lar... Joking wif u oni...              0   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...              1   
3   ham  U dun say so early hor... U c already then say...              0   
4   ham  Nah I don't think he goes to usf, he lives aro...              0   

                                        cleaned_text  
0  go until jurong point crazy available only in ...  
1                            ok lar joking wif u oni  
2  free entry in 2 a wkly comp to win fa cup fina...  
3        u dun say so early hor u c already then say  
4  nah i dont think he goes to usf he lives aroun...  


In [10]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['label_encoded']

print(f"\nShape of vectorized text data (X): {X.shape}")
print(f"Number of unique words (features): {X.shape[1]}")


Shape of vectorized text data (X): (5572, 9489)
Number of unique words (features): 9489


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"\nTraining features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Testing target shape: {y_test.shape}")
print(f"Training set class distribution:\n{y_train.value_counts(normalize=True)}")
print(f"Testing set class distribution:\n{y_test.value_counts(normalize=True)}")



Training features shape: (4457, 9489)
Testing features shape: (1115, 9489)
Training target shape: (4457,)
Testing target shape: (1115,)
Training set class distribution:
label_encoded
0    0.865829
1    0.134171
Name: proportion, dtype: float64
Testing set class distribution:
label_encoded
0    0.866368
1    0.133632
Name: proportion, dtype: float64


In [14]:
model = LogisticRegression(solver='liblinear', random_state=42) # 'liblinear' is good for small datasets
model.fit(X_train, y_train)

print("\n--- Model Training Complete (Logistic Regression) ---")
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1] 
print("\n--- Model Evaluation ---")


cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
print(f"  True Negatives (TN - Correctly Ham): {cm[0,0]}")
print(f"  False Positives (FP - Ham classified as Spam): {cm[0,1]}")
print(f"  False Negatives (FN - Spam classified as Ham): {cm[1,0]}")
print(f"  True Positives (TP - Correctly Spam): {cm[1,1]}")

accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")


--- Model Training Complete (Logistic Regression) ---

--- Model Evaluation ---
Confusion Matrix:
[[964   2]
 [ 17 132]]
  True Negatives (TN - Correctly Ham): 964
  False Positives (FP - Ham classified as Spam): 2
  False Negatives (FN - Spam classified as Ham): 17
  True Positives (TP - Correctly Spam): 132

Accuracy: 0.9830


In [15]:
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")


Accuracy: 0.9830


In [16]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Precision (Spam): {precision:.4f}")
print(f"Recall (Spam): {recall:.4f}")
print(f"F1-Score (Spam): {f1:.4f}")

Precision (Spam): 0.9851
Recall (Spam): 0.8859
F1-Score (Spam): 0.9329


In [17]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("\n--- Example Spam Detection ---")

new_emails = [
    "Hello, how are you doing today? Let's catch up soon.", # Ham
    "URGENT: Your bank account needs verification. Click here immediately to avoid suspension.", # Spam
    "Meeting reminder for tomorrow morning at 10 AM.", # Ham
    "Congratulations! You've won a FREE iPhone. Claim your prize now!", # Spam
    "Please send your details for a job interview opportunity." # Likely Ham, could be phishing
]



Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       0.99      0.89      0.93       149

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115


--- Example Spam Detection ---


In [19]:
cleaned_new_emails = [preprocess_text(email) for email in new_emails]
new_emails_vectorized = vectorizer.transform(cleaned_new_emails)

# Make predictions
predictions_encoded = model.predict(new_emails_vectorized)
predictions_labels = le.inverse_transform(predictions_encoded)
probabilities = model.predict_proba(new_emails_vectorized)

print("\n--- New Email Predictions ---")
for i, email in enumerate(new_emails):
    print(f"Email: '{email}'")
    print(f"  Predicted: {predictions_labels[i]} (Probability of Spam: {probabilities[i][1]:.4f})")
    print("-" * 50)




--- New Email Predictions ---
Email: 'Hello, how are you doing today? Let's catch up soon.'
  Predicted: ham (Probability of Spam: 0.0059)
--------------------------------------------------
Email: 'URGENT: Your bank account needs verification. Click here immediately to avoid suspension.'
  Predicted: ham (Probability of Spam: 0.2382)
--------------------------------------------------
Email: 'Meeting reminder for tomorrow morning at 10 AM.'
  Predicted: ham (Probability of Spam: 0.0057)
--------------------------------------------------
Email: 'Congratulations! You've won a FREE iPhone. Claim your prize now!'
  Predicted: spam (Probability of Spam: 0.9239)
--------------------------------------------------
Email: 'Please send your details for a job interview opportunity.'
  Predicted: ham (Probability of Spam: 0.0780)
--------------------------------------------------
