In [1]:
!pip install transformers datasets evaluate scikit-learn pandas numpy matplotlib torch joblib

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [12]:
import pickle

# Load the trained model
with open('spam_nb_model.pkl', 'rb') as f:
    loaded_nb_model = pickle.load(f)

# Load the vectorizer
with open('vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)

print("Model and vectorizer loaded successfully!")

Model and vectorizer loaded successfully!


In [13]:
# Assuming 'clean_text' is defined in an earlier cell (e.g., bf4023f4)
# Assuming 'loaded_nb_model' and 'loaded_vectorizer' are loaded, and 'predict_spam_loaded' is defined

# Test with new examples using the loaded model (via predict_spam_loaded from 9df1ef06)
print(f"'WINNER! You've won a free iPhone!' is: {predict_spam_loaded('WINNER! You\'ve won a free iPhone!')}")
print(f"'Hey, what time are we meeting for coffee?' is: {predict_spam_loaded('Hey, what time are we meeting for coffee?')}")

'WINNER! You've won a free iPhone!' is: Spam
'Hey, what time are we meeting for coffee?' is: Ham


In [14]:
import re

# Function to clean text
def clean_text(text):
    text = text.lower()  # convert to lowercase
    text = re.sub(r'<.*?>', '', text)  # remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text)  # remove numbers & special characters
    text = re.sub(r'\s+', ' ', text)  # remove extra spaces
    return text.strip()

# Apply cleaning to the 'v2' column and store it in a new 'cleaned_text' column
df['cleaned_text'] = df['v2'].apply(clean_text)

# Check the first few rows with the new column
print(df.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4                                       cleaned_text  \
0        NaN        NaN  go until jurong point crazy available only in ...   
1        NaN        NaN                            ok lar joking wif u oni   
2        NaN        NaN  free entry in a wkly comp to win fa cup final ...   
3        NaN        NaN        u dun say so early hor u c already then say   
4        NaN        NaN  nah i dont think he goes to usf he lives aroun...   

   label  
0      0  
1      0  
2      1  
3      0  
4      0  


In [15]:
# Predict with a custom email
custom_email = "Congratulations! You have been selected for a prize worth $1,000,000. Click here to claim."
print(f"'{custom_email}' is: {predict_spam_loaded(custom_email)}")

custom_email_2 = "Hi, just wanted to check in about our meeting tomorrow. Are we still on for 10 AM?"
print(f"'{custom_email_2}' is: {predict_spam_loaded(custom_email_2)}")

'Congratulations! You have been selected for a prize worth $1,000,000. Click here to claim.' is: Spam
'Hi, just wanted to check in about our meeting tomorrow. Are we still on for 10 AM?' is: Ham


In [16]:
import numpy as np

# Get feature names (words) from the vectorizer
feature_names = loaded_vectorizer.get_feature_names_out()

# Get the log probabilities of features given each class
# nb_model.feature_log_prob_ has shape (n_classes, n_features)
# The first row [0] corresponds to 'ham', the second row [1] to 'spam'
ham_feature_log_probs = loaded_nb_model.feature_log_prob_[0]
spam_feature_log_probs = loaded_nb_model.feature_log_prob_[1]

# Sort features by their log probabilities to find the most indicative words
# For ham (class 0)
ham_indicative_features_indices = ham_feature_log_probs.argsort()[-10:][::-1] # Top 10 words for ham
ham_indicative_words = [feature_names[i] for i in ham_indicative_features_indices]

# For spam (class 1)
spam_indicative_features_indices = spam_feature_log_probs.argsort()[-10:][::-1] # Top 10 words for spam
spam_indicative_words = [feature_names[i] for i in spam_indicative_features_indices]

print("Top 10 words indicative of HAM:")
for i, word in enumerate(ham_indicative_words):
    print(f"{i+1}. {word}")

print("\nTop 10 words indicative of SPAM:")
for i, word in enumerate(spam_indicative_words):
    print(f"{i+1}. {word}")

Top 10 words indicative of HAM:
1. you
2. to
3. the
4. and
5. in
6. is
7. me
8. my
9. it
10. of

Top 10 words indicative of SPAM:
1. to
2. call
3. you
4. your
5. free
6. the
7. for
8. now
9. or
10. is


In [17]:
from sklearn.model_selection import train_test_split

# Convert labels to numeric: 0 = ham, 1 = spam
df['label'] = df['v1'].map({'ham': 0, 'spam': 1})

# Split into train + test (15% test)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['cleaned_text'], df['label'], test_size=0.15, random_state=42, stratify=df['label']
)

# Split train into train + validation (15% of train for validation)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.15, random_state=42, stratify=train_labels
)

# Check sizes
print("Train size:", len(train_texts))
print("Validation size:", len(val_texts))
print("Test size:", len(test_texts))

Train size: 4025
Validation size: 711
Test size: 836


In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Convert text to numeric features
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)
X_test = vectorizer.transform(test_texts)

# Train Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train, train_labels)

# Evaluate on validation set
val_preds = nb_model.predict(X_val)
print("Validation Accuracy:", accuracy_score(val_labels, val_preds))
print(classification_report(val_labels, val_preds))

# Evaluate on test set
test_preds = nb_model.predict(X_test)
print("Test Accuracy:", accuracy_score(test_labels, test_preds))
print(classification_report(test_labels, test_preds))

Validation Accuracy: 0.9901547116736991
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       616
           1       0.99      0.94      0.96        95

    accuracy                           0.99       711
   macro avg       0.99      0.97      0.98       711
weighted avg       0.99      0.99      0.99       711

Test Accuracy: 0.9772727272727273
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       724
           1       0.95      0.88      0.91       112

    accuracy                           0.98       836
   macro avg       0.97      0.93      0.95       836
weighted avg       0.98      0.98      0.98       836



In [19]:
# Assuming 'clean_text' is defined in an earlier cell (e.g., bf4023f4)
# Assuming 'vectorizer' and 'nb_model' are available and 'predict_spam' is defined (e.g., in 403be8ad)

# Examples using the 'predict_spam' function (defined in 403be8ad)
print(predict_spam("Congratulations! You won a free ticket!"))
print(predict_spam("Hi, are we meeting tomorrow?"))

Spam
Ham


In [20]:
import os
print(os.listdir('/content/'))

['.config', 'spam_nb_model.pkl', 'spam[1].csv', 'vectorizer.pkl', 'sample_data']


In [22]:
def predict_spam(email_text):
    email_text = clean_text(email_text) # Use the clean_text function defined earlier
    features = vectorizer.transform([email_text]) # Use the fitted vectorizer
    pred = nb_model.predict(features)[0]
    return "Spam" if pred == 1 else "Ham"

# Test with new examples
print(f"'WINNER! You've won a free iPhone!' is: {predict_spam('WINNER! You\'ve won a free iPhone!')}")
print(f"'Hey, what time are we meeting for coffee?' is: {predict_spam('Hey, what time are we meeting for coffee?')}")
print(f"'Free entry to a £1000 prize draw! Text WIN to 80080 now.' is: {predict_spam('Free entry to a £1000 prize draw! Text WIN to 80080 now.')}")
print(f"'Hi, could you please send me the report by end of day?' is: {predict_spam('Hi, could you please send me the report by end of day?')}")

'WINNER! You've won a free iPhone!' is: Spam
'Hey, what time are we meeting for coffee?' is: Ham
'Free entry to a £1000 prize draw! Text WIN to 80080 now.' is: Spam
'Hi, could you please send me the report by end of day?' is: Ham


In [23]:
import pickle
# 'clean_text' is assumed to be defined in a previous cell (e.g., bf4023f4)

# Load the trained model
with open('spam_nb_model.pkl', 'rb') as f:
    loaded_nb_model = pickle.load(f)

# Load the vectorizer
with open('vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)

print("Model and vectorizer loaded successfully!")

# 'predict_spam_loaded' is assumed to be defined in 9df1ef06

# Test with new examples using the loaded model
print(f"'WINNER! You've won a free iPhone!' (loaded model) is: {predict_spam_loaded('WINNER! You\'ve won a free iPhone!')}")
print(f"'Hey, what time are we meeting for coffee?' (loaded model) is: {predict_spam_loaded('Hey, what time are we meeting for coffee?')}")

Model and vectorizer loaded successfully!
'WINNER! You've won a free iPhone!' (loaded model) is: Spam
'Hey, what time are we meeting for coffee?' (loaded model) is: Ham


In [35]:
import pickle

# Save the trained model
with open('spam_nb_model.pkl', 'wb') as f:
    pickle.dump(nb_model, f)

# Save the vectorizer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert text to numeric features
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)
X_test = vectorizer.transform(test_texts)

In [37]:
from sklearn.naive_bayes import MultinomialNB

# Train Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train, train_labels)  # Training the model

In [38]:
import pickle
# 're' is imported and 'clean_text' is defined in an earlier cell (e.g., bf4023f4)

# Load the trained model
with open('spam_nb_model.pkl', 'rb') as f:
    loaded_nb_model = pickle.load(f)

# Load the vectorizer
with open('vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)

print("Model and vectorizer loaded successfully!")

def predict_spam_loaded(email_text):
    email_text = clean_text(email_text)
    # Use the loaded vectorizer and model
    features = loaded_vectorizer.transform([email_text])
    pred = loaded_nb_model.predict(features)[0]
    return "Spam" if pred == 1 else "Ham"

# Test with new examples using the loaded model
print(f"'WINNER! You've won a free iPhone!' (loaded model) is: {predict_spam_loaded('WINNER! You\'ve won a free iPhone!')}")
print(f"'Hey, what time are we meeting for coffee?' (loaded model) is: {predict_spam_loaded('Hey, what time are we meeting for coffee?')}")
print(f"'Free entry to a \u00a31000 prize draw! Text WIN to 80080 now.' (loaded model) is: {predict_spam_loaded('Free entry to a \u00a31000 prize draw! Text WIN to 80080 now.')}")
print(f"'Hi, could you please send me the report by end of day?' (loaded model) is: {predict_spam_loaded('Hi, could you please send me the report by end of day?')}")

Model and vectorizer loaded successfully!
'WINNER! You've won a free iPhone!' (loaded model) is: Spam
'Hey, what time are we meeting for coffee?' (loaded model) is: Ham
'Free entry to a £1000 prize draw! Text WIN to 80080 now.' (loaded model) is: Spam
'Hi, could you please send me the report by end of day?' (loaded model) is: Ham


In [39]:
val_preds = nb_model.predict(X_val)
test_preds = nb_model.predict(X_test)

In [40]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('/content/spam[1].csv', encoding='latin1') # Corrected file name from spam[2].csv to spam[1].csv

# Define the clean_text function
def clean_text(text):
    text = text.lower()  # convert to lowercase
    text = re.sub(r'<.*?>', '', text)  # remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text)  # remove numbers & special characters
    text = re.sub(r'\s+', ' ', text)  # remove extra spaces
    return text.strip()

# Apply cleaning to the 'v2' column and store it in a new 'cleaned_text' column
df['cleaned_text'] = df['v2'].apply(clean_text)

# Convert labels to numeric: 0 = ham, 1 = spam
df['label'] = df['v1'].map({'ham': 0, 'spam': 1})

# Split into train + test (15% test)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['cleaned_text'], df['label'], test_size=0.15, random_state=42, stratify=df['label']
)

# Split train into train + validation (15% of train for validation)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.15, random_state=42, stratify=train_labels
)

print("Data preprocessed and split successfully.")
print("Train size:", len(train_texts))
print("Validation size:", len(val_texts))
print("Test size:", len(test_texts))

Data preprocessed and split successfully.
Train size: 4025
Validation size: 711
Test size: 836
