In [13]:
import pandas as pd

# Load your dataset
df = pd.read_csv('/content/email.csv')  # update path if needed
df = df[['Category', 'Message']]  # clean columns
df.columns = ['label', 'text']    # rename for clarity
df.head()


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Re-load or clean the label column
df = df[df['label'].isin(['ham', 'spam'])]  # remove anything weird
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Function to clean the message text
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove punctuation
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    return text

df['clean_text'] = df['text'].apply(clean_text)


df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].map({'ham': 0, 'spam': 1})


Unnamed: 0,label,text,clean_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Convert text into TF-IDF features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_text'])
y = df['label']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Train model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)


In [17]:
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🔍 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


✅ Accuracy: 0.967713004484305

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.76      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115


🔍 Confusion Matrix:
 [[966   0]
 [ 36 113]]


In [20]:
# 🔮 Predict function
def predict_message(message):
    # Clean it like your training data
    message_clean = re.sub(r'[^a-zA-Z\s]', '', message.lower())
    message_vec = vectorizer.transform([message_clean])  # Vectorize the message
    prediction = model.predict(message_vec)[0]           # Get the prediction
    return "SPAM" if prediction == 1 else "HAM"

# Test your model with custom inputs!
print(predict_message("You won a free ticket! Click now to claim."))
print(predict_message("Hey bro, are we meeting tomorrow?"))
print(predict_message("🔥 Congratulations! You've been selected for a cash prize."))


HAM
HAM
SPAM
