In [13]:
# Step 1: Import Dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [14]:
# Step 2: Read the labelled messages from disk into a DataFrame
# The preview printed below shows two columns: Category and Message.
DATA_PATH = "spam.csv"
df = pd.read_csv(DATA_PATH)
print(df.head())

  Category                                            Message
0      ham                              Go until jurong point
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham                   Nah I don't think he goes to usf
4     spam  Had your mobile 11 months or more? U R entitle...


In [15]:
# Step 3: Encode the target numerically (spam -> 1, every other category -> 0)
# Vectorised comparison instead of a per-row lambda; the result is the same
# integer column.
is_spam = df['Category'].eq('spam')
df['spam'] = is_spam.astype('int64')
print(df.head())

  Category                                            Message  spam
0      ham                              Go until jurong point     0
1      ham                      Ok lar... Joking wif u oni...     0
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...     1
3      ham                   Nah I don't think he goes to usf     0
4     spam  Had your mobile 11 months or more? U R entitle...     1


In [16]:
# Step 4: Clean the text column, then split into Training and Test sets
# Missing messages become "" (rather than the literal string "nan") and every
# entry is coerced to str *before* the split, so X_train / X_test are safe to
# pass both to the manual vectorizer below and to the Pipeline used later.
messages = df.Message.fillna("").astype(str)

# random_state pins the shuffle so the split (and every score printed further
# down) is reproducible on Restart & Run All; stratify keeps the ham/spam
# proportion the same in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    df.spam,
    test_size=0.25,
    random_state=42,
    stratify=df.spam,
)

# Step 5: Convert text → numeric bag-of-words features
# (CountVectorizer is already imported in the imports cell at the top.)
v = CountVectorizer()

# Learn the vocabulary from the training messages only, then reuse that same
# vocabulary for the test messages so nothing from the test set leaks into
# the features.
X_train_count = v.fit_transform(X_train)
X_test_count = v.transform(X_test)


In [17]:

# Step 6: Fit a Multinomial Naïve Bayes classifier on the training word counts
# X_train_count holds the bag-of-words features, y_train the 0/1 spam labels.
model = MultinomialNB()
model.fit(X=X_train_count, y=y_train)

In [18]:

# Step 7: Check accuracy on the held-out test set
# X_test_count was already produced in Step 5 with the vectorizer fitted on the
# training data, so it is reused here instead of being rebuilt. Rebuilding it
# from the raw X_test skipped the string coercion applied in Step 5 and
# overwrote the earlier matrix for no benefit.
test_accuracy = model.score(X_test_count, y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.9943019943019943


In [19]:
# Step 8: Bundle vectorization and classification into one Pipeline
# Raw text goes in; the 'vectorizer' step turns it into word counts and the
# 'nb' step classifies those counts.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
]
clf = Pipeline(steps=pipeline_steps)

In [20]:
# Step 9: Fit the whole Pipeline directly on the raw training messages
# (the vectorizer step is fitted as part of this call).
clf.fit(X=X_train, y=y_train)

In [21]:
# Step 10: Score the Pipeline on the raw held-out messages
pipeline_accuracy = clf.score(X_test, y_test)
print("Pipeline Test Accuracy:", pipeline_accuracy)

Pipeline Test Accuracy: 0.9943019943019943


In [22]:
# Step 11: Classify previously unseen messages with the fitted Pipeline
# Each prediction is 1 for spam and 0 for ham, matching the encoding of the
# 'spam' column created in Step 3.
emails = [
    "Hello, please call our customer care immediately for a prize",
    "Hi John, are we still meeting for lunch tomorrow?",
]
email_predictions = clf.predict(emails)
print("Predictions for new emails:", email_predictions)

Predictions for new emails: [1 1]
