In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Toy corpus of (message, label) pairs; swap in a CSV-backed source if needed.
# Labels are 'spam' or 'ham' (ham = not spam).
_LABELED_MESSAGES = [
    ("Win a free iPhone now", 'spam'),
    ("Hey, how are you doing?", 'ham'),
    ("Congratulations, you've won a lottery", 'spam'),
    ("Let's catch up tomorrow", 'ham'),
    ("Free entry in a contest", 'spam'),
    ("Are we meeting today?", 'ham'),
    ("You have been selected for a prize", 'spam'),
    ("See you at 5 PM", 'ham'),
]
data = {
    'text': [message for message, _ in _LABELED_MESSAGES],
    'label': [tag for _, tag in _LABELED_MESSAGES],
}

# Step 1: Load the corpus into a two-column DataFrame (text, label).
df = pd.DataFrame(data)

# Step 2: Hold out 30% of the rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.3,
    random_state=42,
)

# Step 3: Bag-of-words features. The vocabulary is learned from the training
# messages only and then reused unchanged for the held-out messages.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Step 4: Fit a multinomial Naive Bayes classifier on the word counts
# (fit() returns the estimator itself, so it can be chained).
model = MultinomialNB().fit(X_train_counts, y_train)

# Step 5: Classify the held-out messages.
y_pred = model.predict(X_test_counts)

# Step 6: Report the predicted labels and the fraction that match y_test.
accuracy = accuracy_score(y_test, y_pred)
print("Predictions:", y_pred)
print("Accuracy:", accuracy)

# Classify one unseen message with the already-fitted vectorizer and model.
new_message = ["Congratulations! You have won a free gift"]
new_data = vectorizer.transform(new_message)
print("Prediction for new message:", model.predict(new_data)[0])


Predictions: ['spam' 'spam' 'spam']
Accuracy: 0.3333333333333333
Prediction for new message: spam
