In [12]:
#imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [13]:
df = pd.read_csv('/kaggle/input/sms-spam-collection-dataset/spam.csv', encoding='latin-1')
df.columns


Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [14]:
#keeping only necessary columns
df = df[['v1', 'v2']]  
df.columns = ['label', 'message'] #renaming

#encoding labels 'ham' = 0, 'spam' = 1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [15]:
#split in test and train
X_train, X_test, y_train, y_test = train_test_split(
    df['message'], df['label'], test_size=0.2, random_state=42)

In [16]:
#tf-idf vectorization
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [17]:
#naive bayes model training
model = MultinomialNB()
model.fit(X_train_vec, y_train)

In [18]:
#evaluation
y_pred = model.predict(X_test_vec)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))

✅ Accuracy: 0.9623318385650225

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.72      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [19]:
#testing on a sample message
# Try your own SMS
your_msg = "Congratulations! You've won."
your_vec = vectorizer.transform([your_msg])
predicted_label = model.predict(your_vec)[0]
print("Predicted Label:", "Spam" if predicted_label == 1 else "Ham")


Predicted Label: Ham
