In [5]:
import pandas as pd

# Load the uploaded dataset
df = pd.read_csv('spam.csv', encoding='latin-1')

# Clean and prepare
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, errors='ignore')
df = df.rename(columns={'v1': 'label', 'v2': 'message'})

print(df.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [6]:
df['label'] = df['label'].map({'spam': 1, 'ham': 0})


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(df['message'])


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, df['label'], test_size=0.2, random_state=42)


In [9]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)
predictions = model.predict(X_test)


In [10]:
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))


Accuracy: 0.979372197309417
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.92      0.93      0.92       150

    accuracy                           0.98      1115
   macro avg       0.95      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [11]:
sample = ["Congratulations! You've won a free ticket. Call now."]
sample_features = vectorizer.transform(sample)
print(model.predict(sample_features))  # 1 for spam, 0 for ham


[1]
