In [8]:
import pandas as pd
# Download the SMS spam CSV and decode it as latin-1.
raw_df = pd.read_csv(
    "https://raw.githubusercontent.com/mohitgupta-1O1/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv",
    encoding='latin-1'
)

# Numeric encoding of the target: 0 = ham (not spam), 1 = spam.
LABEL_MAP = {'ham': 0, 'spam': 1}

# Build the working frame in one chain so `df` is an independent frame rather
# than a column subset that is assigned into afterwards (a pattern pandas may
# flag with SettingWithCopyWarning). Only the label (`v1`) and text (`v2`)
# columns are kept; any other columns in the file are ignored.
df = (
    raw_df[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
    .assign(label=lambda frame: frame['label'].map(LABEL_MAP))
)

# Series.map turns every value that is missing from LABEL_MAP into NaN. Fail
# here with a clear message instead of letting NaN labels reach the model.
unmapped = df['label'].isna()
if unmapped.any():
    unexpected = sorted(raw_df.loc[unmapped, 'v1'].astype(str).unique())
    raise ValueError(f"Unexpected label values in column 'v1': {unexpected}")

print(df.head())

   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
messages, labels = df['message'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.20,
    random_state=42,
)

# Report how many messages ended up on each side of the split.
print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 4457
Test size: 1115


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features capped at 3000 terms, with English stop words removed.
tfidf = TfidfVectorizer(max_features=3000, stop_words='english')

# Fit on the training messages only; the test messages are encoded with the
# already-fitted vectorizer so nothing from the test set influences it.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

train_shape = X_train_tfidf.shape
print("Vectorization done. Feature shape:", train_shape)

Vectorization done. Feature shape: (4457, 3000)


In [11]:
from sklearn.linear_model import LogisticRegression
# Train a logistic-regression classifier on the TF-IDF training matrix.
# `fit` returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=5000).fit(X_train_tfidf, y_train)
print("Model trained successfully!")

Model trained successfully!


In [12]:
from sklearn.metrics import accuracy_score, classification_report
# Score the held-out messages and summarise the results.
y_pred = model.predict(X_test_tfidf)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)

Accuracy: 0.9641255605381166

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       0.97      0.75      0.85       150

    accuracy                           0.96      1115
   macro avg       0.97      0.88      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [14]:
# Show per-message spam probabilities for the first 20 held-out messages.
sample_messages = X_test.iloc[:20]
sample_vectors = tfidf.transform(sample_messages)
probabilities = model.predict_proba(sample_vectors)

print("\nCategories: ['Not Spam', 'Spam']\n")

# Column 1 of predict_proba is read as the spam probability; this relies on
# label 1 (spam) being the second class under the 0/1 label encoding.
for number, spam_prob in enumerate(probabilities[:, 1], start=1):
    if spam_prob > 0.5:
        verdict = "SPAM"
    else:
        verdict = "NOT SPAM"
    print(f"Email {number}: Probability of spam={spam_prob:.2f}->Prediction:{verdict}")


Categories: ['Not Spam', 'Spam']

Email 1: Probability of spam=0.08->Prediction:NOT SPAM
Email 2: Probability of spam=0.09->Prediction:NOT SPAM
Email 3: Probability of spam=0.22->Prediction:NOT SPAM
Email 4: Probability of spam=0.03->Prediction:NOT SPAM
Email 5: Probability of spam=0.89->Prediction:SPAM
Email 6: Probability of spam=0.08->Prediction:NOT SPAM
Email 7: Probability of spam=0.03->Prediction:NOT SPAM
Email 8: Probability of spam=0.13->Prediction:NOT SPAM
Email 9: Probability of spam=0.02->Prediction:NOT SPAM
Email 10: Probability of spam=0.06->Prediction:NOT SPAM
Email 11: Probability of spam=0.08->Prediction:NOT SPAM
Email 12: Probability of spam=0.94->Prediction:SPAM
Email 13: Probability of spam=0.05->Prediction:NOT SPAM
Email 14: Probability of spam=0.02->Prediction:NOT SPAM
Email 15: Probability of spam=0.04->Prediction:NOT SPAM
Email 16: Probability of spam=0.02->Prediction:NOT SPAM
Email 17: Probability of spam=0.07->Prediction:NOT SPAM
Email 18: Probability of spam=