In [3]:
import numpy as np
import pandas as pd
import nltk
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report

In [4]:
# Load the spam/ham message dataset, discard the three unnamed extra columns,
# and give the two remaining columns descriptive names.
df = (
    pd.read_csv("spam.csv", encoding="ISO-8859-1")
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
    .rename(columns={"v1": "Category", "v2": "text"})
)

In [5]:
# Peek at the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,Category,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
label_codes = {"ham": 0, "spam": 1}

# Values that are already encoded pass through unchanged, so re-running this
# cell does not turn every label into NaN (which a plain .map(dict) would do).
encoded = df["Category"].map(lambda label: label_codes.get(label, label))

# Fail loudly on anything that is neither a known label nor an existing code,
# instead of letting it silently become NaN and corrupt the targets.
unexpected = encoded[~encoded.isin(list(label_codes.values()))]
if not unexpected.empty:
    raise ValueError(
        f"Unexpected Category values: {unexpected.unique().tolist()!r}"
    )

# Explicit int64 keeps the dtype stable across platforms.
df["Category"] = encoded.astype("int64")

In [7]:
# Drop fully duplicated rows (keeping the first occurrence) and renumber the
# index 0..n-1 in the same call.
df = df.drop_duplicates(keep="first", ignore_index=True)

In [8]:
# Fetch the tokenizer models and stopword list that clean_text relies on.
# Recent NLTK releases (3.8.2/3.9 onward) make word_tokenize load "punkt_tab"
# rather than the older pickled "punkt" models, so download both to keep the
# notebook working on old and new NLTK versions.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

# Shared stemmer and English stopword set, read as globals by clean_text.
ps = PorterStemmer()
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to C:\Users\acer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer/nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [9]:
def clean_text(text):
    """Normalize a message into a space-separated string of stemmed tokens.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; tokens
    that are not purely alphanumeric, or that appear in the module-level
    ``stop_words`` set, are discarded, and the survivors are stemmed with the
    module-level ``ps`` stemmer.

    Args:
        text: The message to normalize (must support ``.lower()``).

    Returns:
        The kept, stemmed tokens joined by single spaces.
    """
    kept_stems = (
        ps.stem(token)
        for token in nltk.word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    )
    return " ".join(kept_stems)

In [10]:
# Features are the message text; targets are the Category column.
X, y = df["text"], df["Category"]

# Hold out 20% for testing. Stratifying on y keeps the class ratio the same in
# both splits, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [11]:
# Two-stage pipeline: TF-IDF features (with clean_text as the vectorizer's
# preprocessor) feeding a multinomial Naive Bayes classifier.
tfidf_step = ("tfidf", TfidfVectorizer(preprocessor=clean_text))
mnb_step = ("mnb", MultinomialNB())
pipe = Pipeline(steps=[tfidf_step, mnb_step])


In [None]:
# Hyperparameter grid for the search; each "<step>__<param>" key addresses a
# parameter of the pipeline step named "tfidf" or "mnb".
param_grid = dict(
    tfidf__max_features=[3000, 5000, 8000, 12000],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__min_df=[1, 2, 3],
    tfidf__sublinear_tf=[True, False],
    mnb__alpha=[0.1, 0.3, 0.5, 0.7, 1.0],
)

In [13]:
# Exhaustive 5-fold cross-validated search over param_grid, ranking candidates
# by F1 and using all available cores.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring="f1", cv=5, n_jobs=-1)

# fit() returns the fitted search object, so the refit winning pipeline can be
# read straight off the result.
best_model = grid.fit(X_train, y_train).best_estimator_

In [14]:
# Score the tuned pipeline on the held-out test split.
y_pred = best_model.predict(X_test)

# Compute every metric up front, then report them together.
test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print(" Best Params:", grid.best_params_)
print(" Accuracy:", test_accuracy)
print("Precision:", test_precision)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)


 Best Params: {'mnb__alpha': 0.1, 'tfidf__max_features': 12000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2), 'tfidf__sublinear_tf': True}
 Accuracy: 0.9893617021276596
Precision: 0.9838709677419355

Confusion Matrix:
 [[901   2]
 [  9 122]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       903
           1       0.98      0.93      0.96       131

    accuracy                           0.99      1034
   macro avg       0.99      0.96      0.98      1034
weighted avg       0.99      0.99      0.99      1034



In [15]:
import pickle

# Persist the tuned pipeline. The context manager guarantees the file is
# flushed and closed even if serialization fails part-way.
# NOTE: pickle stores the vectorizer's preprocessor by reference, so whatever
# loads model.pkl must have clean_text (and the ps / stop_words globals it
# reads) defined under the same module name before unpickling.
with open("model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)