In [13]:
import pandas as pd

# Load CSV with encoding fix
df = pd.read_csv('spam.csv', encoding='latin1')


In [14]:
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [15]:
df = df[['v1', 'v2']]  # Keep only useful columns
df.columns = ['label', 'message']  # Rename for clarity


In [None]:

# Remove duplicates and nulls
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)


In [17]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])  # ham = 0, spam = 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = le.fit_transform(df['label'])  # ham = 0, spam = 1


In [24]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('punkt')
nltk.download('stopwords')

ps = PorterStemmer()

def transform_text(text):
    text = text.lower()
    words = nltk.word_tokenize(text)
    filtered = [ps.stem(w) for w in words if w.isalnum() and w not in stopwords.words('english') and w not in string.punctuation]
    return " ".join(filtered)

df['transformed'] = df['message'].apply(transform_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['transformed'] = df['message'].apply(transform_text)


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['transformed']).toarray()
y = df['label'].values


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models to test
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM (Linear)": SVC(kernel='linear'),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

# Function to evaluate each model
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"\n🔎 {name}")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1 Score : {f1:.4f}")

# Run all models
for name, model in models.items():
    evaluate_model(name, model, X_train, X_test, y_train, y_test)



🔎 Naive Bayes
Accuracy : 0.9749
Precision: 1.0000
Recall   : 0.8133
F1 Score : 0.8971

🔎 Logistic Regression
Accuracy : 0.9480
Precision: 0.9600
Recall   : 0.6400
F1 Score : 0.7680

🔎 SVM (Linear)
Accuracy : 0.9794
Precision: 0.9774
Recall   : 0.8667
F1 Score : 0.9187

🔎 Random Forest
Accuracy : 0.9749
Precision: 1.0000
Recall   : 0.8133
F1 Score : 0.8971

🔎 Decision Tree
Accuracy : 0.9543
Precision: 0.8511
Recall   : 0.8000
F1 Score : 0.8247

🔎 KNN
Accuracy : 0.9193
Precision: 1.0000
Recall   : 0.4000
F1 Score : 0.5714


In [29]:
import pickle

pickle.dump(model, open('spam_model.pkl', 'wb'))
pickle.dump(tfidf, open('vectorizer.pkl', 'wb'))
