In [None]:
import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv('../data/spam.csv', encoding='latin-1')

In [None]:
df.sample(5)

In [None]:
df.info()

In [None]:
# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Give the two remaining columns descriptive names.
column_names = {'v1': 'target', 'v2': 'text'}
df = df.rename(columns=column_names)

In [None]:
# Missing values

df.isnull().sum()

In [None]:
# Check for duplicate values

df.duplicated().sum()

In [None]:
df = df.drop_duplicates(keep='first')

In [None]:
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['target'] = le.fit_transform(df['target'])

# 2. EDA

In [None]:
df.head()

In [None]:
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Order the counts by encoded class and take each wedge label from the encoder,
# rather than relying on value_counts() happening to list ham before spam.
target_counts = df['target'].value_counts().sort_index()
plt.pie(target_counts, labels=le.inverse_transform(target_counts.index), autopct='%1.1f%%')
plt.show()

In [None]:
# Since data is imbalanced, we will use stratified sampling while splitting the data

In [None]:
import nltk

In [None]:
nltk.download('punkt_tab', quiet=True)

In [None]:
df['num_characters'] = df['text'].apply(len)

In [None]:
df.head()

In [None]:
# Number of word-level tokens in each message
df['num_words'] = df['text'].apply(lambda message: len(nltk.word_tokenize(message)))

In [None]:
# Number of sentences in each message
df['num_sentences'] = df['text'].apply(lambda message: len(nltk.sent_tokenize(message)))

In [None]:
df.head()

In [None]:
df[df['target'] == 0][['num_characters', 'num_words', 'num_sentences']].describe()

In [None]:
# Overlaid character-count distributions: ham in blue, spam in red
fig, ax = plt.subplots(figsize=(15, 6))
ham_chars = df.loc[df['target'] == 0, 'num_characters']
spam_chars = df.loc[df['target'] == 1, 'num_characters']
sns.histplot(ham_chars, color='blue', label='Ham', ax=ax)
sns.histplot(spam_chars, color='red', label='Spam', ax=ax)
ax.legend()
plt.show()

In [None]:
# Overlaid word-count distributions: ham in blue, spam in red
fig, ax = plt.subplots(figsize=(20, 6))
ham_words = df.loc[df['target'] == 0, 'num_words']
spam_words = df.loc[df['target'] == 1, 'num_words']
sns.histplot(ham_words, color='blue', label='Ham', ax=ax)
sns.histplot(spam_words, color='red', label='Spam', ax=ax)
ax.legend()
plt.show()

In [None]:
corr = df.select_dtypes(include=['number']).corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='Blues')
plt.show()

In [None]:
sns.pairplot(df, hue='target')
plt.show()

In [None]:
from nltk.stem.porter import PorterStemmer
import string
from nltk.corpus import stopwords
nltk.download('stopwords')

ps = PorterStemmer()


In [None]:
# Built once when this cell runs: the function is applied row by row, so
# rebuilding the stop-word set inside it would repeat the same work per message.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def transform_text(text):
    """Normalise one message into a space-separated string of stemmed keywords.

    The text is lower-cased and tokenised with ``nltk.word_tokenize``. Tokens
    that are not purely alphanumeric, or that are English stop words, are
    dropped; the remaining tokens are stemmed with the Porter stemmer ``ps``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The kept, stemmed tokens joined by single spaces (an empty string when
        no token survives the filters).
    """
    tokens = nltk.word_tokenize(text.lower())
    # isalnum() already rejects every punctuation-only token, so a separate
    # string.punctuation check would never filter anything further.
    return " ".join(
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in _ENGLISH_STOP_WORDS
    )


In [None]:
transform_text('can you come to the epstein island tommorrow? loving dancing %')

In [None]:
# Preview the transformation on a few rows only; the full column is computed
# (and stored) in the next cell, so transforming every row here would just
# repeat that work.
df['text'].head().apply(transform_text)

In [None]:
df['transformed_text'] = df['text'].apply(transform_text)

In [None]:
df.head()

In [None]:
!pip install wordcloud

In [None]:
from wordcloud import WordCloud
wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')

In [None]:
spam = wc.generate(df[df['target'] == 1]['transformed_text'].str.cat(sep=" "))

In [None]:
plt.imshow(spam)

In [None]:
df.head()

In [None]:
df[df['target'] == 1]['transformed_text'].head(10)

In [None]:
# Flatten the tokens of every spam message into one list of words
spam_words_list = df.loc[df['target'] == 1, 'transformed_text'].tolist()
spam_corpus = [word for text in spam_words_list for word in text.split()]

In [None]:
len(spam_corpus)

In [None]:
from collections import Counter
pd.DataFrame(Counter(spam_corpus).most_common(30), columns=['word', 'count'])

In [None]:
sns.barplot(x='word', y='count', data=pd.DataFrame(Counter(spam_corpus).most_common(30), columns=['word', 'count']))
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Flatten the tokens of every ham message into one list of words
ham_words_list = df.loc[df['target'] == 0, 'transformed_text'].tolist()
ham_corpus = [word for text in ham_words_list for word in text.split()]

In [None]:
sns.barplot(x='word', y='count', data=pd.DataFrame(Counter(ham_corpus).most_common(30), columns=['word', 'count']))
plt.xticks(rotation='vertical')
plt.show()

# 4. Model Building

In [74]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
cv = TfidfVectorizer(max_features=3000)

In [75]:
# Vectorise the cleaned text into a dense TF-IDF matrix (X) and pull out the labels (y).
# NOTE(review): the vectorizer is fitted on every row before the train/test split
# below, so its vocabulary and IDF weights also reflect messages that end up in
# the test set.
X = cv.fit_transform(df['transformed_text']).toarray()
y = df['target'].values

In [76]:
# Hold out 20% of the rows for evaluation; random_state fixes which rows are held out.
# NOTE(review): the EDA section says the split should be stratified because the
# classes are imbalanced, but stratify=y is not passed here — confirm which is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")

Train samples: 4135, Test samples: 1034


In [None]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

In [None]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)

# Report each metric for the Gaussian NB predictions on the test split
for caption, metric in (
    ("Accuracy Score:", accuracy_score),
    ("Precision Score:", precision_score),
    ("Confusion Matrix", confusion_matrix),
):
    print(caption, metric(y_test, y_pred_gnb))

In [None]:
mnb.fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)

# Report each metric for the Multinomial NB predictions on the test split
for caption, metric in (
    ("Accuracy Score:", accuracy_score),
    ("Precision Score:", precision_score),
    ("Confusion Matrix", confusion_matrix),
):
    print(caption, metric(y_test, y_pred_mnb))

In [None]:
bnb.fit(X_train, y_train)
y_pred_bnb = bnb.predict(X_test)

# Report each metric for the Bernoulli NB predictions on the test split
for caption, metric in (
    ("Accuracy Score:", accuracy_score),
    ("Precision Score:", precision_score),
    ("Confusion Matrix", confusion_matrix),
):
    print(caption, metric(y_test, y_pred_bnb))

In [None]:
!pip install xgboost

# 5. Model Improvement

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier

In [None]:
# One instance per candidate model.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
# Seeded like the ensembles below: without random_state the tree's scores drift
# between runs on identical data (the two recorded experiment outputs disagree
# on DTC only).
dtc = DecisionTreeClassifier(max_depth=5, random_state=2)
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
gbc = GradientBoostingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
xgbc = XGBClassifier(n_estimators=50, random_state=2)

In [None]:
# Short display name -> estimator instance, in the order the models are reported
clfs = dict(
    SVC=svc,
    KNC=knc,
    MNB=mnb,
    DTC=dtc,
    RFC=rfc,
    ABC=abc,
    GBC=gbc,
    ETC=etc,
    BC=bc,
    XGBC=xgbc,
)

In [None]:
def train_classifier(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` computed with ``accuracy_score`` and
        ``precision_score`` from the predictions on ``X_test``.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions),
    )

In [None]:
train_classifier(svc, X_train, y_train, X_test, y_test)

In [None]:
# --- Reusable function to run all classifiers and return results ---
def run_all_classifiers(clfs, X_train, y_train, X_test, y_test):
    """Train and score every classifier in ``clfs`` on the given split.

    Each model's scores are printed as they become available.

    Returns
    -------
    dict
        ``{model_name: (accuracy, precision)}`` in the iteration order of ``clfs``.
    """
    scores = {}
    for model_name in clfs:
        acc, prec = train_classifier(clfs[model_name], X_train, y_train, X_test, y_test)
        scores[model_name] = (acc, prec)
        print(f"{model_name}: Accuracy={acc:.4f}, Precision={prec:.4f}")
    return scores

In [79]:
# Dictionary to store results from all experiments
# Key = experiment name, Value = dict of {model_name: (accuracy, precision)}
all_experiments = {}

# --- Experiment 1: baseline on the TF-IDF features built above ---
# NOTE(review): `cv` was created with max_features=3000, so this is not a
# default-TfidfVectorizer run; it trains on the same X_train/X_test as the
# 'max_ft_3000' experiment.
print("=== Experiment: original ===")
all_experiments['original'] = run_all_classifiers(clfs, X_train, y_train, X_test, y_test)

=== Experiment: original ===
SVC: Accuracy=0.9758, Precision=0.9748
KNC: Accuracy=0.9052, Precision=1.0000
MNB: Accuracy=0.9710, Precision=1.0000
DTC: Accuracy=0.9284, Precision=0.8200
RFC: Accuracy=0.9758, Precision=0.9829
ABC: Accuracy=0.9246, Precision=0.8488
GBC: Accuracy=0.9468, Precision=0.9192
ETC: Accuracy=0.9749, Precision=0.9746
BC: Accuracy=0.9584, Precision=0.8682
XGBC: Accuracy=0.9671, Precision=0.9483


In [77]:
# --- Experiment 2: TfidfVectorizer with max_features=3000 ---
# (Re-using same X_train, X_test since cv was already set to max_features=3000)
print("=== Experiment: max_ft_3000 ===")
all_experiments['max_ft_3000'] = run_all_classifiers(clfs, X_train, y_train, X_test, y_test)

=== Experiment: max_ft_3000 ===
SVC: Accuracy=0.9758, Precision=0.9748
KNC: Accuracy=0.9052, Precision=1.0000
MNB: Accuracy=0.9710, Precision=1.0000
DTC: Accuracy=0.9313, Precision=0.8252
RFC: Accuracy=0.9758, Precision=0.9829
ABC: Accuracy=0.9246, Precision=0.8488
GBC: Accuracy=0.9468, Precision=0.9192
ETC: Accuracy=0.9749, Precision=0.9746
BC: Accuracy=0.9584, Precision=0.8682
XGBC: Accuracy=0.9671, Precision=0.9483


In [80]:
# --- Helper function to build comparison dataframe from all experiments ---
def build_comparison_df(all_experiments):
    """Combine the per-experiment scores into one comparison DataFrame.

    Parameters
    ----------
    all_experiments : dict
        ``{experiment_name: {model_name: (accuracy, precision)}}``.

    Returns
    -------
    pandas.DataFrame
        One row per model of the first experiment, with a ``Model`` column plus
        ``Accuracy_<name>`` and ``Precision_<name>`` columns for every
        experiment, sorted by the last experiment's precision (highest first).
        A model missing from an experiment gets NaN scores for it. An empty
        ``all_experiments`` yields an empty frame with only the ``Model`` column.
    """
    # Nothing recorded yet: return an empty frame instead of failing on [0].
    if not all_experiments:
        return pd.DataFrame({'Model': []})

    experiment_names = list(all_experiments)
    # The first experiment defines which models are listed and in what order.
    model_names = list(all_experiments[experiment_names[0]])
    missing = (float('nan'), float('nan'))

    comparison = pd.DataFrame({'Model': model_names})
    for exp_name in experiment_names:
        results = all_experiments[exp_name]
        scores = [results.get(model, missing) for model in model_names]
        comparison[f'Accuracy_{exp_name}'] = [score[0] for score in scores]
        comparison[f'Precision_{exp_name}'] = [score[1] for score in scores]

    # Sort by the last experiment's precision; NaN rows end up at the bottom.
    return comparison.sort_values(by=f'Precision_{experiment_names[-1]}', ascending=False)

# Build and display comparison so far
comparison_df = build_comparison_df(all_experiments)
comparison_df

Unnamed: 0,Model,Accuracy_original,Precision_original
1,KNC,0.905222,1.0
2,MNB,0.970986,1.0
4,RFC,0.975822,0.982906
0,SVC,0.975822,0.97479
7,ETC,0.974855,0.974576
9,XGBC,0.967118,0.948276
6,GBC,0.946809,0.919192
8,BC,0.958414,0.868217
5,ABC,0.924565,0.848837
3,DTC,0.928433,0.82


### 2. Let's try scaling the features

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Split before scaling, with the same split arguments as the baseline so the
# same rows are held out.
X_train_unscaled, X_test_unscaled, y_train_scaled, y_test_scaled = train_test_split(
    X, y, test_size=0.2, random_state=2
)

# Fit the scaler on the training rows only; the test rows are transformed with
# the training min/max so they never influence the scaling applied to them.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_unscaled)
X_test_scaled = scaler.transform(X_test_unscaled)

print("=== Experiment: minmax_scaled ===")
all_experiments['minmax_scaled'] = run_all_classifiers(
    clfs, X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled
)

In [None]:
comparison_df = build_comparison_df(all_experiments)
comparison_df

In [None]:
###############################################
# It means scaling the features did not improve the performance of the models. 
# In fact, it seems to have slightly decreased the precision for most models. 
# This suggests that for this particular dataset and feature representation (TF-IDF), scaling may not be necessary or beneficial. 
# However, it's always good to experiment with different preprocessing techniques as their impact can vary based on the data and model used.
###############################################

### 3. Let's try adding extra features (num_characters, num_words, num_sentences) to the original X and see whether they improve the performance of the models.

In [None]:
X_nc = np.hstack((X, df['num_characters'].values.reshape(-1, 1)))

In [None]:
# Same split arguments as the other experiments so the same rows are held out.
# The arrays are named after the added feature: nothing in this experiment is scaled.
X_train_nc, X_test_nc, y_train_nc, y_test_nc = train_test_split(
    X_nc, y, test_size=0.2, random_state=2
)

print("=== Experiment: adding num_characters ===")
all_experiments['num_characters'] = run_all_classifiers(
    clfs, X_train_nc, y_train_nc, X_test_nc, y_test_nc
)

In [None]:
comparison_df = build_comparison_df(all_experiments)
comparison_df

In [None]:
X_nw = np.hstack((X, df['num_words'].values.reshape(-1, 1)))

In [None]:
# Same split arguments as the other experiments so the same rows are held out.
# The arrays are named after the added feature: nothing in this experiment is scaled.
X_train_nw, X_test_nw, y_train_nw, y_test_nw = train_test_split(
    X_nw, y, test_size=0.2, random_state=2
)

print("=== Experiment: adding num_words ===")
all_experiments['num_words'] = run_all_classifiers(
    clfs, X_train_nw, y_train_nw, X_test_nw, y_test_nw
)

In [None]:
comparison_df = build_comparison_df(all_experiments)
comparison_df

### 4. Voting Classifier

In [None]:
# Fresh base estimators for the ensembles below.
# Soft voting averages each estimator's predict_proba output, which SVC only
# provides when constructed with probability=True; random_state fixes the
# internal cross-validation used to calibrate those probabilities.
svc = SVC(kernel='sigmoid', gamma=1.0, probability=True, random_state=2)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
rfc = RandomForestClassifier(n_estimators=50, random_state=2)

from sklearn.ensemble import VotingClassifier

In [None]:
voting = VotingClassifier(estimators=[('svc', svc), ('mnb', mnb), ('rfc', rfc)], voting='soft')

In [None]:
voting.fit(X_train, y_train)

In [None]:
y_pred = voting.predict(X_test)
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Precision Score:", precision_score(y_test, y_pred))

### 5. Stacking

In [None]:
estimators = [('svc', svc), ('nb', mnb), ('et', etc)]
# Seeded like the other forests in this notebook so the stacked scores are
# repeatable from run to run.
final_estimator = RandomForestClassifier(random_state=2)

In [None]:
from sklearn.ensemble import StackingClassifier

In [None]:
clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)

In [None]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Precision Score:", precision_score(y_test, y_pred))

In [81]:
import pickle

# The voting and stacking ensembles fit clones of their estimators, so the
# `mnb` instance re-created for them is still unfitted when the notebook is run
# top to bottom. Fit it explicitly so the saved model can actually predict.
mnb.fit(X_train, y_train)

# Context managers close the files even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)