In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import nltk

In [2]:
#download nltk data
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#load datasets
df = pd.read_csv('spam.csv', encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

In [4]:
#text preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = text.split()
    tokens = [PorterStemmer().stem(word) for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

df['message'] = df['message'].apply(preprocess_text)

In [5]:
#encode labels
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [6]:
#split dataset
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.2, random_state=42)

In [7]:
# Define pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

In [8]:
# Hyperparameter tuning
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_df': [0.75, 1.0],
    'tfidf__min_df': [1, 2],
    'classifier__alpha': [0.01, 0.1, 1]
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [9]:
# Best model
best_model = grid_search.best_estimator_

In [10]:
y_pred = best_model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'F1 Score: {f1_score(y_test, y_pred)}')
print(f'ROC-AUC: {roc_auc_score(y_test, y_pred)}')

Accuracy: 0.9775784753363229
Precision: 0.9251700680272109
Recall: 0.9066666666666666
F1 Score: 0.9158249158249158
ROC-AUC: 0.9476338514680485


ADDING ADVANCED FEATURES

In [11]:
import tensorflow as tf

In [12]:
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
# Tokenize and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['message'])
sequences = tokenizer.texts_to_sequences(df['message'])
word_index = tokenizer.word_index

data = pad_sequences(sequences, maxlen=100)
labels = np.array(df['label'])

In [14]:
# Word2Vec model
w2v_model = Word2Vec(sentences=df['message'].apply(str.split), vector_size=100, window=5, min_count=1, workers=4)
word_vectors = w2v_model.wv

In [15]:
# Create embedding matrix
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]

In [16]:
# Use embedding_matrix in a neural network model
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D

In [17]:
model = Sequential([
    Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=100, trainable=False),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

In [18]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(data, labels, validation_split=0.2, epochs=5, batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x23121456740>