In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
def preprocess_text(text):
   # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize and remove stopwords
    words = text.split()
    words = [word for word in words if word not in stopwords.words('english')]
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [5]:
 def add_features(df):
    df['email_length'] = df['content'].apply(len)
    df['word_count'] = df['content'].apply(lambda x: len(x.split()))
    df['special_char_count'] = df['content'].apply(lambda x: len(re.findall(r'[!@#$%^&*]', x)))
    df['uppercase_count'] = df['content'].apply(lambda x: len(re.findall(r'\b[A-Z]+\b', x)))
    return df

In [9]:
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv("/content/email_spam.csv")

# Drop duplicate rows
df = df.drop_duplicates()

# Encode the target variable (spam = 1, not spam = 0)
df['type'] = df['type'].map({'spam': 1, 'not spam': 0})

# Preprocess 'title' and 'text', then combine them
df['content'] = (df['title'] + " " + df['text']).apply(preprocess_text)

# Add new features
df = add_features(df)

# Split the data into features (X) and target (y)
X_text = df['content']
X_features = df[['email_length', 'word_count', 'special_char_count', 'uppercase_count']]
y = df['type']

# Split into training and testing sets
X_train_text, X_test_text, X_train_features, X_test_features, y_train, y_test = train_test_split(
    X_text, X_features, y, test_size=0.2, random_state=42
)

# Convert text data into numerical features using TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

# Combine TF-IDF features with engineered features
X_train_combined = np.hstack((X_train_tfidf.toarray(), X_train_features.values))
X_test_combined = np.hstack((X_test_tfidf.toarray(), X_test_features.values))

# Define XGBoost classifier
xgb = XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42)

from lightgbm import LGBMClassifier

# Define LightGBM classifier
lgbm = LGBMClassifier(random_state=42)

# Hyperparameter tuning with RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'num_leaves': [31, 50, 100],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

random_search = RandomizedSearchCV(lgbm, param_distributions=param_dist, n_iter=50, scoring='accuracy', cv=3, n_jobs=-1, random_state=42)
random_search.fit(X_train_combined, y_train)

# Best model
best_lgbm_model = random_search.best_estimator_

# Predict on the test set
y_pred = best_lgbm_model.predict(X_test_combined)

from sklearn.metrics import accuracy_score, classification_report
# Evaluate the model
print("Best Parameters:", random_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[LightGBM] [Info] Number of positive: 20, number of negative: 46
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000054 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 62
[LightGBM] [Info] Number of data points in the train set: 66, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.303030 -> initscore=-0.832909
[LightGBM] [Info] Start training from score -0.832909
Best Parameters: {'subsample': 0.9, 'num_leaves': 31, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05, 'colsample_bytree': 0.9}
Accuracy: 0.6470588235294118

Classification Report:
               precision    recall  f1-score   support

           0       0.65      1.00      0.79        11
           1       0.00      0.00      0.00         6

    accuracy                           0.65        17
   macro avg       0.32      0.50      0.39