In [27]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
import random
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
import numpy as np

In [28]:
data = pd.read_csv('spambase.data', sep=",", names=["word_freq_make", "word_freq_address", "word_freq_all", "word_freq_3d", "word_freq_our", "word_freq_over", "word_freq_remove", "word_freq_internet", "word_freq_order", "word_freq_mail", "word_freq_receive", "word_freq_will", "word_freq_people", "word_freq_report", "word_freq_addresses", "word_freq_free", "word_freq_business", "word_freq_email", "word_freq_you", "word_freq_credit", "word_freq_your", "word_freq_font", "word_freq_000", "word_freq_money", "word_freq_hp", "word_freq_hpl", "word_freq_george", "word_freq_650", "word_freq_lab",
                   "word_freq_labs", "word_freq_telnet", "word_freq_857", "word_freq_data", "word_freq_415", "word_freq_85", "word_freq_technology", "word_freq_1999", "word_freq_parts", "word_freq_pm", "word_freq_direct", "word_freq_cs", "word_freq_meeting", "word_freq_original", "word_freq_project", "word_freq_re", "word_freq_edu", "word_freq_table", "word_freq_conference", "char_freq_semicolon", "char_freq_left_paren", "char_freq_left_bracket", "char_freq_exclamation", "char_freq_dollar", "char_freq_pound", "capital_run_length_average", "capital_run_length_longest", "capital_run_length_total", "is_spam"])

In [29]:
data.target = data.loc[:,'is_spam'].tolist()

random.shuffle(data.target)

print(data.target)

[0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 

  data.target = data.loc[:,'is_spam'].tolist()


In [30]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(data, data.target, test_size=0.5,random_state=109) # 70% training and 30% test

In [31]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Evaluate the performance of the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))

Accuracy: 0.5867014341590613
Precision: 0.3488372093023256
Recall: 0.032432432432432434
F1-score: 0.05934718100890208


In [32]:
# Apply SMOTE to oversample the minority class
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

# Scale the features
scaler = StandardScaler()
X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Build the logistic regression model
model = LogisticRegression()
model.fit(X_train_resampled_scaled, y_train_resampled)

# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Evaluate the performance of the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))

Accuracy: 0.49500217296827465
Precision: 0.3931469792605951
Recall: 0.47135135135135137
F1-score: 0.4287118977384464


In [33]:
from imblearn.under_sampling import TomekLinks

# Undersample the majority class using Tomek links
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_resample(X_train_scaled, y_train)

# Build the logistic regression model on the resampled data
model = LogisticRegression()
model.fit(X_resampled, y_resampled)

# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Evaluate the performance of the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))


Accuracy: 0.5732290308561495
Precision: 0.4
Recall: 0.12324324324324325
F1-score: 0.18842975206611573


In [34]:
from sklearn.model_selection import cross_val_score

# Apply SMOTE to oversample the minority class
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

# Build the logistic regression model on the resampled data
model = LogisticRegression()
model.fit(X_resampled, y_resampled)

# Use cross-validation to evaluate the performance of the model
scores = cross_val_score(model, X_resampled, y_resampled, cv=5, scoring='f1')

# Calculate the mean and standard deviation of the F1 scores
mean_score = np.mean(scores)
std_score = np.std(scores)

# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Evaluate the performance of the model on the test data
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))
print("Cross-validation F1-score mean:", mean_score)
print("Cross-validation F1-score standard deviation:", std_score)


Accuracy: 0.49500217296827465
Precision: 0.400336417157275
Recall: 0.5145945945945946
F1-score: 0.45033112582781454
Cross-validation F1-score mean: 0.5352318612557444
Cross-validation F1-score standard deviation: 0.02854671950716237


In [35]:
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import cross_val_score

# Undersample the majority class using Tomek links
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_resample(X_train_scaled, y_train)

# Build the logistic regression model on the resampled data
model = LogisticRegression()
model.fit(X_resampled, y_resampled)

# Use cross-validation to evaluate the performance of the model
scores = cross_val_score(model, X_resampled, y_resampled, cv=5, scoring='f1')

# Calculate the mean and standard deviation of the F1 scores
mean_score = np.mean(scores)
std_score = np.std(scores)

# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Evaluate the performance of the model on the test data
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))
print("Cross-validation F1-score mean:", mean_score)
print("Cross-validation F1-score standard deviation:", std_score)


Accuracy: 0.5732290308561495
Precision: 0.4
Recall: 0.12324324324324325
F1-score: 0.18842975206611573
Cross-validation F1-score mean: 0.22284609847838813
Cross-validation F1-score standard deviation: 0.041431939971035586


In [36]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define the parameter grid for grid search
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2', 'elasticnet']
}

# Create the grid search object
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)

# Fit the grid search object to the data
grid.fit(X_train, y_train)

# Predict using the best estimator found by grid search
y_pred = grid.predict(X_test)

# Evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy: 0.5958279009126467
Precision: 0.4576271186440678
Recall: 0.02918918918918919
F1-score: 0.054878048780487805


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
