In [13]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from graphviz import Source

# Preprocessing
from sklearn.model_selection import train_test_split

# Evaluation
from sklearn.metrics import accuracy_score

# Models
from src.models.cart import CART

# Load dataset
from src.data.load_dataset import load_spambase

In [15]:
X, y = load_spambase()
# Split the dataset into training+validation and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Further split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, stratify=y_train_val, random_state=42) # 0.25 x 0.8 = 0.2

X_train.shape, X_val.shape, X_test.shape

((2760, 57), (920, 57), (921, 57))

### Train the baseline CART model with raw features

In [4]:
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_impurity_decrease': [0, 0.1, 0.2],
    'max_features': [None, 'sqrt', 'log2', 0.5, 0.75]
}

results = []
best_score = 0
best_params = {}


def process_max_features(max_features, n_features):
    if max_features is None:
        max_features_val = None
    elif isinstance(max_features, int):
        max_features_val = max_features
    elif isinstance(max_features, float):
        # Fraction of features, ensure it's an integer
        max_features_val = int(max_features * n_features)
    elif max_features == 'sqrt':
        max_features_val = int(np.sqrt(n_features))
    elif max_features == 'log2':
        max_features_val = int(np.log2(n_features))
    else:
        raise ValueError("max_features must be None, int, float, 'sqrt', or 'log2'.")
    return max_features_val


# Iterate over all combinations of parameters
for max_depth in param_grid['max_depth']:
    for min_samples_split in param_grid['min_samples_split']:
        for min_impurity_decrease in param_grid['min_impurity_decrease']:
            for max_features in param_grid['max_features']:
                # Convert max_features accordingly
                n_features = X_train.shape[1]
                max_features_val = process_max_features(max_features, n_features)
                
                # Initialize and fit the CART model
                model = CART(max_depth=max_depth, 
                             min_samples_split=min_samples_split,
                             min_impurity_decrease=min_impurity_decrease,
                             max_features=max_features_val)
                
                model.fit(X_train, y_train)
                
                # Predict on the validation set and evaluate
                y_pred_val = model.predict(X_val)
                score = accuracy_score(y_val, y_pred_val)
                
                results.append({'max_depth': max_depth,
                                'min_samples_split': min_samples_split,
                                'min_impurity_decrease': min_impurity_decrease,
                                'max_features': max_features,
                                'score': score})
                
                if score > best_score:
                    best_score = score
                    best_params = {'max_depth': max_depth,
                                   'min_samples_split': min_samples_split,
                                   'min_impurity_decrease': min_impurity_decrease,
                                   'max_features': max_features_val}
            print(f"Finished iteration for max_depth={max_depth}, min_samples_split={min_samples_split}, min_impurity_decrease={min_impurity_decrease}, max_features={max_features}")

print(f"Best Validation Score: {best_score}")
print("Best Parameters:", best_params)

# After finding the best parameters, evaluate the model on the test set
best_model = CART(**best_params)
best_model.fit(X_train_val, y_train_val)  # Train on the full training dataset
test_predictions = best_model.predict(X_test)
test_score = accuracy_score(y_test, test_predictions)
print(f"Test Score: {test_score}")


Finished iteration for max_depth=None, min_samples_split=2, min_impurity_decrease=0, max_features=0.75
Finished iteration for max_depth=None, min_samples_split=2, min_impurity_decrease=0.1, max_features=0.75
Finished iteration for max_depth=None, min_samples_split=2, min_impurity_decrease=0.2, max_features=0.75
Finished iteration for max_depth=None, min_samples_split=5, min_impurity_decrease=0, max_features=0.75
Finished iteration for max_depth=None, min_samples_split=5, min_impurity_decrease=0.1, max_features=0.75
Finished iteration for max_depth=None, min_samples_split=5, min_impurity_decrease=0.2, max_features=0.75
Finished iteration for max_depth=None, min_samples_split=10, min_impurity_decrease=0, max_features=0.75
Finished iteration for max_depth=None, min_samples_split=10, min_impurity_decrease=0.1, max_features=0.75
Finished iteration for max_depth=None, min_samples_split=10, min_impurity_decrease=0.2, max_features=0.75
Finished iteration for max_depth=10, min_samples_split=2, 

In [5]:
# Get DOT data
dot_data = best_model.export_graphviz(full_verbose=False, leaf_verbose=True)

# Draw graph
graph = Source(dot_data, format="pdf", filename="cart_tree_custom_implementation", directory="../reports/baseline_models/")
graph.render(view=True)

'../reports/baseline_models/cart_tree_custom_implementation.pdf'

In [10]:
import os
import json

from src.config import EXPERIMENT_DIR

experiment_config = {
    "name": "custom_cart_v2",
    "description": "Custom cart v2",
    "model_class": "src.models.cart.CART",
    "model_params": best_params,
    "metrics": [
        "accuracy",
        "f1",
        "roc_auc"
    ]
}
experiment_dir = os.path.join(EXPERIMENT_DIR, 'custom_cart_v2')
os.makedirs(experiment_dir, exist_ok=True)
exp_config_path = os.path.join(experiment_dir, 'config.json')
with open(exp_config_path, 'w') as file:
    json.dump(experiment_config, file, indent=4)    

### Using sklearn's DecisionTreeClassifier

In [6]:
from sklearn.tree import DecisionTreeClassifier

In [7]:
## Use sklearn's DecisionTreeClassifier to compare with our implementation
sklearn_model = DecisionTreeClassifier(**best_params)
sklearn_model.fit(X_train_val, y_train_val)
sklearn_test_predictions = sklearn_model.predict(X_test)
sklearn_test_score = accuracy_score(y_test, sklearn_test_predictions)
print(f"Test Score (sklearn): {sklearn_test_score}:2f")

Test Score (sklearn): 0.9109663409337676:2f


In [13]:
from sklearn.tree import export_graphviz

## Get DOT data
dot_data = export_graphviz(sklearn_model, filled=True, rounded=True, special_characters=True, feature_names=np.arange(X_train_val.shape[1]), class_names=['0', '1'])

## Draw graph
graph = Source(dot_data, format="pdf", filename="sklearn_cart_tree", directory="../reports/baseline_models/")
graph.render(view=True)

'../reports/baseline_models/sklearn_cart_tree.pdf'

In [8]:
import os
import json

from src.config import EXPERIMENT_DIR

experiment_config = {
    "name": "sklearn_cart_w_best_params",
    "description": "Sklearn cart with best parameters",
    "model_class": "sklearn.tree.DecisionTreeClassifier",
    "model_params": best_params,
    "metrics": [
        "accuracy",
        "f1",
        "roc_auc"
    ]
}
experiment_dir = os.path.join(EXPERIMENT_DIR, 'sklearn_cart_w_best_params')
os.makedirs(experiment_dir, exist_ok=True)
exp_config_path = os.path.join(experiment_dir, 'config.json')
with open(exp_config_path, 'w') as file:
    json.dump(experiment_config, file, indent=4)    

In [9]:
### Using GridSearchCV to find the best parameters
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_impurity_decrease': [0, 0.1, 0.2],
    'max_features': [None, 'sqrt', 'log2', 0.5, 0.75]
}

model = DecisionTreeClassifier()
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_val, y_train_val)

print(f"Best Validation Score (sklearn): {grid_search.best_score_}")

### Random Forest Classifier

In [4]:
from src.models.random_forest import RandomForestClassifier

In [5]:
best_params = {'max_depth': 10, 'min_samples_split': 2, 'min_impurity_decrease': 0, 'max_features': 42}

In [6]:
param_grid = {
    'n_estimators': [100, 200, 500]
}

# let's try different n_estimators with our best parameters from the previous step and initialize Random Forest model

results = []
best_score = 0
best_n_estimators = None

for n_estimators in param_grid['n_estimators']:
    model = RandomForestClassifier(n_estimators=n_estimators, **best_params)
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    score = accuracy_score(y_val, predictions)
    results.append({'n_estimators': n_estimators, 'score': score})
    if score > best_score:
        best_score = score
        best_n_estimators = n_estimators
    print(f"Finished iteration for n_estimators={n_estimators}")

Finished iteration for n_estimators=100
Finished iteration for n_estimators=200
Finished iteration for n_estimators=500


In [9]:
random_forest_model = RandomForestClassifier(n_estimators=best_n_estimators, **best_params)
random_forest_model.fit(X_train_val, y_train_val)

test_predictions = random_forest_model.predict(X_test)
test_score = accuracy_score(y_test, test_predictions)
print(f"Test Score (Random Forest): {test_score}")

Test Score (Random Forest): 0.9294245385450597


In [10]:
import os
import json

from src.config import EXPERIMENT_DIR

experiment_config = {
    "name": "custom_rf_w_best_params",
    "description": "Custom random forest with best parameters",
    "model_class": "src.models.random_forest.RandomForestClassifier",
    "model_params": {'n_estimators': best_n_estimators, **best_params},
    "metrics": [
        "accuracy",
        "f1",
        "roc_auc"
    ]
}
experiment_dir = os.path.join(EXPERIMENT_DIR, 'custom_rf_w_best_params')
os.makedirs(experiment_dir, exist_ok=True)
exp_config_path = os.path.join(experiment_dir, 'config.json')
with open(exp_config_path, 'w') as file:
    json.dump(experiment_config, file, indent=4)    

### Sklearn's RandomForestClassifier

In [11]:
from sklearn.ensemble import RandomForestClassifier as SklearnRandomForestClassifier

In [12]:
sklearn_rf_model = SklearnRandomForestClassifier(n_estimators=best_n_estimators, **best_params)
sklearn_rf_model.fit(X_train_val, y_train_val)

sklearn_test_predictions = sklearn_rf_model.predict(X_test)
sklearn_test_score = accuracy_score(y_test, sklearn_test_predictions)
print(f"Test Score (sklearn Random Forest): {sklearn_test_score}")

Test Score (sklearn Random Forest): 0.9272529858849077


## Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# scale the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_val = scaler.fit_transform(X_train_val)
X_test = scaler.transform(X_test)

# Define the parameter grid to search over
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 1000, 2500, 5000]
}

# Initialize the model
model = LogisticRegression()

# Initialize GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV
grid_search.fit(X_train_val, y_train_val)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters found: ", best_params)

# Make predictions with the best model
predictions = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: ", accuracy)


1300 fits failed out of a total of 2000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/email_spam_classification/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/email_spam_classification/lib/python3.9/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/ajaykarthicksenthilkumar/miniconda/envs/email_spam_classification/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1172

Best parameters found:  {'C': 1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'sag'}
Accuracy:  0.9294245385450597




In [21]:
best_params

{'C': 1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'sag'}

In [23]:
import os
import json

from src.config import EXPERIMENT_DIR

experiment_config = {
    "name": "sklearn_logistic_regression_w_best_params",
    "description": "Sklearn logistic regression with best parameters",
    "model_class": "sklearn.linear_model.LogisticRegression",
    "model_params": best_params,
    "metrics": [
        "accuracy",
        "f1",
        "roc_auc"
    ]
}
experiment_dir = os.path.join(EXPERIMENT_DIR, 'sklearn_logistic_regression_w_best_params')
os.makedirs(experiment_dir, exist_ok=True)
exp_config_path = os.path.join(experiment_dir, 'config.json')
with open(exp_config_path, 'w') as file:
    json.dump(experiment_config, file, indent=4)    