In [1]:
%load_ext autoreload
%autoreload 2

In [31]:
# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from graphviz import Source

# Preprocessing
from sklearn.model_selection import train_test_split

# Evaluation
from sklearn.metrics import accuracy_score

# Models
from src.models.cart import CART

# Load dataset
from src.data.load_dataset import load_spambase

In [28]:
X, y = load_spambase()
# Split the dataset into training+validation and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Further split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, stratify=y_train_val, random_state=42) # 0.25 x 0.8 = 0.2

X_train.shape, X_val.shape, X_test.shape

((2760, 57), (920, 57), (921, 57))

### Train the baseline CART model with raw features

In [30]:
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_impurity_decrease': [0, 0.1, 0.2],
    'max_features': [None, 'sqrt', 'log2', 0.5, 0.75]
}

results = []
best_score = 0
best_params = {}


def process_max_features(max_features, n_features):
    if max_features is None:
        max_features_val = None
    elif isinstance(max_features, int):
        max_features_val = max_features
    elif isinstance(max_features, float):
        # Fraction of features, ensure it's an integer
        max_features_val = int(max_features * n_features)
    elif max_features == 'sqrt':
        max_features_val = int(np.sqrt(n_features))
    elif max_features == 'log2':
        max_features_val = int(np.log2(n_features))
    else:
        raise ValueError("max_features must be None, int, float, 'sqrt', or 'log2'.")
    return max_features_val


# Iterate over all combinations of parameters
for max_depth in param_grid['max_depth']:
    for min_samples_split in param_grid['min_samples_split']:
        for min_impurity_decrease in param_grid['min_impurity_decrease']:
            for max_features in param_grid['max_features']:
                # Convert max_features accordingly
                n_features = X_train.shape[1]
                max_features_val = process_max_features(max_features, n_features)
                
                # Initialize and fit the CART model
                model = CART(max_depth=max_depth, 
                             min_samples_split=min_samples_split,
                             min_impurity_decrease=min_impurity_decrease,
                             max_features=max_features_val)
                
                model.fit(X_train, y_train)
                
                # Predict on the validation set and evaluate
                predictions = model.predict(X_val)
                score = accuracy_score(y_val, predictions)
                
                results.append({'max_depth': max_depth,
                                'min_samples_split': min_samples_split,
                                'min_impurity_decrease': min_impurity_decrease,
                                'max_features': max_features,
                                'score': score})
                
                if score > best_score:
                    best_score = score
                    best_params = {'max_depth': max_depth,
                                   'min_samples_split': min_samples_split,
                                   'min_impurity_decrease': min_impurity_decrease,
                                   'max_features': max_features_val}
            print(f"Finished iteration for max_depth={max_depth}, min_samples_split={min_samples_split}, min_impurity_decrease={min_impurity_decrease}, max_features={max_features}")

print(f"Best Validation Score: {best_score}")
print("Best Parameters:", best_params)

# After finding the best parameters, evaluate the model on the test set
best_model = CART(**best_params)
best_model.fit(X_train_val, y_train_val)  # Train on the full training dataset
test_predictions = best_model.predict(X_test)
test_score = accuracy_score(y_test, test_predictions)
print(f"Test Score: {test_score}")


Finished iteration for max_depth=None, min_samples_split=2, min_impurity_decrease=0, max_features=0.75
Finished iteration for max_depth=None, min_samples_split=2, min_impurity_decrease=0.1, max_features=0.75
Finished iteration for max_depth=None, min_samples_split=2, min_impurity_decrease=0.2, max_features=0.75
Finished iteration for max_depth=None, min_samples_split=5, min_impurity_decrease=0, max_features=0.75
Finished iteration for max_depth=None, min_samples_split=5, min_impurity_decrease=0.1, max_features=0.75
Finished iteration for max_depth=None, min_samples_split=5, min_impurity_decrease=0.2, max_features=0.75
Finished iteration for max_depth=None, min_samples_split=10, min_impurity_decrease=0, max_features=0.75
Finished iteration for max_depth=None, min_samples_split=10, min_impurity_decrease=0.1, max_features=0.75
Finished iteration for max_depth=None, min_samples_split=10, min_impurity_decrease=0.2, max_features=0.75
Finished iteration for max_depth=10, min_samples_split=2, 

In [32]:
# Get DOT data
dot_data = best_model.export_graphviz(full_verbose=False, leaf_verbose=True)

# Draw graph
graph = Source(dot_data, format="pdf", filename="cart_tree_custom_implementation", directory="../reports/baseline_models/")
graph.render(view=True)

'../reports/baseline_models/cart_tree_custom_implementation.pdf'

### Using sklearn's DecisionTreeClassifier

In [33]:
from sklearn.tree import DecisionTreeClassifier

In [40]:
## Use sklearn's DecisionTreeClassifier to compare with our implementation
sklearn_model = DecisionTreeClassifier(**best_params)
sklearn_model.fit(X_train_val, y_train_val)
sklearn_test_predictions = sklearn_model.predict(X_test)
sklearn_test_score = accuracy_score(y_test, sklearn_test_predictions)
print(f"Test Score (sklearn): {sklearn_test_score}:2f")

Test Score (sklearn): 0.9196525515743756:2f


In [41]:
from sklearn.tree import export_graphviz

## Get DOT data
dot_data = export_graphviz(sklearn_model, filled=True, rounded=True, special_characters=True, feature_names=np.arange(X_train_val.shape[1]), class_names=['0', '1'])

## Draw graph
graph = Source(dot_data, format="pdf", filename="sklearn_cart_tree", directory="../reports/baseline_models/")
graph.render(view=True)

'../reports/baseline_models/sklearn_cart_tree.pdf'