# Homework 5 RF Accuracy Improvement

This assignment is inspired by examples of Shan-Hung Wu from National Tsing Hua University.

Requirement: improve the accuracy per feature of the following code from 0.03 up to at least 0.45 and accuracy should be more than 0.92

Here are three hints:

    You can improve the ratio by picking out or "creating" several features.
    Tune hyperparameters
    The ratio can be improved from 0.03 up to 0.47.

In [209]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Load the dataset
cancer_data = load_breast_cancer()
features, target = cancer_data.data, cancer_data.target

# Print initial dataset dimensions
print("Dataset shape:", features.shape)

# TODO: Select features based on importance or correlation with the target variable
# Train a RandomForest model to get feature importances
base_model = RandomForestClassifier(n_estimators=100, random_state=0)
base_model.fit(features, target)

# Get feature importances and select the top 3 features 
feature_importances = base_model.feature_importances_
top_features_idx = np.argsort(feature_importances)[-2:]
top_features = features[:, top_features_idx] 

# Split data into training and testing
train_features, test_features, train_target, test_target = train_test_split(
    top_features, target, test_size=0.3, random_state=0
)

# TODO: Set up RandomForestClassifier with hyperparameter tuning
# Initialize the model and tuning the hyperparameters
rf_model = RandomForestClassifier()
param_grid = {
    'n_estimators': np.arange(100, 2001, 100),       
    'criterion': ["gini", "entropy"],
    'max_depth': [None, 5, 7, 10],  
    'min_samples_split': [2, 5, 10, 15, 20, 25],    
    'min_samples_leaf': [1, 2, 4, 8, 10, 12, 15],   
    'max_features': ['sqrt', 'log2']                
}

# Use RandomizedSearchCV to tune hyperparameters with 5-fold cross-validation
tuned_model = RandomizedSearchCV(
    estimator=rf_model, param_distributions=param_grid,
    n_iter=20, cv=5, error_score='raise', random_state=0
)
tuned_model.fit(train_features, train_target)

# Print the best cross-validation score and the chosen hyperparameters
print("Best cross-validation accuracy from RandomizedSearchCV:", tuned_model.best_score_)
print("Selected hyperparameters:", tuned_model.best_params_)

# TODO: Predict and calculate accuracy on the test set
# Make predictions on the test set with the tuned model
test_predictions = tuned_model.best_estimator_.predict(test_features)

# Calculate and print test set accuracy and per-feature accuracy
test_accuracy = accuracy_score(test_target, test_predictions)
print(f"Test Set Accuracy: {test_accuracy:.2f}")
print(f"Accuracy per feature dimension: {test_accuracy / top_features.shape[1]:.2f}")


Dataset shape: (569, 30)
Best cross-validation accuracy from RandomizedSearchCV: 0.8994620253164557
Selected hyperparameters: {'n_estimators': np.int64(2000), 'min_samples_split': 25, 'min_samples_leaf': 12, 'max_features': 'sqrt', 'max_depth': 7, 'criterion': 'entropy'}
Test Set Accuracy: 0.94
Accuracy per feature dimension: 0.47
