# Homework 5 RF Accuracy Improvement

This assignment is inspired by examples of Shan-Hung Wu from National Tsing Hua University.

Requirement: improve the accuracy per feature of the following code from 0.03 up to at least 0.45 and accuracy should be more than 0.92

Here are three hints:

    You can improve the ratio by picking out or "creating" several features.
    Tune hyperparameters
    The ratio can be improved from 0.03 up to 0.47.

In [166]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Load the dataset
cancer_data = load_breast_cancer()
data_features, data_target = cancer_data.data, cancer_data.target

# Show dataset dimensions (no PCA transformation applied)
print("Dataset shape:", data_features.shape)

# TODO: Select features based on connections with Y (importance? correlation?)
# Train an initial RandomForest model to get feature importances
initial_rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
initial_rf_model.fit(data_features, data_target)

# Get feature importances and select top features
feature_importances = initial_rf_model.feature_importances_
top_features = np.argsort(feature_importances)[-10:]  # Select the top 10 features, adjust as needed

# Use only the selected top features for training/testing
selected_features = data_features[:, top_features]

# Split data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(
    selected_features, data_target, test_size=0.3, random_state=0
)

# TODO: Set up the RandomForestClassifier with hyperparameter tuning
# Initialize the RandomForestClassifier and set up an extended parameter grid for tuning
random_forest_model = RandomForestClassifier()
param_grid = {
    'n_estimators': np.arange(10, 101, 10),      # Wider range for number of trees
    'criterion': ["gini", "entropy"],          # Minimum samples per split      # Corrected max features
}

# Use RandomizedSearchCV to perform hyperparameter tuning with 5-fold cross-validation
tuned_model = RandomizedSearchCV(
    estimator=random_forest_model, param_distributions=param_grid,
    n_iter=20, cv=5, error_score='raise', random_state=0
)
tuned_model.fit(X_train, y_train)

# Print the optimal cross-validation score and selected parameters
print("Best cross-validation accuracy from RandomizedSearchCV:", tuned_model.best_score_)
print("Selected hyperparameters:", tuned_model.best_params_)

# TODO: Predict and calculate accuracy on the test set
# Predict on the test set using the tuned model
y_test_predictions = tuned_model.best_estimator_.predict(X_test)

# Calculate overall test accuracy and per-feature accuracy
test_accuracy = accuracy_score(y_test, y_test_predictions)
print(f"Test Set Accuracy: {test_accuracy:.2f}")
print(f"Accuracy per feature dimension: {test_accuracy / selected_features.shape[1]:.2f}")


Dataset shape: (569, 30)
Best cross-validation accuracy from RandomizedSearchCV: 0.9473101265822784
Selected hyperparameters: {'n_estimators': np.int64(100), 'criterion': 'gini'}
Test Set Accuracy: 0.95
Accuracy per feature dimension: 0.09
