# Homework 5 RF Accuracy Improvement

This assignment is inspired by examples by Shan-Hung Wu of National Tsing Hua University.

Requirement: improve the accuracy per feature (average accuracy divided by the number of features used) of the following code from 0.03 to at least 0.45, while keeping the accuracy itself above 0.92.

Here are three hints:

    You can improve the ratio by selecting or "creating" a small number of features.
    Tune the hyperparameters.
    The ratio can be improved from 0.03 up to 0.47.

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score




# Load the breast-cancer dataset once and reuse the returned Bunch for both
# the feature matrix and the target vector (previously the dataset was loaded
# a second time with return_X_y=True while init_data went unused).
init_data = load_breast_cancer()
X, y = init_data.data, init_data.target

# Report (n_samples, n_features) before any feature selection.
print(X.shape)

# COMPLETED Select some features (X) based on their connection with y:
# fit a baseline random forest and rank the features by the importances it
# reports.
# NOTE(review): the baseline forest is fitted on the full dataset, which is
# the same data used for cross-validation further down, so the reported
# scores may be optimistically biased -- confirm this is acceptable.
initial_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)
feature_importances = initial_rf.feature_importances_

# Keep only the features whose importance is at or above the 95th percentile
# of all importance scores (i.e. roughly the top 5% of features).
importance_threshold = np.percentile(feature_importances, 95)
selected_features = np.flatnonzero(feature_importances >= importance_threshold)

# Restrict the feature matrix to the selected columns.
X_selected = X[:, selected_features]
print("Shape of X after feature selection:", X_selected.shape)

# COMPLETED Tune parameters for RandomForestClassifier.
# Candidate values for each hyperparameter; every combination is tried.
param_grid = {
    'n_estimators': [200, 400, 500],
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# COMPLETED need 5 fold cross validation: each combination is scored by
# accuracy over 5 folds, using all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_selected, y)

# Keep the estimator and report the parameters of the best-scoring combination.
best_rf = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# COMPLETED Calculate Average accuracy score: 5-fold cross-validated accuracy
# of the tuned forest on the selected features, averaged over the folds.
# NOTE(review): this scores on the same data that was used for feature
# selection and hyperparameter tuning, so the estimate may be optimistic.
cross_val_acc = cross_val_score(
    best_rf, X_selected, y, scoring='accuracy', cv=5
)
avg_accuracy = np.mean(cross_val_acc)

# COMPLETED Calculate Average (accuracy score/number of features):
# the average accuracy divided by the number of selected feature columns.
accuracy_per_feature = avg_accuracy / X_selected.shape[1]

print("Average Accuracy:", avg_accuracy)
print("Accuracy per Feature:", accuracy_per_feature)

# Enforce the assignment's thresholds on both metrics.
assert avg_accuracy > 0.92, "Accuracy requirement not met."
assert accuracy_per_feature > 0.45, "Accuracy per feature requirement not met."

(569, 30)
Shape of X after feature selection: (569, 2)
Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Average Accuracy: 0.9420121099208197
Accuracy per Feature: 0.47100605496040987
