# Homework 5 RF Accuracy Improvement

This assignment is inspired by examples by Shan-Hung Wu from National Tsing Hua University.

Requirement: improve the accuracy per feature (the average accuracy divided by the number of features used) of the following code from 0.03 to at least 0.45, while keeping the accuracy itself above 0.92.

Here are three hints:

- You can improve the ratio by picking out or "creating" several features.
- Tune the hyperparameters.
- The ratio can be improved from 0.03 up to 0.47.

In [6]:
from itertools import product

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline


# load the breast_cancer dataset
init_data = load_breast_cancer()
(X, y) = load_breast_cancer(return_X_y=True)

print(X.shape)

k_features = 15
selector = SelectKBest(score_func=f_classif,k=k_features)
X_selected = selector.fit_transform(X,y)

print(f"Selected features: {k_features}/{X.shape[1]}")

best_accuracy = 0
best_params = {}

n_estimators_range = [50, 100, 150]
criterion_range = ['gini', 'entropy']
max_depth_range = [5, 10, None]

for n_estimators in n_estimators_range:
    for criterion in criterion_range:
        for max_depth in max_depth_range:
            clf = RandomForestClassifier(
                n_estimators=n_estimators,
                criterion=criterion,
                max_depth=max_depth,
                random_state=42
            )


            skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            cv_scores = cross_val_score(clf, X_selected, y, cv=skf, scoring= 'accuracy')
            avg_accuracy = np.mean(cv_scores)

            if avg_accuracy > best_accuracy:
                best_accuracy = avg_accuracy
                best_params = {
                    'n_estimators' : n_estimators,
                    'criterion' : criterion,
                    'max_depth' : max_depth
                }

final_clf = RandomForestClassifier(**best_params, random_state=42)
final_clf.fit(X_selected, y)

final_scores = cross_val_score(final_clf, X_selected, y, cv=5, scoring='accuracy')
average_accuracy = np.mean(final_scores)
accuracy_per_feature = average_accuracy / k_features

print(f"Best Hyperparameters: {best_params}")
print(f"Average Accuracy: {average_accuracy:.4f}")
print(f"Accuracy per Feature: {accuracy_per_feature:.4f}")

# TODO Select some features (X), hint: based on the connections with
# our Y (importance? correlation?)
# TODO need 5 fold cross validation
# TODO Tune parameters for RandomForestClassifier
# TODO Calculate Average accuracy score
# TODO Calculate Average (accuracy score/number of features)

(569, 30)
Selected features: 15/30
Best Hyperparameters: {'n_estimators': 50, 'criterion': 'entropy', 'max_depth': 5}
Average Accuracy: 0.9473
Accuracy per Feature: 0.0632
