# Homework 5 RF Accuracy Improvement

This assignment is inspired by examples from Shan-Hung Wu of National Tsing Hua University.

Requirement: improve the accuracy per feature (average accuracy divided by the number of features used) of the following code from 0.03 to at least 0.45, while keeping the accuracy itself above 0.92.

Here are three hints:

1. You can improve the ratio by picking out or "creating" several features.
2. Tune the hyperparameters.
3. The ratio can be improved from 0.03 up to 0.47.

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier


# Fetch the breast-cancer data twice: once as the full dataset object and once
# unpacked into a (features, labels) pair.
# NOTE(review): `init_data` is not referenced by any cell visible here.
init_data = load_breast_cancer()
X, y = load_breast_cancer(return_X_y=True)

# Report the dimensions of the feature matrix.
print(X.shape)

# Remaining steps of the assignment:
# TODO Select some features (X), hint: based on the connections with
# our Y (importance? correlation?)
# TODO need 5 fold cross validation
# TODO Tune parameters for RandomForestClassifier
# TODO Calculate Average accuracy score
# TODO Calculate Average (accuracy score/number of features)

(569, 30)
Accuracy: 0.95
Accuracy per feature: 0.47


In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Pull the breast-cancer data and keep the raw arrays available as X / y.
data = load_breast_cancer()
X = data.data
y = data.target

# Tabular view: one named column per feature, with the label appended
# as a 'target' column.
df = pd.DataFrame(data=X, columns=data.feature_names).assign(target=y)

print(f"Dataset shape: {X.shape}")


Dataset shape: (569, 30)


In [2]:
# Pairwise correlation between every column of df (features and 'target').
corr = df.corr(method='pearson')

# Take the 'target' column of that matrix, discard the target's correlation
# with itself, and rank the features by correlation magnitude, strongest first.
feature_corr = (
    corr['target']
    .drop('target')
    .abs()
    .sort_values(ascending=False)
)

# Show the ten strongest candidates.
print("Top features based on correlation with the target:")
print(feature_corr.head(10))


Top features based on correlation with the target:
worst concave points    0.793566
worst perimeter         0.782914
mean concave points     0.776614
worst radius            0.776454
mean perimeter          0.742636
worst area              0.733825
mean radius             0.730029
mean area               0.708984
mean concavity          0.696360
worst concavity         0.659610
Name: target, dtype: float64


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Keep only the k features that rank highest in feature_corr.
# NOTE(review): feature_corr was computed on the full dataset, so this
# selection happens outside the cross-validation folds used below; the
# reported accuracy may therefore be somewhat optimistic.
k = 2
selected_features = feature_corr.head(k).index
X_selected = df.loc[:, selected_features].to_numpy()

# Hyperparameter candidates explored by the grid search.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt'],
)

# Base model; the fixed seed keeps the forest reproducible.
clf = RandomForestClassifier(random_state=42)

# 5-fold stratified splitter, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over param_grid, scored by accuracy on the folds above,
# using all available cores.
grid_search = GridSearchCV(
    clf,
    param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=cv,
)
grid_search.fit(X_selected, y)

# Winning configuration and its mean cross-validated accuracy.
# NOTE(review): best_score_ comes from the same folds that were used to pick
# the hyperparameters, so it is not an independent estimate.
best_clf = grid_search.best_estimator_
best_score = grid_search.best_score_

print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Average accuracy: {best_score:.4f}")


Best hyperparameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Average accuracy: 0.9385


In [4]:
# Assignment metric: the best cross-validated accuracy divided by the number
# of features the model was trained on.
# NOTE(review): `best_score` and `k` are read from kernel state, so the value
# reflects whichever cell assigned them most recently.
accuracy_per_feature = best_score / k
print(f"Accuracy per feature: {accuracy_per_feature:.4f}")


Accuracy per feature: 0.4693


In [5]:
from sklearn.base import clone

# One record per feature count tried: the features used, the winning
# hyperparameters, the mean CV accuracy and the accuracy-per-feature ratio.
results = []

# Sweep the number of top-correlated features.
# Loop-local names are used deliberately: the earlier cells' `k`,
# `selected_features`, `X_selected`, `best_score`, `accuracy_per_feature` and
# the fitted `grid_search` are left untouched, so those cells can be re-run
# after this one and still reproduce their original output.
for n_features in range(2, 6):  # Trying k from 2 to 5
    top_features = feature_corr.index[:n_features]
    X_top = df[top_features].values

    # Unfitted copy of the configured search (same estimator, grid, CV
    # splitter and scoring); fitting it does not alter `grid_search`.
    search = clone(grid_search)
    search.fit(X_top, y)
    score = search.best_score_
    score_per_feature = score / n_features

    results.append({
        'k': n_features,
        'features': list(top_features),
        'best_params': search.best_params_,
        'accuracy': score,
        'accuracy_per_feature': score_per_feature
    })

    print(f"\nNumber of features: {n_features}")
    print(f"Selected features: {list(top_features)}")
    print(f"Best hyperparameters: {search.best_params_}")
    print(f"Average accuracy: {score:.4f}")
    print(f"Accuracy per feature: {score_per_feature:.4f}")



Number of features: 2
Selected features: ['worst concave points', 'worst perimeter']
Best hyperparameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Average accuracy: 0.9385
Accuracy per feature: 0.4693

Number of features: 3
Selected features: ['worst concave points', 'worst perimeter', 'mean concave points']
Best hyperparameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Average accuracy: 0.9385
Accuracy per feature: 0.3128

Number of features: 4
Selected features: ['worst concave points', 'worst perimeter', 'mean concave points', 'worst radius']
Best hyperparameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Average accuracy: 0.9385
Accuracy per feature: 0.2346

Number of features: 5
Selected features: ['worst concave points', 'worst perimeter', 'mean concave points', 'worst radiu