# 위스콘신 유방암 데이터셋
- Random Forest를 이용해 target을 분류
- Feature importance 확인

- GridSearchCV를 이용해 n_estimators, max_depth, max_features의 최적값 탐색

In [5]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

In [6]:
# Load the Wisconsin breast cancer dataset bundled with scikit-learn and
# separate the feature matrix from the class labels.
data = load_breast_cancer()
X = data.data
y = data.target

In [7]:
# Hold out 20% of the samples for the final evaluation. Stratifying on y
# keeps the class ratio the same in both splits; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [8]:
# Show the (rows, columns) shape of the training and test feature matrices.
tuple(features.shape for features in (X_train, X_test))

((455, 30), (114, 30))

In [9]:
# Hyper-parameter grid explored by the search: 5 depths x 5 forest sizes
# x 6 feature counts = 150 candidate combinations.
param = dict(
    max_depth=range(1, 6),
    n_estimators=range(100, 501, 100),
    max_features=range(1, 31, 5),
)

# 4-fold cross-validated grid search over a seeded random forest.
gs = GridSearchCV(
    estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
    param_grid=param,
    cv=4,
    n_jobs=-1,
)

In [10]:
# Run the cross-validated grid search on the training split only; the test
# split is left untouched for the final evaluation.
gs.fit(X_train, y_train)

GridSearchCV(cv=4, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
             n_jobs=-1,
             param_grid={'max_depth': range(1, 6),
                         'max_features': range(1, 31, 5),
                         'n_estimators': range(100, 501, 100)})

In [12]:
# Hyper-parameter combination that achieved the best mean cross-validation
# score during the search.
gs.best_params_

{'max_depth': 4, 'max_features': 16, 'n_estimators': 300}

In [13]:
# Mean cross-validated score of the best combination. No `scoring` was
# passed to GridSearchCV, so this is the estimator's default score
# (accuracy for a classifier).
gs.best_score_

0.9648540599285824

In [14]:
# Keep a handle on the forest that GridSearchCV refit with the winning
# parameters, and pull its per-feature importance scores.
best_model = gs.best_estimator_
fi = gs.best_estimator_.feature_importances_

In [15]:
# Label each importance with its feature name, then display the features
# from most to least important.
fi = pd.Series(data=fi, index=data.feature_names)
fi.sort_values(ascending=False)

worst concave points       0.247656
worst perimeter            0.216231
worst radius               0.163227
mean concave points        0.126286
worst area                 0.103267
worst texture              0.019934
mean concavity             0.015295
mean texture               0.014945
worst concavity            0.014496
area error                 0.014007
worst smoothness           0.006950
mean perimeter             0.006876
radius error               0.006705
worst compactness          0.004974
mean radius                0.004893
perimeter error            0.004592
worst fractal dimension    0.003886
worst symmetry             0.003760
mean area                  0.003233
mean symmetry              0.002454
texture error              0.002181
compactness error          0.001909
mean fractal dimension     0.001781
symmetry error             0.001724
fractal dimension error    0.001688
concave points error       0.001621
concavity error            0.001602
mean compactness           0

In [16]:
# Final evaluation: accuracy of the tuned forest on the held-out test split.
pred_test = best_model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred_test)

0.9385964912280702