<a href="https://colab.research.google.com/github/kant1724/ibks-ml-basic/blob/master/06_decision_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
from pandas import Series
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [0]:
# Load the breast cancer dataset from sklearn.datasets. Later cells read its
# .data, .target and .feature_names attributes.
breast_cancer = datasets.load_breast_cancer()

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, field
        13 is Radius SE, field 23 is Worst Radius.

In [0]:
# Target values as a categorical Series; it becomes the 'label' column below.
sy = pd.Series(breast_cancer.target, dtype='category')

# One frame holding every feature (named by feature_names) plus the label.
data = pd.DataFrame(
    breast_cancer.data,
    columns=breast_cancer.feature_names,
).assign(label=sy)

In [0]:
# Separate the features from the 'label' column, then hold out 25% of the rows
# for evaluation. random_state=0 fixes the shuffle so the split is repeatable.
# (train_test_split is imported in the notebook's import cell.)
x = data.drop(columns='label')
y = data['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

# Print the size of each piece, in the order: x_train, x_test, y_train, y_test.
for part in (x_train, x_test, y_train, y_test):
    print(len(part))

426
143
426
143


In [0]:
# Random forest whose trees are limited to depth 5; random_state=0 pins the
# seed passed to the estimator.
clf = RandomForestClassifier(
    random_state=0,
    max_depth=5,
)

# Train on the training split. As the cell's last expression, the value
# returned by fit() is what the notebook displays.
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [0]:
# Predict labels for the held-out test features; `pred` is scored against
# y_test in the accuracy cell.
pred = clf.predict(x_test)

In [0]:
# Rank the fitted classifier's feature importances, most important first.
feature_importances = clf.feature_importances_

# Label each importance with the training column it belongs to.
Series_feat_imp = Series(feature_importances, index=x_train.columns)

# A single descending sort replaces the earlier ascending sort followed by
# .nlargest(100): the ascending pass was discarded by nlargest, and the
# hard-coded 100 would silently drop rows on inputs with more than 100 features.
Series_feat_imp = Series_feat_imp.sort_values(ascending=False)
Series_feat_imp

worst perimeter            0.161971
worst radius               0.119979
worst concave points       0.117353
mean concave points        0.107081
worst area                 0.069927
mean concavity             0.055877
mean perimeter             0.050567
mean area                  0.048635
worst concavity            0.037762
area error                 0.032754
radius error               0.028953
mean radius                0.022914
perimeter error            0.016554
worst smoothness           0.016026
worst texture              0.014578
mean texture               0.012953
worst symmetry             0.012454
worst compactness          0.011301
worst fractal dimension    0.010094
mean compactness           0.007746
texture error              0.007130
mean smoothness            0.005597
fractal dimension error    0.005538
concave points error       0.005493
compactness error          0.004889
mean symmetry              0.004392
symmetry error             0.003933
concavity error            0

In [0]:
# Score the predictions against the true test labels and print the result
# (the Korean label '정확도' means "accuracy").
accuracy = metrics.accuracy_score(y_test, pred)
print('정확도 :', accuracy)

정확도 : 0.972027972027972
