In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.datasets import load_breast_cancer

In [3]:
breast_cancer=load_breast_cancer()

In [4]:
breastDF = pd.DataFrame(breast_cancer.data, columns = breast_cancer.feature_names)

In [51]:
breast_cancer.target_names # 0이 음성 1이 양성

array(['malignant', 'benign'], dtype='<U9')

In [5]:
breastDF['target']=breast_cancer.target
breastDF.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [6]:
import random

random_seed = random.randint(0,1000)
random_seed

133

In [7]:
X= breastDF.iloc[:,:-1]
y= breastDF.iloc[:, -1]

In [9]:
from sklearn.preprocessing import StandardScaler
#StandardScaler 로 X 단위에 의한 숫자 스케일을 조정함
scaler=StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [10]:
### 랜덤 시드를 활용하여 트레인, 테스트셋 분리 / 7:3 비율로
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y,
                                                    test_size=0.3, random_state=random_seed)

In [31]:
#첫번째 분류 모델 svm
from sklearn.svm import SVC 
import time
from sklearn import metrics

In [45]:
### 학습 진행 _ 비선형
clf = SVC(kernel = 'rbf',C=100, gamma= 5,random_state=random_seed)

In [46]:
clf.fit(X_train, y_train)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=5, kernel='rbf', max_iter=-1,
    probability=False, random_state=133, shrinking=True, tol=0.001,
    verbose=False)

In [47]:
prediction = clf.predict(X_test)
prediction

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1])

In [42]:
# roc curve는 민감도(1을 1이라고 예측), 판정한것 중 실제 양성 / 전체양성
#특이도(0을 0이라 예측), 판정한것 중 실제 음성/ 전체 음성 가 어떤 관계를 갖고 있는지
#TPR(민감도)와 FPR(1-특이도)(0을 1이라 예측)
#1에 가까울수록 좋은 성능
#PR(precision recall) 정밀도 = 실제양성수 / 양성이라고 판정한 수
# 재현율 = 검출 양성수 / 전체양성수
# PR(precision-recall)은 데이터 라벨의 분포가 심하게 불균등할 때 쓴다고 함
#PR curve는 크게 편향된 데이터에서의 성능을 측정할 때, Negative 데이터가 
#positive 데이터에 비해 압도적으로 많을 때, 
# PR curve는 TN을 고려하지 않기 때문이다.
# TN은 negative 데이터를 negative라고 예측한 것인데, 
# 지금과 같은 상황에서는 negative 데이터가 대부분이기 때문에 negative를 negative라고 예측한 것은 그다지 유용한 정보가 아니다.)

In [48]:
# svm 모델 모두 positive로 예측했으니 pr 말고 roc 쓴다
score = metrics.roc_auc_score(y_train, clf.decision_function(X_train)) 
print(score)

1.0


In [49]:
score_1 = metrics.roc_auc_score(y_test, clf.decision_function(X_test)) 
print(score_1)
#정확도 높음!

0.5


In [66]:
#두번째 분류 모델 random forrest
from sklearn.ensemble import RandomForestClassifier

In [None]:
cancer=load_breast_cancer()
X= cancer.data
y= cancer.target

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = random_seed)

In [119]:
rf = RandomForestClassifier(n_estimators=5, oob_score=True, random_state=random_seed) #n_estimators는 나무의 개수

### 학습 진행

t0 = time.time() # 학습 시작때 시간
rf.fit(X_train, y_train)

elapsed = time.time()-t0 # 학습 끝났을때 시간 - 학습 끝난 시간
print("걸린시간은 {:.2f}초입니다.".format(elapsed))

걸린시간은 0.02초입니다.


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


In [120]:
prediction = rf.predict(X_test)
probs = rf.predict_proba(X_test)[:, 1]

In [121]:
#1(양성)의 비율이 더 많으니 roc 로 평가
score = metrics.roc_auc_score(y_test, probs) 
print(score)
# 이미 점수가 높지만 하이퍼파라미터를 조정해보면,,,

0.9912820512820513


In [95]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [102]:
params = { 'n_estimators' : [10, 100],
           'max_depth' : [6, 8, 10, 12],
           'min_samples_leaf' : [8, 12, 18],
           'min_samples_split' : [8, 16, 20]
            }

In [110]:
rf_clf = RandomForestClassifier(random_state = 0, n_jobs = -1)
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 5, n_jobs = -1)
grid_cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=-1,
                                              oob_score=False, random_state=0,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_d

In [114]:
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)

최적 하이퍼 파라미터:  {'max_depth': 6, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 100}


In [115]:
rf_best = RandomForestClassifier(n_estimators=100, max_depth= 6, min_samples_leaf = 8, min_samples_split = 8, oob_score=True, random_state=random_seed)

In [116]:
rf_best.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=6, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=8, min_samples_split=8,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=133, verbose=0,
                       warm_start=False)

In [117]:
prediction_best = rf_best.predict(X_test)
probs_best = rf_best.predict_proba(X_test)[:, 1]

In [118]:
score = metrics.roc_auc_score(y_test, probs) 
print(score)

0.9914529914529914
