In [56]:
# module import
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

#
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load data

iris  = load_iris()
print(iris.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [3]:
X = iris.data
display(X[:5])
Y = iris.target
display(Y[:5])

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

array([0, 0, 0, 0, 0])

In [4]:
# seperate

X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                   iris.target,
                                                   test_size = 0.3,
                                                   random_state = 121)
# model

#rf
rfc = RandomForestClassifier()

#LR
lrc = LogisticRegression()

#svc
svc = SVC()


# fit 각각 학습
models = [rfc,lrc,svc]

for model in models:
    model.fit(X_train,y_train)
    
# predict
rfc_pred = rfc.predict(X_test)
lrc_pred = lrc.predict(X_test)
svc_pred = svc.predict(X_test)


print("rfc" , accuracy_score(y_test,rfc_pred))
print("lrc" , accuracy_score(y_test,lrc_pred))
print("svc" , accuracy_score(y_test,svc_pred))

rfc 0.9333333333333333
lrc 0.9555555555555556
svc 0.9555555555555556


In [5]:
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [6]:
# parameters
# RandomForestClassifier() == Decision Tree parameter와 동일


In [20]:
# 2차 검증 GridSearchCV

#module import
from sklearn.metrics import precision_score,recall_score,confusion_matrix,f1_score,roc_auc_score

#rfc

parameters = {'max_depth':[2,3,5,10], 'min_samples_split':[2,3,5],
              'min_samples_leaf':[1,5,8]}

grid_rfc = GridSearchCV(rfc, param_grid=parameters, scoring='accuracy', cv=5, refit=True)

grid_rfc.fit(X_train,y_train)

#output
print(grid_rfc)
print("최적 파라미터는 ? >>", grid_rfc.best_params_)
print("최고 정확도는 ? >> {0:.4f}".format(grid_rfc.best_score_))

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 3, 5, 10],
                         'min_samples_leaf': [1, 5, 8],
                         'min_samples_split': [2, 3, 5]},
             scoring='accuracy')
최적 파라미터는 ? >> {'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 5}
최고 정확도는 ? >> 0.9714


In [23]:
# 성능 개선

best_rfc = grid_rfc.best_estimator_
print(best_rfc)

# 예측값
rfc_predictions = best_rfc.predict(X_test)
# 예측 확률
rfc_proba = best_rfc.predict_proba(X_test)[:,1]

#정확도
accuracy = accuracy_score(y_test, rfc_predictions)
print("grid 예측 정확도는? >> ", accuracy)


RandomForestClassifier(max_depth=2, min_samples_leaf=5, min_samples_split=5)
grid 예측 정확도는? >>  0.9333333333333333


In [25]:
from sklearn.metrics import classification_report
print(classification_report(rfc_predictions, y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.94      0.88      0.91        17
           2       0.87      0.93      0.90        14

    accuracy                           0.93        45
   macro avg       0.93      0.94      0.94        45
weighted avg       0.93      0.93      0.93        45



In [46]:
# # 다른 검증값
# def get_clf_eval(y_test, rfc_predictions, rfc_proba):
    
#     #오차행렬
#     confusion = confusion_matrix(y_test, rfc_predictions)
    
#     #정확도
#     accuracy = accuracy_score(y_test, rfc_predictions)
    
# #     정밀도 precision
#     precision = precision_score(y_test, rfc_predictions, average=None)
    
# #     재현율 recall
#     recall = recall_score(y_test, rfc_predictions, average=None)
    
# #     F1
# #     정밀도 (precision) 와 재현율(recall) 과의 "조화평균"
#     f1 = f1_score(y_test, rfc_predictions, average=None)
    
# #     #roc_auc
#     roc_auc = roc_auc_score(y_test, rfc_proba)
    
#     print('오차 행렬')
#     print(confusion)
#     print()    
#     print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}'.format(accuracy, precision, recall, f1))

# # 다중분류에 대한 검증 

In [47]:
lrc.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [None]:
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

In [62]:
# 2차 검증 GridSearchCV

#module import
from sklearn.metrics import precision_score,recall_score,confusion_matrix,f1_score,roc_auc_score

#lrc

parameters = {'solver' : ['newton-cg','lbfgs','liblinear'],'penalty' :['l2'],'C' : [50, 10, 1.0, 0.1, 0.01]}

grid_lrc = GridSearchCV(lrc, param_grid=parameters, scoring='accuracy', cv=3, refit=True)

grid_lrc.fit(X_train,y_train)

#output
print(grid_lrc)
print("최적 파라미터는 ? >>", grid_lrc.best_params_)
print("최고 정확도는 ? >> {0:.4f}".format(grid_lrc.best_score_))

GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid={'C': [50, 10, 1.0, 0.1, 0.01], 'penalty': ['l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear']},
             scoring='accuracy')
최적 파라미터는 ? >> {'C': 50, 'penalty': 'l2', 'solver': 'liblinear'}
최고 정확도는 ? >> 0.9714


In [58]:
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [59]:
# 2차 검증 GridSearchCV

#module import
from sklearn.metrics import precision_score,recall_score,confusion_matrix,f1_score,roc_auc_score

#svc

parameters = {'kernel' : ['linear','poly', 'rbf', 'sigmoid'],'C':[50, 10, 1.0, 0.1, 0.01]}

grid_svc = GridSearchCV(svc, param_grid=parameters, scoring='accuracy', cv=3, refit=True)

grid_svc.fit(X_train,y_train)

#output
print(grid_svc)
print("최적 파라미터는 ? >>", grid_svc.best_params_)
print("최고 정확도는 ? >> {0:.4f}".format(grid_svc.best_score_))

GridSearchCV(cv=3, estimator=SVC(),
             param_grid={'C': [50, 10, 1.0, 0.1, 0.01],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             scoring='accuracy')
최적 파라미터는 ? >> {'C': 50, 'kernel': 'rbf'}
최고 정확도는 ? >> 0.9714


In [63]:
# 성능 개선 lrc
best_lrc = grid_lrc.best_estimator_ # model/ fit/ predict 
print(best_lrc)

# 예측을 확인 
lrc_predictions = best_lrc.predict(X_test)  # 예측값
lrc_proba = best_lrc.predict_proba(X_test)[:,1]  # 예측확률
accuracy= accuracy_score(y_test, lrc_predictions) # 정확도

print(' grid 예측 정확도 >>', accuracy)


LogisticRegression(C=50, solver='liblinear')
 grid 예측 정확도 >> 0.9777777777777777


In [65]:
# 성능 개선 svc

best_svc = grid_svc.best_estimator_
print(best_svc)

# 예측값
svc_predictions = best_svc.predict(X_test)
# 예측 확률
# svc_proba = best_svc.predict_proba(X_test)[:,1]

#정확도
accuracy = accuracy_score(y_test, svc_predictions)
print("grid 예측 정확도는? >> ", accuracy)


SVC(C=50)
grid 예측 정확도는? >>  0.9777777777777777


In [None]:
방사 기저 함수(RBF: Radial Bias Function)