In [8]:
#그리드 자동화
import scipy.stats
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

data = load_digits()
train_X, test_X, train_y, test_y = train_test_split(data.data, data.target, random_state=42)

#파라미터값의 후보 설정
model_param_set_grid = {SVC():{
    "kernel":["linear", "poly", "rbf", "sigmoid"],
    "C":[10 ** i for i in range(-5, 5)],
    "decision_function_shape":["ovr", "ovo"],
    "random_state":[42]
}}

max_score = 0
best_param = None

#그리드 검색으로 파라미터 검색
for model, param in model_param_set_grid.items():
    clf = GridSearchCV(model, param)
    clf.fit(train_X, train_y)
    pred_y = clf.predict(test_X)
    score = f1_score(test_y, pred_y, average="micro")
    if max_score < score:
        max_score = score
        best_model = model.__class__.__name__
        best_param = clf.best_params_
        
print("파라미터: {}".format(best_param))
print("최고 점수: ", max_score)
svm = SVC()
svm.fit(train_X, train_y)
print()
print("조정 없음")
print(svm.score(test_X, test_y))

파라미터: {'C': 10, 'decision_function_shape': 'ovr', 'kernel': 'rbf', 'random_state': 42}
최고 점수:  0.9888888888888889

조정 없음
0.9866666666666667


In [10]:
#랜덤 검색

import scipy.stats
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

data = load_digits()
train_X, test_X, train_y, test_y = train_test_split(data.data, data.target, random_state=42)

#파라미터값의 후보 설정
model_param_set_random = {SVC():{
    "kernel":["linear", "poly", "rbf", "sigmoid"],
    "C":scipy.stats.uniform(0.00001, 1000),
    "decision_function_shape":["ovr", "ovo"],
    "random_state":scipy.stats.randint(0, 100)
}}


max_score = 0
best_param = None

#랜덤 검색으로 파라미터 검색
for model, param in model_param_set_random.items():
    clf = RandomizedSearchCV(model, param)
    clf.fit(train_X, train_y)
    pred_y = clf.predict(test_X)
    score = f1_score(test_y, pred_y, average="micro")
    if max_score < score:
        max_score = score
        best_param = clf.best_params_
        
print("파라미터: {}".format(best_param))
print("최고 점수: ", max_score)
svm=SVC()
svm.fit(train_X, train_y)
print()
print("조정 없음")
print(svm.score(test_X, test_y))

파라미터: {'C': 34.640675912359264, 'decision_function_shape': 'ovo', 'kernel': 'rbf', 'random_state': 98}
최고 점수:  0.9888888888888889

조정 없음
0.9866666666666667


In [12]:
# 연습 문제
# 그리드 검색과 랜덤 검색은 시간이 많이 걸리지만 적절한 파라미터를 발견하면 정확도를 향상시킨다. 파라미터 검색을 해보자.

import requests
import io
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

#필요한 데이터 전처리
vote_data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data"
s = requests.get(vote_data_url).content
vote_data = pd.read_csv(io.StringIO(s.decode('utf-8')), header=None)
vote_data.columns = ['Class Name', 'handicapped-infants', 'water-project-cost-sharing',
                    'adoption-of-the-budget-resolution',
                    'physician-fee-freeze',
                    'el-salvador-aid',
                    'religious-groups-in-schoolrs',
                    'anti-satellite-test-ban',
                    'aid-to-nicaraguan-contras',
                    'mx-missile',
                    'immigration',
                    'synfuels-corporation-cutback',
                    'education-spending',
                    'superfund-right-to-sue',
                    'crime',
                    'duty-free-exports',
                    'export-administration-act-south-africa']
label_encode= preprocessing.LabelEncoder()
vote_data_encode = vote_data.apply(lambda x: label_encode.fit_transform(x))
X = vote_data_encode.drop('Class Name', axis = 1)
Y = vote_data_encode['Class Name']
train_X, test_X, train_y, test_y = train_test_split(X, Y, random_state=50)

models_name = ["SVM", "결정 트리", "랜덤 포레스트"]
models = [SVC(), DecisionTreeClassifier(), RandomForestClassifier()]
params = [{"C":[0.01, 0.1, 1.0, 10, 100],
          "kernel":["linear", "rbf", "poly", "sigmoid"],
          "random_state":[42]},
         {"max_depth":[i for i in range(1, 10)],
         "random_state":[i for i in range(1, 10)]},
         {"n_estimators":[i for i in range(10, 20)],
         "max_depth":[i for i in range(1, 10)],
         "random_state":[i for i in range(100)]}]
for name, model, param in zip(models_name, models, params):
    clf = RandomizedSearchCV(model, param)
    clf.fit(train_X, train_y)
    print(name)
    print(clf.score(test_X, test_y))
    print()

SVM
0.944954128440367

결정 트리
0.944954128440367

랜덤 포레스트
0.944954128440367



In [22]:
# 종합 문제
# 필기체 숫자 이미지 데이터 셋을 사용한다. 판별하기 어려운 숫자도 있으므로 파라미터나 모델 선택이 중요

import scipy.stats
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.metrics import f1_score

data = load_digits()
train_X, test_X, train_y, test_y = train_test_split(data.data, data.target, random_state=42)

# 그리드 검색을 위해 모델과 파라미터 셋을 정리한 딕셔너리 준비
# 딕셔너리의 ket에는 객체의 인스턴스를 지정
model_param_set_grid = {
    LogisticRegression():{
        "C":[10 ** i for i in range(-5, 5)],
        "random_state":[42]
    },
    LinearSVC():{
        "C":[10 ** i for i in range(-5, 5)],
        "multi_class":["ovr", "crammer_singer"],
        "random_state":[42]
    },
    SVC():{
        "kernel":["linear", "poly", "rbf", "sigmoid"],
        "C":[10 ** i for i in range(-5, 5)],
        "decision_function_shape":["ovr", "ovo"],
        "random_state":[42]
    },
    DecisionTreeClassifier():{
        "max_depth":[i for i in range(1, 20)],
    },
    RandomForestClassifier():{
        "n_estimators":[i for i in range(10, 20)],
        "max_depth":[i for i in range(1, 10)],
    },
    KNeighborsClassifier():{
        "n_neighbors":[i for i in range(1, 10)]
    }
}

#랜덤 검색을 위해 모델과 파라미터셋을 정리한 딕셔너리 준비
model_param_set_random = {
    LogisticRegression():{
        "C":scipy.stats.uniform(0.00001, 1000),
        "random_state":scipy.stats.randint(0, 100)
    },
    LinearSVC():{
        "C":scipy.stats.uniform(0.00001, 1000),
        "multi_class":["ovr", "crammer_singer"],
        "random_state":scipy.stats.randint(0, 100)
    },
    SVC():{
        "kernel":["linear", "poly", "rbf", "sigmoid"],
        "C":scipy.stats.uniform(0.00001, 1000),
        "decision_function_shape":["ovr", "ovo"],
        "random_state":scipy.stats.randint(0, 100)
    },
    DecisionTreeClassifier():{
        "max_depth":scipy.stats.randint(1, 20),
    },
    RandomForestClassifier():{
        "n_estimators":scipy.stats.randint(10, 100),
        "max_depth":scipy.stats.randint(1, 20),
    },
    KNeighborsClassifier():{
        "n_neighbors":scipy.stats.randint(1, 20)
    }
}

# 점수 비교용 변수 준비
max_score = 0
best_model = None
best_param = None

#그리드 검색으로 파라미터 검색 수행
for model, param in model_param_set_grid.items():
    clf = GridSearchCV(model, param)
    clf.fit(train_X, train_y)
    pred_y = clf.predict(test_X)
    score = f1_score(test_y, pred_y, average="micro")
    
    #최고 평가 갱신시 모델과 파라미터 업데이트
    if max_score < score:
        max_score = score
        best_model = model.__class__.__name__
        best_param = clf.best_params_
        
#랜덤 검색으로 파라미터 검색 수행
for model, param in model_param_set_random.items():
    clf = RandomizedSearchCV(model, param)
    clf.fit(train_X, train_y)
    pred_y = clf.predict(test_X)
    score = f1_score(test_y, pred_y, average="micro")
    
    #최고 평가 갱신시 모델과 파라미터 업데이트
    if max_score < score:
        max_score = score
        best_model = model.__class__.__name__
        best_param = clf.best_params_
        
print("학습 모델: {}, \n파라미터: {}".format(best_model, best_param))
#가장 성적이 좋은 점수 출력
print("최고 점수: ", max_score)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt



학습 모델: SVC, 
파라미터: {'C': 10, 'decision_function_shape': 'ovr', 'kernel': 'rbf', 'random_state': 42}
최고 점수:  0.9888888888888889
