In [1]:
# (1) 필요한 모듈 import
import numpy as np
from sklearn.datasets import load_digits
from sklearn.datasets import load_wine
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone


# (2) 데이터 준비
digits = load_digits()
print(digits.DESCR)
print(digits.target_names)
digits_data = digits.data
digits_label = digits.target


# (3) train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(digits_data, digits_label, test_size=0.2)


# (4) 모델 학습 및 예측
decisiontree = DecisionTreeClassifier()
clone_dct = clone(decisiontree)
clone_dct.fit(X_train, y_train)
y_pred = clone_dct.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

randomforest = RandomForestClassifier()
clone_rdf = clone(randomforest)
clone_rdf.fit(X_train, y_train)
y_pred = clone_rdf.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

svm_m = svm.SVC()
clone_svm = clone(svm_m)
clone_svm.fit(X_train,y_train)
y_pred = clone_svm.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

sgd = SGDClassifier() #loss : Defaults to ‘hinge’, which gives a linear SVM
clone_sgd = clone(sgd)
clone_sgd.fit(X_train, y_train)
y_pred = clone_sgd.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

logisticregression = LogisticRegression(solver='lbfgs', max_iter=5000)
clone_lgr = clone(logisticregression)
clone_lgr.fit(X_train, y_train)
y_pred = clone_lgr.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

.. _digits_dataset:

Optical recognition of handwritten digits dataset
--------------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 5620
    :Number of Attributes: 64
    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
    :Missing Attribute Values: None
    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
    :Date: July; 1998

This is a copy of the test set of the UCI ML hand-written digits datasets
https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

The data set contains images of hand-written digits: 10 classes where
each class refers to a digit.

Preprocessing programs made available by NIST were used to extract
normalized bitmaps of handwritten digits from a preprinted form. From a
total of 43 people, 30 contributed to the training set and different 13
to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of
4x4 and the number of on pixels are counted in each blo

In [2]:
num=100
dct_avg_acc=0
rdf_avg_acc=0
svm_avg_acc=0
sgd_avg_acc=0
lgr_avg_acc=0

decisiontree = DecisionTreeClassifier()
randomforest = RandomForestClassifier()
svm_m = svm.SVC()
sgd = SGDClassifier()
logisticregression = LogisticRegression(solver='lbfgs', max_iter=10000)

for i in range(num):
    X_train, X_test, y_train, y_test = train_test_split(digits_data, digits_label, test_size=0.2)
    
    
    clone_dct = clone(decisiontree)
    clone_dct.fit(X_train, y_train)
    y_pred = clone_dct.predict(X_test)
    dct_list=list(map(str,classification_report(y_test, y_pred).split())) #classification_report 결과를 리스트로 생성
    #dct_list.index('accuracy') 코드로 index 54 에 accuracy가 위치하고 그다음 index 55 에 accuracy확률이 위치함
    dct_avg_acc+=float(dct_list[55])
      

    clone_rdf = clone(randomforest)
    clone_rdf.fit(X_train, y_train)
    y_pred = clone_rdf.predict(X_test)
    rdf_list=list(map(str,classification_report(y_test, y_pred).split()))
    rdf_avg_acc+=float(rdf_list[55])
      

    clone_svm = clone(svm_m)
    clone_svm.fit(X_train,y_train)
    y_pred=clone_svm.predict(X_test)
    svm_list=list(map(str,classification_report(y_test, y_pred).split()))
    svm_avg_acc+=float(svm_list[55])
    

    clone_sgd = clone(sgd)
    clone_sgd.fit(X_train, y_train)
    y_pred = clone_sgd.predict(X_test)
    sgd_list=list(map(str,classification_report(y_test, y_pred).split()))
    sgd_avg_acc+=float(sgd_list[55])
    
    
    clone_lgr = clone(logisticregression)
    clone_lgr.fit(X_train, y_train)
    y_pred = clone_lgr.predict(X_test)
    lgr_list=list(map(str,classification_report(y_test, y_pred).split()))
    lgr_avg_acc+=float(lgr_list[55])


print('DecisionTreeClassifier accuracy :',dct_avg_acc/num)
print('RandomForestClassifier accuracy :',rdf_avg_acc/num)
print('SVM accuracy :',svm_avg_acc/num)
print('SGDClassifier accuracy :',sgd_avg_acc/num)
print('LogisticRegression accuracy :',lgr_avg_acc/num)

DecisionTreeClassifier accuracy : 0.8523999999999995
RandomForestClassifier accuracy : 0.9744999999999994
SVM accuracy : 0.9869999999999992
SGDClassifier accuracy : 0.9496000000000001
LogisticRegression accuracy : 0.9625999999999993


0 ~ 9 숫자를 정확하게 분류할수록 좋기 때문에 accuracy 값이 높을수록 좋다. 학습과 시험을 100번 수행한 평균 결과 SVM의 정확도가 가장 높다.

In [3]:
# (2) 데이터 준비
wine = load_wine()
print(wine.DESCR)
print(wine.target_names)
wine_data = wine.data
wine_label = wine.target


# (3) train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(wine_data, wine_label, test_size=0.2)


# (4) 모델 학습 및 예측
decisiontree = DecisionTreeClassifier()
clone_dct = clone(decisiontree)
clone_dct.fit(X_train, y_train)
y_pred = clone_dct.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

randomforest = RandomForestClassifier()
clone_rdf = clone(randomforest)
clone_rdf.fit(X_train, y_train)
y_pred = clone_rdf.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

svm_m = svm.SVC()
clone_svm = clone(svm_m)
clone_svm.fit(X_train,y_train)
y_pred = clone_svm.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

sgd = SGDClassifier()
clone_sgd = clone(sgd)
clone_sgd.fit(X_train, y_train)
y_pred = clone_sgd.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

logisticregression = LogisticRegression(solver='lbfgs', max_iter=10000)
clone_lgr = clone(logisticregression)
clone_lgr.fit(X_train, y_train)
y_pred = clone_lgr.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [4]:
dct_avg_acc=0
rdf_avg_acc=0
svm_avg_acc=0
sgd_avg_acc=0
lgr_avg_acc=0

decisiontree = DecisionTreeClassifier()
randomforest = RandomForestClassifier()
svm_m = svm.SVC()
sgd = SGDClassifier()
logisticregression = LogisticRegression(solver='lbfgs', max_iter=10000)


for i in range(num):
    X_train, X_test, y_train, y_test = train_test_split(wine_data, wine_label, test_size=0.2)
    
    clone_dct = clone(decisiontree)
    clone_dct.fit(X_train, y_train)
    y_pred = clone_dct.predict(X_test)
    dct_list=list(map(str,classification_report(y_test, y_pred).split())) #classification_report 결과를 리스트로 생성
    #dct_list.index('accuracy') 코드로 index 19 에 accuracy가 위치하고 그다음 index 20 에 accuracy확률이 위치함
    dct_avg_acc+=float(dct_list[20])
    
    
    clone_rdf = clone(randomforest)
    clone_rdf.fit(X_train,y_train)
    y_pred = clone_rdf.predict(X_test)
    rdf_list=list(map(str,classification_report(y_test, y_pred).split()))
    rdf_avg_acc+=float(rdf_list[20])
    
    
    clone_svm = clone(svm_m)
    clone_svm.fit(X_train,y_train)
    y_pred = clone_svm.predict(X_test)
    svm_list=list(map(str,classification_report(y_test, y_pred).split()))
    svm_avg_acc+=float(svm_list[20])
    
    
    clone_sgd = clone(sgd)
    clone_sgd.fit(X_train, y_train)
    y_pred = clone_sgd.predict(X_test)
    sgd_list=list(map(str,classification_report(y_test, y_pred).split()))
    sgd_avg_acc+=float(sgd_list[20])
    
    
    clone_lgr = clone(logisticregression)
    clone_lgr.fit(X_train, y_train)
    y_pred = clone_lgr.predict(X_test)
    lgr_list=list(map(str,classification_report(y_test, y_pred).split()))
    lgr_avg_acc+=float(lgr_list[20])

    
print('DecisionTreeClassifier accuracy :',dct_avg_acc/num)
print('RandomForestClassifier accuracy :',rdf_avg_acc/num)
print('SVM accuracy :',svm_avg_acc/num)
print('SGDClassifier accuracy :',sgd_avg_acc/num)
print('LogisticRegression accuracy :',lgr_avg_acc/num)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

DecisionTreeClassifier accuracy : 0.9035
RandomForestClassifier accuracy : 0.9788999999999994
SVM accuracy : 0.671
SGDClassifier accuracy : 0.5984
LogisticRegression accuracy : 0.9578999999999996


3가지 클래스로 와인을 정확하게 분류할수록 좋기 때문에 accuracy 값이 높을수록 좋다. 학습과 시험을 100번 수행한 평균 결과 RandomForestClassifier의 정확도가 가장 높다.
f1-score 계산에서 precision + recall이 분모인데 class_2가 예측값으로 나타나지 않아 precision + recall값이 0이되어 SVM f1-score 계산에서 zero_division 에러가 발생하였다.

In [5]:
# (2) 데이터 준비
breast_cancer = load_breast_cancer()
print(breast_cancer.DESCR)
print(breast_cancer.target_names)
breast_cancer_data = breast_cancer.data
breast_cancer_label = breast_cancer.target


# (3) train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data, breast_cancer_label, test_size=0.2)


# (4) 모델 학습 및 예측
decisiontree = DecisionTreeClassifier()
clone_dct = clone(decisiontree)
clone_dct.fit(X_train, y_train)
y_pred = clone_dct.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

randomforest = RandomForestClassifier()
clone_rdf = clone(randomforest)
clone_rdf.fit(X_train, y_train)
y_pred = clone_rdf.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

svm_m = svm.SVC()
clone_svm = clone(svm_m)
clone_svm.fit(X_train,y_train)
y_pred = clone_svm.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

sgd = SGDClassifier()
clone_sgd = clone(sgd)
clone_sgd.fit(X_train, y_train)
y_pred = clone_sgd.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

logisticregression = LogisticRegression(solver='lbfgs', max_iter=30000)
clone_lgr = clone(logisticregression)
clone_lgr.fit(X_train, y_train)
y_pred = clone_lgr.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [6]:
dct_avg_recall=0
rdf_avg_recall=0
svm_avg_recall=0
sgd_avg_recall=0
lgr_avg_recall=0

decisiontree = DecisionTreeClassifier()
randomforest = RandomForestClassifier()
svm_m = svm.SVC()
sgd = SGDClassifier()
logisticregression = LogisticRegression(solver='lbfgs', max_iter=10000)


for i in range(num):
    X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data, breast_cancer_label, test_size=0.2)
    
    
    clone_dct = clone(decisiontree)
    clone_dct.fit(X_train, y_train)
    y_pred = clone_dct.predict(X_test)
    dct_list=list(map(str,classification_report(y_test, y_pred).split())) #classification_report 결과를 리스트로 생성
    #dct_list 에서 malignant recall값 인덱스 6
    dct_avg_recall+=float(dct_list[6])
    
    
    clone_rdf = clone(randomforest)
    clone_rdf.fit(X_train,y_train)
    y_pred = clone_rdf.predict(X_test)
    rdf_list=list(map(str,classification_report(y_test, y_pred).split()))
    rdf_avg_recall+=float(rdf_list[6])
    
   
    clone_svm = clone(svm_m)
    clone_svm.fit(X_train,y_train)
    y_pred = clone_svm.predict(X_test)
    svm_list=list(map(str,classification_report(y_test, y_pred).split()))
    svm_avg_recall+=float(svm_list[6])
    

    clone_sgd = clone(sgd)
    clone_sgd.fit(X_train, y_train)
    y_pred = clone_sgd.predict(X_test)
    sgd_list=list(map(str,classification_report(y_test, y_pred).split()))
    sgd_avg_recall+=float(sgd_list[6])
    
    
    clone_lgr = clone(logisticregression)
    clone_lgr.fit(X_train, y_train)
    y_pred = clone_lgr.predict(X_test)
    lgr_list=list(map(str,classification_report(y_test, y_pred).split()))
    lgr_avg_recall+=float(lgr_list[6])

    
print('DecisionTreeClassifier malignant recall :',dct_avg_recall/num)
print('RandomForestClassifier malignant recall :',rdf_avg_recall/num)
print('SVM malignant recall :',svm_avg_recall/num)
print('SGDClassifier malignant recall :',sgd_avg_recall/num)
print('LogisticRegression malignant recall :',lgr_avg_recall/num)

DecisionTreeClassifier malignant recall : 0.9120999999999999
RandomForestClassifier malignant recall : 0.9390999999999998
SVM malignant recall : 0.8026000000000001
SGDClassifier malignant recall : 0.8475000000000001
LogisticRegression malignant recall : 0.9237000000000001


유방암이 양성인 경우를 최대한 놓치지 않아야 하기에 malignant recall 값이 높을수록 좋다. 학습과 시험을 100번 수행한 평균 결과 RandomForest의 malignant recall 값이 가장 높다.

Decision Tree
가중치와 불순도를 곱하여 정보획득량이 최대한 커지도록 기준을 설정
기준을 반복적으로 설정하며 정보획득량이 없거나 트리의 정해진 최대 크기 단계에 도달할때까지 실행
탐욕 알고리즘으로 최적의 트리가 아닐 가능성 있음
트리의 최대크기 지정이나 가지치기를 통해 오버피팅을 줄여줘야함

Random Forest
앙상블학습법
중복을 허용한 학습데이터 일부를 뽑아 트리 생성을 반복한다. 이때 전체 속성이 아닌 일부 속성만 이용하여 트리의 다양성을 높인다. 각 트리의 분류결과 중 다수인 결과를 최종적으로 선택한다.

SVM
결정경계 데이터간의 분류기준선을 정하는 모델
설정된 커널을 통해 높은 차원으로 바꾼 뒤 결정경계 구분
이상치를 어떻게 처리할지 파라미터 C 값으로 마진을 설정하는 정도에 따라 오버피팅과 언더피팅이 발생할 수 있다.
디폴트 커널인 RBF 커널은 결정경계의 유연성을 정하는 gamma 값에 따라 오버피팅과 언더피팅이 발생할 수 있다.

SGD Classifier
확률적 경사하강법 최적화 기술
SVM, Logistic Regression 등 loss function을 이용해서 피팅 할 때 확률적 경사하강법을 이용하여 효율성을 높인다.
최상의 결과를 얻으려면 데이터의 평균과 단위 분산이 0이어야 한다.

Logistic Regression
log-odds를 구한 후 Sigmoid 함수를 적용하여 데이터가 속할 확률을 0에서 1 사이의 값으로 예측하고 그 확률에 따라 이진 분류 
필요에 따라 분류 임계값 변경 가능
log-odds 사건이 발생할 확률을 발생하지 하지 않을 확률로 나눈 값에 로그를 취한 것
다중 분류에 이용할 경우 one-vs-all 방법으로 분류 가짓수 만큼 나눠서 실행