# 1. load_digits : 손글씨

## 1) 모듈 import

In [1]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


## 2) 데이터 준비

In [2]:
digits = load_digits()

# Print the attribute names themselves; wrapping dir() in type() only ever
# reports <class 'list'> and says nothing about the dataset.
print(dir(digits))


['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']


In [3]:
digits.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])

## 3) 데이터 이해하기

### Feature Data 지정하기

In [4]:
# Feature matrix of the digits dataset (the same array as digits.data).
digits_data = digits["data"]

# Show the array dimensions.
print(digits_data.shape)

(1797, 64)


### Label Data 지정하기

In [5]:
# Target vector of the digits dataset (the same array as digits.target).
digits_label = digits["target"]
print(digits_label.shape)
# Trailing expression so the notebook echoes the array itself.
digits_label

(1797,)


array([0, 1, 2, ..., 8, 9, 8])

### Target Names 출력해 보기

In [6]:
digits.target_names

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

### 데이터 Describe 해 보기

In [7]:
print(digits.DESCR)

.. _digits_dataset:

Optical recognition of handwritten digits dataset
--------------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 1797
    :Number of Attributes: 64
    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
    :Missing Attribute Values: None
    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
    :Date: July; 1998

This is a copy of the test set of the UCI ML hand-written digits datasets
https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

The data set contains images of hand-written digits: 10 classes where
each class refers to a digit.

Preprocessing programs made available by NIST were used to extract
normalized bitmaps of handwritten digits from a preprinted form. From a
total of 43 people, 30 contributed to the training set and different 13
to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of
4x4 and the number of on pixels are counted in each blo

In [None]:
import pandas as pd

# Tabular view for inspection: one column per feature name, with the target
# appended as the last column.
digits_df = pd.DataFrame(digits_data, columns=digits.feature_names)
digits_df.insert(digits_df.shape[1], "label", digits.target)

digits_df

## 4) train, test 데이터 분리

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    digits_data, digits_label, test_size=0.2, random_state=7
)

print('X_train 개수: ', len(X_train), ', X_test 개수: ', len(X_test))

In [None]:
y_train, y_test

##  5) 다양한 모델로 학습시켜보기

### - Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# fit() returns the estimator, so construction and training chain together.
# The fixed seed keeps the fitted tree repeatable between runs.
decision_tree = DecisionTreeClassifier(random_state=32).fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cells below.
y_pred = decision_tree.predict(X_test)

####  모델 평가

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
print(classification_report(y_test, y_pred))

### - Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded forest, trained in the same expression that creates it
# (fit() returns the estimator).
random_forest = RandomForestClassifier(random_state=32).fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cells below.
y_pred = random_forest.predict(X_test)


#### 모델 평가

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
print(classification_report(y_test, y_pred))

### - SVM

In [None]:
from sklearn import svm

# Support vector classifier with library-default hyperparameters, trained in
# the same expression that creates it (fit() returns the estimator).
svm_model = svm.SVC().fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cells below.
y_pred = svm_model.predict(X_test)

#### 모델 평가

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
print(classification_report(y_test, y_pred))

### - SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# SGD shuffles the training data while fitting, so an unseeded model reports
# a different accuracy on every run. Pin the seed (32, the value used for the
# other seeded models in this notebook) so the score quoted in the
# conclusion can be reproduced.
sgd_model = SGDClassifier(random_state=32)

sgd_model.fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cells below.
y_pred = sgd_model.predict(X_test)

#### 모델 평가

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
print(classification_report(y_test, y_pred))

### - Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# The default max_iter=100 generally stops the lbfgs solver before it has
# converged on these unscaled pixel features (scikit-learn then emits a
# ConvergenceWarning), so the reported score comes from an unfinished fit.
# Allow enough iterations for the optimiser to converge.
logistic_model = LogisticRegression(max_iter=10000)

logistic_model.fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cells below.
y_pred = logistic_model.predict(X_test)

#### 모델 평가

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
print(classification_report(y_test, y_pred))

### digits 의 평가지표로는 accuracy가 적합하다.
### 10개 클래스가 비교적 균등하게 분포되어 있고, 분류기의 목적이 Feature Data로부터 예측한 값(pred)이 테스트 데이터의 실제 레이블과 일치하는지를 확인하는 것이기 때문이다.


## *SVM 모델이 accuracy 0.99로 가장 좋은 성능을 보였다*

***

# 2. load_wine : 와인

## 1) 모듈 import

In [None]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


## 2) 데이터 준비

In [None]:
wine = load_wine()

# Print the attribute names themselves; wrapping dir() in type() only ever
# reports <class 'list'> and says nothing about the dataset.
print(dir(wine))


In [None]:
wine.keys()

## 3) 데이터 이해하기

### Feature Data 지정하기

In [None]:
# Feature matrix of the wine dataset (the same array as wine.data).
wine_data = wine["data"]

# Show the array dimensions.
print(wine_data.shape)

In [None]:
wine.feature_names

### Label Data 지정하기

In [None]:
# Target vector of the wine dataset (the same array as wine.target).
wine_label = wine["target"]
print(wine_label.shape)
# Trailing expression so the notebook echoes the array itself.
wine_label

### Target Names 출력해 보기

In [None]:
wine.target_names

### 데이터 Describe 해 보기

In [None]:
print(wine.DESCR)

In [None]:
import pandas as pd

# Tabular view for inspection: one column per feature name, with the target
# appended as the last column.
wine_df = pd.DataFrame(wine_data, columns=wine.feature_names)
wine_df.insert(wine_df.shape[1], "label", wine.target)

wine_df

## 4) train, test 데이터 분리

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    wine_data, wine_label, test_size=0.2, random_state=7
)

print('X_train 개수: ', len(X_train), ', X_test 개수: ', len(X_test))

##  5) 다양한 모델로 학습시켜보기

### - Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# fit() returns the estimator, so construction and training chain together.
# The fixed seed keeps the fitted tree repeatable between runs.
decision_tree = DecisionTreeClassifier(random_state=32).fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cells below.
y_pred = decision_tree.predict(X_test)

####  모델 평가

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
print(classification_report(y_test, y_pred))

### - Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded forest, trained in the same expression that creates it
# (fit() returns the estimator).
random_forest = RandomForestClassifier(random_state=32).fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cells below.
y_pred = random_forest.predict(X_test)


#### 모델 평가

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
print(classification_report(y_test, y_pred))

#### 모든 평가값이 1.00으로 나오는 것이 이상하여, test size가 너무 큰 것이 원인인지 확인하려고 0.15, 0.1, 0.07로 세 차례 변경해 보았으나 결과는 변하지 않았다.
#### wine 데이터는 전체 샘플이 178개뿐이라 test set도 수십 개에 불과하므로, 모든 예측이 맞아 1.00이 나오는 것 자체는 이상한 결과가 아니다.

### - SVM

In [None]:
from sklearn import svm

# Support vector classifier with library-default hyperparameters, trained in
# the same expression that creates it (fit() returns the estimator).
# NOTE(review): the wine features reach the model unscaled; SVC tends to be
# sensitive to feature scale, so a scaling step may be worth trying.
svm_model = svm.SVC().fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cells below.
y_pred = svm_model.predict(X_test)

#### 모델 평가

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
print(classification_report(y_test, y_pred))

### - SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# SGD shuffles the training data while fitting, so an unseeded model reports
# a different accuracy on every run. Pin the seed (32, the value used for the
# other seeded models in this notebook) so the comparison between models can
# be reproduced.
sgd_model = SGDClassifier(random_state=32)

sgd_model.fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cells below.
y_pred = sgd_model.predict(X_test)

#### 모델 평가

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
print(classification_report(y_test, y_pred))

### - Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# The default max_iter=100 generally stops the lbfgs solver before it has
# converged on the unscaled wine features (scikit-learn then emits a
# ConvergenceWarning), so the reported score comes from an unfinished fit.
# Allow enough iterations for the optimiser to converge.
logistic_model = LogisticRegression(max_iter=10000)

logistic_model.fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cells below.
y_pred = logistic_model.predict(X_test)

#### 모델 평가

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
print(classification_report(y_test, y_pred))

### wine 의 평가지표로는 accuracy가 적합하다.
### feature data를 토대로 wine의 품종을 판별하여 target과 일치하는지를 확인하는 분류기이므로 accuracy를 선택.


## *Random Forest 모델이 accuracy 1.00으로 가장 좋은 성능을 보였다*

***

# 3. load_breast_cancer : 유방암

## 1) 모듈 import

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


## 2) 데이터 준비

In [None]:
breast_cancer = load_breast_cancer()

# Print the attribute names themselves; wrapping dir() in type() only ever
# reports <class 'list'> and says nothing about the dataset.
print(dir(breast_cancer))


In [None]:
breast_cancer.keys()

## 3) 데이터 이해하기

### Feature Data 지정하기

In [None]:
# Feature matrix of the breast cancer dataset (same array as breast_cancer.data).
breast_cancer_data = breast_cancer["data"]

# Show the array dimensions.
print(breast_cancer_data.shape)

In [None]:
breast_cancer.feature_names

### Label Data 지정하기

In [None]:
# Target vector of the breast cancer dataset (same array as breast_cancer.target).
breast_cancer_label = breast_cancer["target"]
print(breast_cancer_label.shape)
# Trailing expression so the notebook echoes the array itself.
breast_cancer_label

### Target Names 출력해 보기

In [None]:
breast_cancer.target_names

### 데이터 Describe 해 보기

In [None]:
print(breast_cancer.DESCR)

In [None]:
import pandas as pd

# Tabular view for inspection: one column per feature name, with the target
# appended as the last column.
breast_cancer_df = pd.DataFrame(breast_cancer_data, columns=breast_cancer.feature_names)
breast_cancer_df.insert(breast_cancer_df.shape[1], "label", breast_cancer.target)

breast_cancer_df

## 4) train, test 데이터 분리

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    breast_cancer_data, breast_cancer_label, test_size=0.2, random_state=7
)

print('X_train 개수: ', len(X_train), ', X_test 개수: ', len(X_test))

##  5) 다양한 모델로 학습시켜보기

### - Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# fit() returns the estimator, so construction and training chain together.
# The fixed seed keeps the fitted tree repeatable between runs.
decision_tree = DecisionTreeClassifier(random_state=32).fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cells below.
y_pred = decision_tree.predict(X_test)

####  모델 평가

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
print(classification_report(y_test, y_pred))

### - Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded forest, trained in the same expression that creates it
# (fit() returns the estimator).
random_forest = RandomForestClassifier(random_state=32).fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cells below.
y_pred = random_forest.predict(X_test)


#### 모델 평가

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
print(classification_report(y_test, y_pred))

### - SVM

In [None]:
from sklearn import svm

# Support vector classifier with library-default hyperparameters, trained in
# the same expression that creates it (fit() returns the estimator).
# NOTE(review): the features reach the model unscaled; SVC tends to be
# sensitive to feature scale, so a scaling step may be worth trying.
svm_model = svm.SVC().fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cells below.
y_pred = svm_model.predict(X_test)

#### 모델 평가

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
print(classification_report(y_test, y_pred))

### - SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# SGD shuffles the training data while fitting, so an unseeded model reports
# a different score on every run. Pin the seed (32, the value used for the
# other seeded models in this notebook) so the comparison between models can
# be reproduced.
sgd_model = SGDClassifier(random_state=32)

sgd_model.fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cells below.
y_pred = sgd_model.predict(X_test)

#### 모델 평가

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
print(classification_report(y_test, y_pred))

### - Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# The default max_iter=100 generally stops the lbfgs solver before it has
# converged on the unscaled breast cancer features (scikit-learn then emits a
# ConvergenceWarning), so the reported score comes from an unfinished fit.
# Allow enough iterations for the optimiser to converge.
logistic_model = LogisticRegression(max_iter=10000)

logistic_model.fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cells below.
y_pred = logistic_model.predict(X_test)

#### 모델 평가

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
print(classification_report(y_test, y_pred))

### breast_cancer 의 평가지표로는 recall이 적합하다.
### 실제 양성(악성) 환자를 음성으로 잘못 판독하는 FN 값이 낮아야 하기 때문에 recall 선택


## *Random Forest 모델이 recall 1.00(accuracy 1.00)으로 가장 좋은 성능을 보였다*

# 느낀점 

#### 다양한 데이터를 이용하여 여러 분류기를 사용해 보고, 그렇게 도출된 값을 각 평가 지표에 따라 선별하는 과정을 거치면서 각 분류기의 개요를 조금 알게 되었다.
#### 프로젝트를 진행하며 가장 어려웠던 점은 평가지표를 이해하고 모델 특성에 맞게 선택하는 것이었다. 각 수치가 의미하는 바를 이해하기 힘들었고, 왜 적용되는지와 무엇을 기준으로 적용하는지 또한 어려웠다. 가장 많은 시간을 소비한 부분이기도 하다.
#### 데이터셋을 불러와 train과 test로 split하는 것, 분류기 모델을 간단하게 변경할 수 있는 것이 유익했다.