# Iris 데이터 정보

## 클래스
* setosa
* versicolor
* virginica

In [2]:
# Load the Iris dataset that ships with scikit-learn.
from sklearn.datasets import load_iris

iris = load_iris()

In [3]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [6]:
iris.data.shape

(150, 4)

In [8]:
iris.target.shape

(150,)

In [11]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [9]:
iris_data = iris.data

In [10]:
iris_data[0]

array([5.1, 3.5, 1.4, 0.2])

In [12]:
iris_label = iris.target

In [13]:
iris_label

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [16]:
iris.target_names
# Class index -> species name: 0 = setosa, 1 = versicolor, 2 = virginica

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [18]:
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [19]:
iris.filename

'/opt/conda/lib/python3.7/site-packages/sklearn/datasets/data/iris.csv'

## Iris 데이터 판다스로 변환

In [20]:
import pandas as pd

# Wrap the feature array in a DataFrame, using the dataset's feature names as column labels.
iris_df = pd.DataFrame(data = iris_data, columns=iris.feature_names)
# Show the first rows to check the conversion.
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [21]:
# Append each sample's class index (0, 1, 2) as a 'label' column.
iris_df['label'] = iris.target
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


* feature : sepal length, sepal width, petal length, petal width -> 특징
* label, target : 붓꽃 종류 클래스(0,1,2)

In [23]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as a test set; random_state fixes the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_label, test_size=0.2, random_state=7)

In [24]:
type(X_train)

numpy.ndarray

In [31]:
# Check whether the classes are evenly represented in the training set.
# The split is random (with a fixed seed), so the per-class counts are not guaranteed to be equal.
# NOTE(review): if balanced classes are required, train_test_split's `stratify` argument may help — confirm.
len(y_train[y_train == 0]), len(y_train[y_train == 1]), len(y_train[y_train == 2])

(43, 38, 39)

## 머신러닝 모델 학습하기

붓꽃의 종류를 알아내는 것은 **지도학습**이고, **분류문제**이다.

In [35]:
from sklearn.tree import DecisionTreeClassifier

# Create a decision tree classifier with a fixed seed and fit it on the training split.
decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(X_train, y_train)

DecisionTreeClassifier(random_state=32)

## 머신러닝 모델 평가하기

In [36]:
y_pred = decision_tree.predict(X_test)

In [37]:
from sklearn.metrics import accuracy_score

# Compare the predictions against the held-out test labels.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9

## 랜덤 포레스트 이용하기
* Decision Tree를 여러개 합쳐서 만든 모델.
* RandomForest의 random_state는 각 Decision Tree를 만들 때 쓰이는 무작위성(부트스트랩 샘플링, 분할에 사용할 feature 선택)을 제어하는 시드 값이다. 같은 값을 주면 같은 결과를 재현할 수 있다.

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Re-split the data with a different seed than the decision-tree section used.
X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_label, test_size=0.2, random_state=21)

# Fit a random forest with a fixed seed, predict on the test split,
# and print the per-class precision / recall / f1-score report.
random_forest = RandomForestClassifier(random_state = 23)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      0.83      0.91        12
           2       0.78      1.00      0.88         7

    accuracy                           0.93        30
   macro avg       0.93      0.94      0.93        30
weighted avg       0.95      0.93      0.93        30



## SVM 이용하기
* Support Vector Machine
* 장점 : 복잡한 비선형 의사결정 영역을 모형화 할 수 있기 때문에 매우 정확하며, 다른 모델들 보다 오버피팅되는 경향이 적다.

In [43]:
from sklearn import svm
# Support vector classifier created with scikit-learn's default arguments.
svm_model = svm.SVC()

# Fit on the training split and print the per-class report for the test split.
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.91      0.83      0.87        12
           2       0.75      0.86      0.80         7

    accuracy                           0.90        30
   macro avg       0.89      0.90      0.89        30
weighted avg       0.91      0.90      0.90        30



## SGD Classifier
* Stochastic Gradient Descent
* 장점
    - 효율성
    - 구현이 간편함
* 단점
    - 정규화 매개변수와 같은 하이퍼파라미터가 필요하다.
    - 특징 scaling에 민감하다

In [44]:
from sklearn.linear_model import SGDClassifier
# SGD-trained linear classifier created with default arguments.
# NOTE(review): no random_state is passed here, so the report below may differ between runs — confirm whether a seed should be fixed.
sgd_model = SGDClassifier()
# Fit on the training split and print the per-class report for the test split.
sgd_model.fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.86      1.00      0.92        12
           2       1.00      0.71      0.83         7

    accuracy                           0.93        30
   macro avg       0.95      0.90      0.92        30
weighted avg       0.94      0.93      0.93        30



## Logistic Regression
회귀를 사용해 데이터가 어떤 범주에 속할 확률을 0~1 사이 값으로 예측하고, 그 확률에 따라 가능성이 더 높은 범주로 분류한다.

In [46]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier created with default arguments.
logistic_model = LogisticRegression()
# Fit on the training split and print the per-class report for the test split.
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      0.83      0.91        12
           2       0.78      1.00      0.88         7

    accuracy                           0.93        30
   macro avg       0.93      0.94      0.93        30
weighted avg       0.95      0.93      0.93        30



## 모델을 평가하는 다양한 방법

* label의 클래스 분포가 불균형하면 정확도(accuracy)만으로는 모델 성능을 제대로 평가하기 어렵다.
* 이때 예측 결과를 실제 값과 비교해, 맞은 경우와 틀린 경우를 유형별로 구분해 표현한 것이 오차행렬(confusion matrix)이다.

### 오차행렬
![image](https://user-images.githubusercontent.com/63278762/125717289-884aa5d3-f425-4806-bbfd-9f85f3d8b86c.png)

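* TP (True Positive) : 실제 양성(Positive)을 양성으로 맞게 예측
* FN (False Negative) : 실제 양성(Positive)을 음성으로 틀리게 예측
* FP (False Positive) : 실제 음성(Negative)을 양성으로 틀리게 예측
* TN (True Negative) : 실제 음성(Negative)을 음성으로 맞게 예측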

##### 성능 지표
* Sensitivity(Recall) : $$\frac{TP}{FN+TP}$$
* Specificity : $$\frac{TN}{TN+FP}$$
* Precision : $$\frac{TP}{FP+TP}$$
* Accuracy : $$\frac{TP+TN}{TP+TN+FP+FN}$$
* F1 Score : $$\frac{2}{\frac{1}{recall}+\frac{1}{precision}}$$

****
* Precision : precision을 높이려면 음성인데 양성으로 판단하는 경우가 적어야 한다.
* Recall : recall을 높이려면 양성인데 음성으로 판단하는 경우가 적어야 한다.

****
위의 precision, recall, f1-score를 한눈에 보는 방법 = classification_report

# toy datasets에 대해서

## digits
* 손글씨 이미지 데이터

* digits의 이미지 크기는 8x8
* Class : 10개
* 샘플 총 개수 : 1797

In [48]:
# Load the handwritten-digits toy dataset from scikit-learn.
from sklearn.datasets import load_digits
digits = load_digits()

In [54]:
digits.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])

In [55]:
digits.data.shape

(1797, 64)

In [57]:
digits.target_names

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [58]:
digits.feature_names

['pixel_0_0',
 'pixel_0_1',
 'pixel_0_2',
 'pixel_0_3',
 'pixel_0_4',
 'pixel_0_5',
 'pixel_0_6',
 'pixel_0_7',
 'pixel_1_0',
 'pixel_1_1',
 'pixel_1_2',
 'pixel_1_3',
 'pixel_1_4',
 'pixel_1_5',
 'pixel_1_6',
 'pixel_1_7',
 'pixel_2_0',
 'pixel_2_1',
 'pixel_2_2',
 'pixel_2_3',
 'pixel_2_4',
 'pixel_2_5',
 'pixel_2_6',
 'pixel_2_7',
 'pixel_3_0',
 'pixel_3_1',
 'pixel_3_2',
 'pixel_3_3',
 'pixel_3_4',
 'pixel_3_5',
 'pixel_3_6',
 'pixel_3_7',
 'pixel_4_0',
 'pixel_4_1',
 'pixel_4_2',
 'pixel_4_3',
 'pixel_4_4',
 'pixel_4_5',
 'pixel_4_6',
 'pixel_4_7',
 'pixel_5_0',
 'pixel_5_1',
 'pixel_5_2',
 'pixel_5_3',
 'pixel_5_4',
 'pixel_5_5',
 'pixel_5_6',
 'pixel_5_7',
 'pixel_6_0',
 'pixel_6_1',
 'pixel_6_2',
 'pixel_6_3',
 'pixel_6_4',
 'pixel_6_5',
 'pixel_6_6',
 'pixel_6_7',
 'pixel_7_0',
 'pixel_7_1',
 'pixel_7_2',
 'pixel_7_3',
 'pixel_7_4',
 'pixel_7_5',
 'pixel_7_6',
 'pixel_7_7']

## wine
* 와인 데이터

In [59]:
# Load the wine toy dataset from scikit-learn.
from sklearn.datasets import load_wine
wine = load_wine()

In [60]:
wine.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

In [61]:
wine.data.shape

(178, 13)

In [63]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [66]:
wine.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

## breast_cancer
* 유방암 데이터

In [67]:
# Load the breast-cancer toy dataset from scikit-learn.
from sklearn.datasets import load_breast_cancer
breast_cancer = load_breast_cancer()

In [69]:
breast_cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [71]:
# Shape of the data array: (number of samples, number of features)
breast_cancer.data.shape

(569, 30)

In [73]:
# Feature names, together with the shape of the feature-name array
breast_cancer.feature_names,breast_cancer.feature_names.shape 

(array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
        'mean smoothness', 'mean compactness', 'mean concavity',
        'mean concave points', 'mean symmetry', 'mean fractal dimension',
        'radius error', 'texture error', 'perimeter error', 'area error',
        'smoothness error', 'compactness error', 'concavity error',
        'concave points error', 'symmetry error',
        'fractal dimension error', 'worst radius', 'worst texture',
        'worst perimeter', 'worst area', 'worst smoothness',
        'worst compactness', 'worst concavity', 'worst concave points',
        'worst symmetry', 'worst fractal dimension'], dtype='<U23'),
 (30,))

In [75]:
# Label classes
# malignant : malignant tumor
# benign : benign tumor
breast_cancer.target_names

array(['malignant', 'benign'], dtype='<U9')