# 머신 러닝 문제 정의
 
머신러닝으로 해결하고자 하는 문제는 크게 3가지로 나뉩니다.

1. 지도 학습 (Supervised Learning)
    - 정답(label)이 주어진 데이터로 학습하여, 새로운 데이터의 속성을 바탕으로 정답을 예측합니다.
2. 비지도 학습 (Unsupervised Learning)
    - 정답이 주어지지 않았을 때, 데이터 간의 관계를 학습합니다.
3. 강화 학습 (Reinforcement Learning)
    - 연속된 행동의 결과로서 최종 점수를 극대화하는 방법을 학습합니다.

일반적으로 3번으로 갈수록 난이도가 높아집니다. 그래서 오늘은 가장 간단한 방법인 지도 학습(supervised learning)의 예제 몇 가지를 학습해보겠습니다.

대표적인 지도 학습 알고리즘으로는 선형 회귀(linear regression), 로지스틱 회귀(logistic regression), 의사결정 나무(decision tree), 서포트 벡터 머신(support vector machine) 등이 있습니다.

- 회귀 문제는 데이터를 바탕으로 연속적인 수치 값을 예측하는 문제이며, 그중 선형 회귀는 입력과 결과 사이의 직선 관계를 찾습니다.
- 분류 문제는 데이터를 바탕으로 각 샘플이 속한 집단(클래스)을 예측하는 문제입니다.

# 보스턴 집값 예측하기
- 선형 회귀 (Linear Regression)
- y = ax + b

In [1]:
# Load the Boston housing dataset and keep handles to its feature matrix and
# target vector for the cells below.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed scikit-learn version before re-running this cell.
from sklearn.datasets import load_boston

boston = load_boston()
boston_data, boston_target = boston.data, boston.target

# Listing the returned object shows the keys it exposes.
list(boston)

['data', 'target', 'feature_names', 'DESCR', 'filename']

In [2]:
# Display the feature matrix: one row per sample, one column per feature.
boston.data

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [3]:
# Display the target vector, i.e. the values the regression model will predict.
boston.target

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [4]:
# Display the names of the feature columns, in the same order as boston.data.
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [23]:
# Build a DataFrame of the Boston features (one column per feature name) and
# append the regression target so both can be inspected side by side.
# pandas is imported here because this cell uses `pd` before the file's later
# `import pandas as pd`; without it the cell fails when run top to bottom.
import pandas as pd

df = pd.DataFrame(data=boston_data, columns=boston.feature_names)
df['target'] = boston_target

# Preview the first five rows.
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [24]:
# Hold out 20% of the Boston samples as a test set; the fixed random_state
# keeps the split reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    boston_data,
    boston_target,
    test_size=0.2,
    random_state=1,
)

# Report the shape of each of the four resulting arrays.
split_summary = f"X train: {X_train.shape}\nX_test: {X_test.shape}\ny_train: {y_train.shape}\ny_test: {y_test.shape}"
print(split_summary)

X train: (404, 13)
X_test: (102, 13)
y_train: (404,)
y_test: (102,)


In [26]:
# Fit a linear regression model on the training split, then predict the
# held-out test samples.
from sklearn.linear_model import LinearRegression

lm_model = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator itself
pred = lm_model.predict(X_test)

# Preview the first ten predictions.
pred[:10]

array([32.65503184, 28.0934953 , 18.02901829, 21.47671576, 18.8254387 ,
       19.87997758, 32.42014863, 18.06597765, 24.42277848, 27.00977832])

In [33]:
# Coefficient of determination on the test set: R^2 = 1 - SS_res / SS_tot.
ss_res = ((y_test - pred) ** 2).sum()            # squared error of the predictions
ss_tot = ((y_test - y_test.mean()) ** 2).sum()   # squared deviation from the test mean
r2 = 1 - ss_res / ss_tot

# Show the score rounded to two decimals.
r2.round(2)

0.76

In [34]:
# Display the fitted coefficients of the linear model, one per input feature.
lm_model.coef_

array([-1.12386867e-01,  5.80587074e-02,  1.83593559e-02,  2.12997760e+00,
       -1.95811012e+01,  3.09546166e+00,  4.45265228e-03, -1.50047624e+00,
        3.05358969e-01, -1.11230879e-02, -9.89007562e-01,  7.32130017e-03,
       -5.44644997e-01])

In [35]:
# Display the fitted intercept (the constant term b in y = ax + b).
lm_model.intercept_

42.93352585337743

# 붓꽃 품종 예측하기
- 분류 문제 (Classification)
- 3가지 종류의 품종을 예측해보자
- 꽃잎의 너비, 길이 / 꽃받침의 너비, 길이

In [3]:
import sklearn
import pandas as pd
import numpy as np

# Load the iris dataset and keep handles to its features and class labels.
from sklearn.datasets import load_iris

iris = load_iris()
iris_data, iris_label = iris.data, iris.target

# Tabular view: one column per feature name plus the class label.
df = pd.DataFrame(iris_data, columns=iris.feature_names)
df['label'] = iris_label

# Preview the first five rows.
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [6]:
# The label column contains three distinct classes in total.
df['label'].unique()

array([0, 1, 2])

In [9]:
# Split the iris data into training and test sets.
# A test fraction of about 1/5 (0.2) is the usual choice; the fixed
# random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    iris_data,
    iris_label,
    test_size=0.2,
    random_state=1,
)

# Report the shape of each of the four resulting arrays.
split_summary = f"X train: {X_train.shape}\nX_test: {X_test.shape}\ny_train: {y_train.shape}\ny_test: {y_test.shape}"
print(split_summary)

X train: (120, 4)
X_test: (30, 4)
y_train: (120,)
y_test: (30,)


In [12]:
# A simple decision-tree classifier: fit it on the training split (seeded so
# the result is reproducible), then predict the held-out test samples.
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier(random_state=11).fit(X_train, y_train)  # fit() returns the estimator
pred = dt_clf.predict(X_test)

# Preview the first ten predicted class labels.
pred[:10]

array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2])