# 머신러닝 입문

### feature
머신러닝에서 데이터의 특징을 나타내는 변수

### 데이터 타입
- Numeric : 정량적 측정 가능, 단위가 있는 데이터
- Nominal : 범주로 분류가 가능하지만 범주 간 순서는 없음
- Ordinal : 범주로 분류가 가능하면서 순서가 있음

### 회귀
예측하고자 하는 데이터 Y (종속변수)  
Y에 영향을 주는 데이터 X (독립변수, feature)  
각 feature의 가중치 W  

$Y = W^{T}X$ (X와 W의 내적)

### 고려해야 할 문제점
- feature마다 값의 최대/최소 범위(scale)가 다름 (scale 차이에 따라 y값에 미치는 영향이 달라짐)
- Nominal, Ordinal한 값들의 표현
- 잘못 기입된 값 처리
- 극단값 처리

In [63]:
import numpy as np

In [64]:
# Feature and weight column vectors, each of shape (3, 1).
x_vector = np.arange(3, 6).reshape(-1, 1)
w_vector = np.ones_like(x_vector)
# Show the weights as a row vector.
w_vector.T

array([[1, 1, 1]])

In [65]:
# Inner product of the weights and features: (1, 3) @ (3, 1) -> (1, 1).
w_vector.T @ x_vector

array([[12]])

### 데이터 불러오기

In [66]:
import pandas as pd
import numpy as np

In [67]:
# Load the housing data file from the UCI repository. It is parsed as
# whitespace-delimited with no header row, so columns get integer labels.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
# Raw string: '\s' in a plain literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
df_data = pd.read_csv(data_url, sep=r'\s+', header=None)

In [68]:
# Label the 14 columns; the last one, MEDV, is the value to predict.
df_data.columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX',
    'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
# Preview the first rows.
df_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


상수항(절편) `w0`에 곱해지는 입력값 `x0`는 항상 1이므로, 값이 모두 1인 열(`weight_0`)을 추가  
구하려는 목표값(`y`)은 MEDV이므로 feature 행렬에서 해당 열을 제거

In [69]:
# Remove the target column from the feature table (in place).
df_data.drop(columns='MEDV', inplace=True)
# Append a constant column of ones to pair with the intercept weight.
df_data['weight_0'] = 1

In [70]:
# Display the frame: MEDV is gone and weight_0 is the last column.
df_data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,weight_0
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,1
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,1
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,1
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,1
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,1
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,1
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,1
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,1
