## 무작정 따라하기

### 0. 환경준비

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
from sklearn.model_selection import train_test_split

import warnings
# Ignore every warning category from this point on (global filter).
warnings.filterwarnings(action='ignore')

### 1. 데이터 로딩

In [2]:
# Read the advertising CSV straight from its GitHub URL into a DataFrame.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/advertising.csv'
data = pd.read_csv(path)

# Trailing expression: display the first rows as the cell output.
data.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


- TV, Radio, Newspaper 광고비용 지출에 따른 판매액 데이터
- 광고비용에 따른 매출액

### 2. 데이터 전처리

#### 1) 데이터를 X와 y로 나누기
- X: features, input
- y: target, output

In [4]:
# Column to predict; every remaining column becomes an input feature.
target = 'Sales'
y = data[target]
x = data.drop(columns=[target])

In [5]:
# Preview the first rows of the feature table (target column removed).
x.head()

Unnamed: 0,TV,Radio,Newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4


In [6]:
# Preview the first values of the target series.
y.head()

0    22.1
1    10.4
2     9.3
3    18.5
4    12.9
Name: Sales, dtype: float64

#### 2) train, test로 분할

In [7]:
# Hold out 30% of the rows as the test set. A fixed random_state makes the
# shuffle deterministic, so the predictions and error below can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2022)

### 3. 학습

In [8]:
# 필요한 함수들 불러오기
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error

In [10]:
# Declare the model: a linear regression with default settings.
model = LinearRegression()

In [11]:
# Train: fit the model on the training features and training target.
model.fit(x_train, y_train)

LinearRegression()

### 4. 예측
- 학습의 결과 모델이 만들어짐
- 모델이 얼마나 정확한지 검증

In [12]:
# Predict the target for the held-out test features.
pred = model.predict(x_test)
# Trailing expression: display the prediction array as the cell output.
pred

array([15.44879715, 10.02542   , 15.81174589, 14.29820885, 18.04544697,
       13.94593875, 15.63052079, 18.75349151,  7.42467152, 18.70458921,
       14.84095237, 14.78543254, 21.54538344, 23.98200536, 14.45955819,
       20.77452798, 12.17205705, 12.1393189 , 10.77321342, 20.33508828,
       16.83598802, 18.8893828 , 21.49029889,  6.7596528 ,  9.1176337 ,
       10.96498503, 20.49059141,  8.25218033, 13.65576416, 15.13450912,
       14.11287885, 16.45458332,  9.41356636, 23.14032921, 10.29537487,
       16.32413042,  9.86484337, 21.25680935,  8.17252095, 13.52387874,
       17.02125307, 15.65891675, 11.85054107,  6.23706005, 11.78997312,
       16.16696818, 11.95261699, 16.74732259, 12.38862469, 15.98542915,
       12.75998492, 10.79018765,  6.68545363, 13.84092333, 10.46838894,
       15.14092368,  8.86779611, 18.11543149, 10.63336588, 17.01836989])

### 5. 평가

In [13]:
# Mean absolute error (MAE) between the actual test targets and the predictions.
mean_absolute_error(y_test, pred)

1.369667865284483

## 실습1 : Diamond Price 예측

### 1. 데이터 로딩

In [14]:
# Read the diamonds CSV straight from its GitHub URL into a DataFrame.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/diamonds.csv'
data = pd.read_csv(path)
# Keep a random sample of 2,000 rows; the fixed seed makes the sample reproducible.
data = data.sample(2000, random_state=2022)
# Trailing expression: display the first sampled rows as the cell output.
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
50989,0.31,Ideal,G,VS2,61.6,55.0,544,4.37,4.39,2.7
42221,0.33,Ideal,E,IF,62.1,55.0,1289,4.43,4.46,2.76
42307,0.41,Ideal,F,VVS1,62.1,57.0,1295,4.75,4.79,2.96
27207,2.02,Very Good,F,SI1,62.7,59.0,17530,7.97,8.03,5.02
22207,1.5,Good,H,VS1,63.4,59.0,10256,7.2,7.29,4.59


### 2. 데이터 전처리

In [15]:
# Separate the target (y) from the input features (x).
target = 'price'
y = data[target]
x = data.drop(columns=[target])

In [16]:
# Dummy (one-hot) encoding of the categorical columns.
# drop_first=True keeps k-1 indicator columns for a k-level category.
col_cat = ['cut', 'color', 'clarity']
x = pd.get_dummies(x, columns=col_cat, drop_first=True)

In [17]:
# Train/test split: hold out 30% of the rows as the test set. A fixed
# random_state makes the shuffle deterministic, so results can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2022)

### 3. 학습

In [18]:
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error

In [19]:
# Declare the model: a linear regression with default settings.
model = LinearRegression()

In [20]:
# Train: fit the model on the training features and training target.
model.fit(x_train, y_train)

LinearRegression()

### 4. 예측

In [21]:
# Predict the target for the held-out test features.
pred = model.predict(x_test)
# Trailing expression: display the prediction array as the cell output.
pred

array([ 3.72967079e+03,  6.41279604e+03,  1.35494212e+03,  3.46259836e+03,
        6.68769277e+03,  1.04238240e+03,  1.05009576e+03,  8.54623937e+03,
        5.97082582e+03,  1.64441481e+03,  4.42366756e+03, -3.11998244e+02,
       -8.56557590e+02,  7.54660131e+03,  7.64931006e+02,  5.95276484e+03,
       -4.65782017e+02,  6.37202733e+03, -3.69718792e+01,  1.95626314e+03,
        4.56043835e+03,  7.51740332e+03,  5.30592767e+03,  1.53352719e+03,
        2.31487978e+03,  4.88436505e+03,  1.50865610e+03,  2.06695751e+03,
        1.16853226e+04,  1.56455081e+03,  3.23497525e+03,  2.83089315e+02,
        2.31973209e+03,  4.43123405e+03,  1.96686332e+03,  2.26685453e+03,
       -2.73652949e+02,  1.16290628e+04,  3.62951782e+03,  1.07400724e+04,
        4.20405454e+03,  9.12067364e+03,  3.49074343e+03,  8.04363898e+01,
        4.25515101e+02,  6.39840353e+03, -5.43462789e+02,  4.42052023e+03,
        4.44456737e+03,  1.12896693e+03,  6.86512658e+03,  8.31993210e+03,
        9.19932941e+02,  

### 5. 평가

In [22]:
# Mean absolute error (MAE) between the actual test targets and the predictions.
mean_absolute_error(y_test, pred)

811.405480731183

## 실습2 : 보스턴시 타운별 집값 예측

### 1. 데이터 로딩

In [23]:
# Read the Boston housing CSV from its GitHub URL, loading only the columns
# named in `cols` (usecols restricts which CSV columns are parsed).
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/boston.csv'
cols = ['crim','indus','chas','nox','rm', 'dis', 'tax','ptratio','lstat','medv']
data = pd.read_csv(path, usecols=cols)
# Trailing expression: display the first rows as the cell output.
data.head()

Unnamed: 0,crim,indus,chas,nox,rm,dis,tax,ptratio,lstat,medv
0,0.00632,2.31,0,0.538,6.575,4.09,296,15.3,4.98,24.0
1,0.02731,7.07,0,0.469,6.421,4.9671,242,17.8,9.14,21.6
2,0.02729,7.07,0,0.469,7.185,4.9671,242,17.8,4.03,34.7
3,0.03237,2.18,0,0.458,6.998,6.0622,222,18.7,2.94,33.4
4,0.06905,2.18,0,0.458,7.147,6.0622,222,18.7,5.33,36.2


### 2. 데이터 전처리

In [24]:
# Separate the target (y) from the input features (x).
target = 'medv'
y = data[target]
x = data.drop(columns=[target])

In [25]:
# Train/test split: hold out 30% of the rows as the test set. A fixed
# random_state makes the shuffle deterministic, so results can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2022)

### 3. 학습

In [26]:
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error

In [27]:
# Declare the model: a linear regression with default settings.
model = LinearRegression()

In [28]:
# Train: fit the model on the training features and training target.
model.fit(x_train, y_train)

LinearRegression()

### 4. 예측

In [29]:
# Predict the target for the held-out test features.
pred = model.predict(x_test)
# Trailing expression: display the prediction array as the cell output.
pred

array([17.18796983, 33.45980829, 35.83365832, 17.50029552, 26.23912148,
       25.64554788, 23.25756849, 11.37221652, 41.50517204, 10.98403998,
       27.07216817, 16.2308305 , 20.54209501, 27.36466212, 32.90729002,
       18.06338953, 28.44214459, 28.15696751, 18.86156809, 30.63890432,
       29.01409258, 21.82377987, 18.33743763, 18.70745346, 32.08538584,
        1.87835   , 23.34506285, 18.80237664, 19.08189159, 21.60670611,
       15.31030547,  0.49670909, 24.3356073 , 17.64863757, 11.21912602,
       13.84033876, 10.83040512, 12.86070036, 17.23276794, 17.76195641,
       25.2579805 , 15.22531726, 42.05895289, 17.98870007, 21.18101057,
       27.04375937, 21.79564113, 20.1942654 , 27.59521949, 24.31567264,
        4.11672544, 24.45703453, 32.69521807, 11.62300328, 16.71684158,
       17.39254544, 17.99685456, 21.05394619, 14.75249743, 10.54460911,
       24.85089526, 23.11655803, 19.82755755, 19.06278451, 18.31190937,
       35.73846546, 25.46812616, 27.90544355, 31.8392701 , 10.30

### 5. 평가

In [30]:
# Mean absolute error (MAE) between the actual test targets and the predictions.
mean_absolute_error(y_test, pred)

3.758699697602604