# Scikit-learn


#### 사이킷런이란?

- 파이썬을 활용한 머신러닝 도구
- 데이터 분석(예측)을 위한 간단하고 효율적인 도구
- 누구나 쉽게 다양한 상황에서 활용 가능
- 오픈소스

#### 사이킷런으로 할 수 있는 것

- 분류 (e.g., 스팸)
- 회귀 (e.g., 가격)
- 클러스터링 (e.g., 고객 세그먼트)
- 차원축소 (e.g., 변수/컬럼의 수를 줄임)
- 모델 선택 (e.g., 모델 튜닝, 평가)
- 전처리 (e.g., 데이터 가공/변환)

<br>

## 데이터 확인

In [28]:
# Confirm that scikit-learn is importable and print the installed version
import sklearn
print(sklearn.__version__)

1.6.1


In [None]:
# Build the sample chicken-menu table used by the rest of the notebook
import pandas as pd
import numpy as np

# One list per column, nine rows each; np.nan marks the missing discount
# figures of the last menu item.
data = pd.DataFrame(
    data={
        '메뉴': [
            '[인기]아이펠치킨', '닭강정', '간장치킨',
            '마늘치킨', '파닭', '승일양념치킨',
            '양념반후라이드반', '황금후라이드', '[베스트]풀잎치킨',
        ],
        '가격': [
            16000, 15000, 14000,
            14000, 14000, 13000,
            13000, 12000, 9900,
        ],
        '호수': [
            11, 12, 9,
            9, 11, 10,
            10, 10, 10,
        ],
        '칼로리': [
            1200.0, 1500.0, 1600.0,
            1800.0, 1300.0, 1400.0,
            1300.0, 1000.0, 1000.0,
        ],
        '할인율': [
            0.5, 0.2, 0.2,
            0.2, 0.2, 0.2,
            0.2, 0.2, np.nan,
        ],
        '할인가': [
            8000.0, 12000.0, 11200.0,
            11200.0, 11200.0, 10400.0,
            10400.0, 9600.0, np.nan,
        ],
        '원산지': [
            '국내산', '브라질', '국내산',
            '국내산', '브라질', '국내산',
            '국내산', '국내산', '국내산',
        ],
        '살찔까요': [
            'no', 'yes', 'yes',
            'yes', 'yes', 'yes',
            'yes', 'no', 'no',
        ],
        '고민': [
            '무조건먹자', '먹지말자', '먹지말자',
            '먹지말자', '먹지말자', '먹지말자',
            '먹지말자', '무조건먹자', '무조건먹자',
        ],
    }
)

# Save the table as CSV without the index column, then display it
data.to_csv(path_or_buf='final_modudak.csv', index=False)
data

Unnamed: 0,메뉴,가격,호수,칼로리,할인율,할인가,원산지,살찔까요,고민
0,[인기]아이펠치킨,16000,11,1200.0,0.5,8000.0,국내산,no,무조건먹자
1,닭강정,15000,12,1500.0,0.2,12000.0,브라질,yes,먹지말자
2,간장치킨,14000,9,1600.0,0.2,11200.0,국내산,yes,먹지말자
3,마늘치킨,14000,9,1800.0,0.2,11200.0,국내산,yes,먹지말자
4,파닭,14000,11,1300.0,0.2,11200.0,브라질,yes,먹지말자
5,승일양념치킨,13000,10,1400.0,0.2,10400.0,국내산,yes,먹지말자
6,양념반후라이드반,13000,10,1300.0,0.2,10400.0,국내산,yes,먹지말자
7,황금후라이드,12000,10,1000.0,0.2,9600.0,국내산,no,무조건먹자
8,[베스트]풀잎치킨,9900,10,1000.0,,,국내산,no,무조건먹자


In [None]:
# Modify the data: overwrite the origin ('원산지') of the row labelled 2 with '미국'
data.loc[2, '원산지'] = '미국'

In [None]:
# Select the columns to work with
df = data[['가격', '호수', '칼로리', '원산지', '살찔까요']].copy()  # double brackets select a DataFrame; .copy() keeps later edits off `data`
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes
5,13000,10,1400.0,국내산,yes
6,13000,10,1300.0,국내산,yes
7,12000,10,1000.0,국내산,no
8,9900,10,1000.0,국내산,no


<br>

## 데이터 전처리: 범주형 데이터

- 레이블 인코딩
- 원핫 인코딩

In [4]:
# Check each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   가격      9 non-null      int64  
 1   호수      9 non-null      int64  
 2   칼로리     9 non-null      float64
 3   원산지     9 non-null      object 
 4   살찔까요    9 non-null      object 
dtypes: float64(1), int64(2), object(2)
memory usage: 492.0+ bytes


<br>

### 레이블(label) 인코딩

In [9]:
# Label encoding: create the encoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Two-step alternative to fit_transform, left disabled:
# le.fit(df['원산지'])
# le.transform(df['원산지'])

In [10]:
# fit_transform: fit the encoder on the column and return the encoded values in one call
le.fit_transform(df['원산지'])

array([0, 2, 1, 0, 2, 0, 0, 0, 0])

In [11]:
# Replace the '원산지' column with its label-encoded values
df['원산지'] = le.fit_transform(df['원산지'])
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,no
1,15000,12,1500.0,2,yes
2,14000,9,1600.0,1,yes
3,14000,9,1800.0,0,yes
4,14000,11,1300.0,2,yes
5,13000,10,1400.0,0,yes
6,13000,10,1300.0,0,yes
7,12000,10,1000.0,0,no
8,9900,10,1000.0,0,no


In [12]:
# Label-encode '살찔까요' with a new encoder instance
le = LabelEncoder()
df['살찔까요'] = le.fit_transform(df['살찔까요'])
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,0
1,15000,12,1500.0,2,1
2,14000,9,1600.0,1,1
3,14000,9,1800.0,0,1
4,14000,11,1300.0,2,1
5,13000,10,1400.0,0,1
6,13000,10,1300.0,0,1
7,12000,10,1000.0,0,0
8,9900,10,1000.0,0,0


<br>

### 레이블 인코딩(심화)

- 여러 개 컬럼을 한 번에 인코딩

In [13]:
# Start again from a fresh copy of the selected columns of `data`
df = data[['가격', '호수', '칼로리', '원산지', '살찔까요']].copy()
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes
5,13000,10,1400.0,국내산,yes
6,13000,10,1300.0,국내산,yes
7,12000,10,1000.0,국내산,no
8,9900,10,1000.0,국내산,no


In [17]:
# Select the object-dtype columns
# Option 1 (manual list): cols = ['원산지', '살찔까요']
cols = df.select_dtypes(include='object').columns

In [19]:
# Label-encode every selected column in one pass
from sklearn.preprocessing import LabelEncoder

for col in cols:
    # A fresh encoder per column, so each column is fitted on its own values
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [20]:
# Inspect the encoded result
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,0
1,15000,12,1500.0,2,1
2,14000,9,1600.0,1,1
3,14000,9,1800.0,0,1
4,14000,11,1300.0,2,1
5,13000,10,1400.0,0,1
6,13000,10,1300.0,0,1
7,12000,10,1000.0,0,0
8,9900,10,1000.0,0,0


<br>

### 원핫(one-hot) 인코딩

In [21]:
# Start again from a fresh copy of the selected columns of `data`
df = data[['가격', '호수', '칼로리', '원산지', '살찔까요']].copy()
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes
5,13000,10,1400.0,국내산,yes
6,13000,10,1300.0,국내산,yes
7,12000,10,1000.0,국내산,no
8,9900,10,1000.0,국내산,no


In [27]:
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output=False)  # sparse_output=False: get the result back as an array
cat = ohe.fit_transform(df[['원산지']])     # double brackets pass a DataFrame (2-D) to fit_transform
cat

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [29]:
# Categories learned by the encoder
ohe.categories_

[array(['국내산', '미국', '브라질'], dtype=object)]

In [31]:
# Output column names: feature (column) name + category
ohe.get_feature_names_out()

array(['원산지_국내산', '원산지_미국', '원산지_브라질'], dtype=object)

In [33]:
# Wrap the encoded array in a DataFrame labelled with the encoder's feature names.
# Reuse df's index so that a later pd.concat(axis=1) lines the rows up even
# when df does not carry the default 0..n-1 index.
df_cat = pd.DataFrame(cat, columns=ohe.get_feature_names_out(), index=df.index)
df_cat

Unnamed: 0,원산지_국내산,원산지_미국,원산지_브라질
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0
6,1.0,0.0,0.0
7,1.0,0.0,0.0
8,1.0,0.0,0.0


In [35]:
# Join the one-hot columns onto the original frame side by side (axis=1)
df = pd.concat([df, df_cat], axis=1)
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요,원산지_국내산,원산지_미국,원산지_브라질
0,16000,11,1200.0,국내산,no,1.0,0.0,0.0
1,15000,12,1500.0,브라질,yes,0.0,0.0,1.0
2,14000,9,1600.0,미국,yes,0.0,1.0,0.0
3,14000,9,1800.0,국내산,yes,1.0,0.0,0.0
4,14000,11,1300.0,브라질,yes,0.0,0.0,1.0
5,13000,10,1400.0,국내산,yes,1.0,0.0,0.0
6,13000,10,1300.0,국내산,yes,1.0,0.0,0.0
7,12000,10,1000.0,국내산,no,1.0,0.0,0.0
8,9900,10,1000.0,국내산,no,1.0,0.0,0.0


In [36]:
# Drop the original '원산지' column; its one-hot columns remain
df = df.drop(['원산지'], axis=1)
df

Unnamed: 0,가격,호수,칼로리,살찔까요,원산지_국내산,원산지_미국,원산지_브라질
0,16000,11,1200.0,no,1.0,0.0,0.0
1,15000,12,1500.0,yes,0.0,0.0,1.0
2,14000,9,1600.0,yes,0.0,1.0,0.0
3,14000,9,1800.0,yes,1.0,0.0,0.0
4,14000,11,1300.0,yes,0.0,0.0,1.0
5,13000,10,1400.0,yes,1.0,0.0,0.0
6,13000,10,1300.0,yes,1.0,0.0,0.0
7,12000,10,1000.0,no,1.0,0.0,0.0
8,9900,10,1000.0,no,1.0,0.0,0.0


<br>

### 원핫 인코딩(심화)

- 여러 개 컬럼을 한 번에 인코딩

In [57]:
# Start again from a fresh copy of the selected columns of `data`
df = data[['가격', '호수', '칼로리', '원산지', '살찔까요']].copy()
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes
5,13000,10,1400.0,국내산,yes
6,13000,10,1300.0,국내산,yes
7,12000,10,1000.0,국내산,no
8,9900,10,1000.0,국내산,no


In [58]:
# One-hot encode every object-dtype column at once
cols = df.select_dtypes(include='object').columns
ohe = OneHotEncoder(sparse_output=False)
cat = ohe.fit_transform(df[cols])
# Reuse df's index so that a later pd.concat(axis=1) lines the rows up even
# when df does not carry the default 0..n-1 index.
df_cat = pd.DataFrame(cat, columns=ohe.get_feature_names_out(), index=df.index)
df_cat

Unnamed: 0,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,1.0
2,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,1.0
5,1.0,0.0,0.0,0.0,1.0
6,1.0,0.0,0.0,0.0,1.0
7,1.0,0.0,0.0,1.0,0.0
8,1.0,0.0,0.0,1.0,0.0


In [59]:
# Join the one-hot columns onto the original frame side by side (axis=1)
df = pd.concat([df, df_cat], axis=1)
df.head()

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,국내산,no,1.0,0.0,0.0,1.0,0.0
1,15000,12,1500.0,브라질,yes,0.0,0.0,1.0,0.0,1.0
2,14000,9,1600.0,미국,yes,0.0,1.0,0.0,0.0,1.0
3,14000,9,1800.0,국내산,yes,1.0,0.0,0.0,0.0,1.0
4,14000,11,1300.0,브라질,yes,0.0,0.0,1.0,0.0,1.0


In [60]:
# Drop the original categorical columns listed in `cols`
df = df.drop(cols, axis=1)
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,1.0,0.0,0.0,1.0,0.0
1,15000,12,1500.0,0.0,0.0,1.0,0.0,1.0
2,14000,9,1600.0,0.0,1.0,0.0,0.0,1.0
3,14000,9,1800.0,1.0,0.0,0.0,0.0,1.0
4,14000,11,1300.0,0.0,0.0,1.0,0.0,1.0
5,13000,10,1400.0,1.0,0.0,0.0,0.0,1.0
6,13000,10,1300.0,1.0,0.0,0.0,0.0,1.0
7,12000,10,1000.0,1.0,0.0,0.0,1.0,0.0
8,9900,10,1000.0,1.0,0.0,0.0,1.0,0.0


<br>

### [Tip] 원핫 인코딩: 판다스 활용

In [66]:
# Start again from a fresh copy of the selected columns of `data`
df = data[['가격', '호수', '칼로리', '원산지', '살찔까요']].copy()
df.head()

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes


In [67]:
# One-hot encoding with pandas
df = pd.get_dummies(df, dtype=int)  # dtype=int yields 0/1 columns; the default is dtype=bool
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,1,0,0,1,0
1,15000,12,1500.0,0,0,1,0,1
2,14000,9,1600.0,0,1,0,0,1
3,14000,9,1800.0,1,0,0,0,1
4,14000,11,1300.0,0,0,1,0,1
5,13000,10,1400.0,1,0,0,0,1
6,13000,10,1300.0,1,0,0,0,1
7,12000,10,1000.0,1,0,0,1,0
8,9900,10,1000.0,1,0,0,1,0


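Train 데이터와 Test 데이터의 컬럼(개수와 순서)은 반드시 일치해야 한다.  
`pd.get_dummies`는 전달된 데이터에 실제로 존재하는 범주만 컬럼으로 만들기 때문에, Train과 Test의 범주가 다르면 컬럼이 달라질 수 있다. 그럴 때는 판다스가 아닌 사이킷런의 `OneHotEncoder`를 Train 데이터로 `fit`한 뒤 Train/Test 모두에 `transform`을 적용해야 한다. (Test에만 있는 범주는 `handle_unknown='ignore'` 옵션으로 처리할 수 있다.)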

<br>

## 데이터 전처리: 수치형 데이터

- 표준화
- 정규화(Min-Max)

<br>

### 표준화 (StandardScaler)

- 평균이 0, 분산이 1이 되도록 변환 (값의 위치와 척도만 바꾸며, 분포의 모양을 정규분포로 바꾸지는 않는다)

$$
z = \frac{x - \mu}{\sigma}
$$


In [68]:
# The one-hot encoded data
df.head()

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,1,0,0,1,0
1,15000,12,1500.0,0,0,1,0,1
2,14000,9,1600.0,0,1,0,0,1
3,14000,9,1800.0,1,0,0,0,1
4,14000,11,1300.0,0,0,1,0,1


In [69]:
# StandardScaler: standardize the numeric columns in place
from sklearn.preprocessing import StandardScaler
cols = ['가격', '호수', '칼로리']
scaler = StandardScaler()
df[cols] = scaler.fit_transform(df[cols])
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,1.54247,0.848875,-0.57792,1,0,0,1,0
1,0.941508,1.940285,0.622376,0,0,1,0,1
2,0.340545,-1.333946,1.022475,0,1,0,0,1
3,0.340545,-1.333946,1.822672,1,0,0,0,1
4,0.340545,0.848875,-0.177822,0,0,1,0,1
5,-0.260417,-0.242536,0.222277,1,0,0,0,1
6,-0.260417,-0.242536,-0.177822,1,0,0,0,1
7,-0.861379,-0.242536,-1.378118,1,0,0,1,0
8,-2.1234,-0.242536,-1.378118,1,0,0,1,0


<br>

### 정규화 (MinMaxScaler)

- 모든 값을 0~1 사이 값으로 변환 (값의 범위(스케일)를 축소)

$$
x_{\text{scale}} = \frac{x - x_{\min}}{x_{\max} - x_{\min}}
$$


In [70]:
# MinMaxScaler: rescale the numeric columns in place
# NOTE: df[cols] was overwritten with standardized values by the StandardScaler
# cell, so this scaler is fitted on those values rather than on the raw ones.
# Min-max scaling a linearly rescaled column gives the same 0-1 values, so
# the result matches what the raw columns would produce.
from sklearn.preprocessing import MinMaxScaler
cols = ['가격', '호수', '칼로리']
scaler = MinMaxScaler()
df[cols] = scaler.fit_transform(df[cols])
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,1.0,0.666667,0.25,1,0,0,1,0
1,0.836066,1.0,0.625,0,0,1,0,1
2,0.672131,0.0,0.75,0,1,0,0,1
3,0.672131,0.0,1.0,1,0,0,0,1
4,0.672131,0.666667,0.375,0,0,1,0,1
5,0.508197,0.333333,0.5,1,0,0,0,1
6,0.508197,0.333333,0.375,1,0,0,0,1
7,0.344262,0.333333,0.0,1,0,0,1,0
8,0.0,0.333333,0.0,1,0,0,1,0


<br>

## 사이킷런에서 제공하는 데이터셋

### 유방암 데이터

In [71]:
# Load a scikit-learn dataset - breast cancer data
from sklearn.datasets import load_breast_cancer
dataset = load_breast_cancer()
dataset

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [72]:
# Check the feature names
dataset.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [73]:
# Check the target values
dataset.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [74]:
# Look at the first two rows of the feature data
dataset.data[:2]

array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
        3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
        8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
        3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
        1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, 1.326e+03, 8.474e-02, 7.864e-02,
        8.690e-02, 7.017e-02, 1.812e-01, 5.667e-02, 5.435e-01, 7.339e-01,
        3.398e+00, 7.408e+01, 5.225e-03, 1.308e-02, 1.860e-02, 1.340e-02,
        1.389e-02, 3.532e-03, 2.499e+01, 2.341e+01, 1.588e+02, 1.956e+03,
        1.238e-01, 1.866e-01, 2.416e-01, 1.860e-01, 2.750e-01, 8.902e-02]])

In [76]:
# Build a DataFrame from the feature data, using the feature names as column labels
cancer_df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
cancer_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [77]:
# Add the target as a 'target' column
cancer_df['target'] = dataset.target
cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [78]:
# See what sklearn.datasets provides (look for the load_* names)
import sklearn.datasets
sklearn.datasets.__all__

['clear_data_home',
 'dump_svmlight_file',
 'fetch_20newsgroups',
 'fetch_20newsgroups_vectorized',
 'fetch_file',
 'fetch_lfw_pairs',
 'fetch_lfw_people',
 'fetch_olivetti_faces',
 'fetch_species_distributions',
 'fetch_california_housing',
 'fetch_covtype',
 'fetch_rcv1',
 'fetch_kddcup99',
 'fetch_openml',
 'get_data_home',
 'load_diabetes',
 'load_digits',
 'load_files',
 'load_iris',
 'load_breast_cancer',
 'load_linnerud',
 'load_sample_image',
 'load_sample_images',
 'load_svmlight_file',
 'load_svmlight_files',
 'load_wine',
 'make_biclusters',
 'make_blobs',
 'make_circles',
 'make_classification',
 'make_checkerboard',
 'make_friedman1',
 'make_friedman2',
 'make_friedman3',
 'make_gaussian_quantiles',
 'make_hastie_10_2',
 'make_low_rank_matrix',
 'make_moons',
 'make_multilabel_classification',
 'make_regression',
 'make_s_curve',
 'make_sparse_coded_signal',
 'make_sparse_spd_matrix',
 'make_sparse_uncorrelated',
 'make_spd_matrix',
 'make_swiss_roll']

<br>

### 당뇨병 데이터

In [102]:
# Load a scikit-learn dataset - diabetes data
from sklearn.datasets import load_diabetes
dataset = load_diabetes()

In [103]:
# Build a DataFrame from the feature data, using the feature names as column labels
diabetes_df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
diabetes_df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [104]:
# Add the target as a 'target' column
diabetes_df['target'] = dataset.target
diabetes_df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


<br>

## 머신러닝: 분류

### 검증 데이터 분리

- 유방암 데이터

In [88]:
# Hold out 30% of the breast cancer rows as a test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    cancer_df.drop(columns='target'),  # features: every column except the label
    cancer_df['target'],               # label column
    test_size=0.3,
    random_state=42,                   # fixed seed so the split is repeatable
)

In [89]:
# Inspect the training features (X)
X_train.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
149,13.74,17.91,88.12,585.0,0.07944,0.06376,0.02881,0.01329,0.1473,0.0558,...,15.34,22.46,97.19,725.9,0.09711,0.1824,0.1564,0.06019,0.235,0.07014
124,13.37,16.39,86.1,553.5,0.07115,0.07325,0.08092,0.028,0.1422,0.05823,...,14.26,22.75,91.99,632.1,0.1025,0.2531,0.3308,0.08978,0.2048,0.07628
421,14.69,13.98,98.22,656.1,0.1031,0.1836,0.145,0.063,0.2086,0.07406,...,16.46,18.34,114.1,809.2,0.1312,0.3635,0.3219,0.1108,0.2827,0.09208
195,12.91,16.33,82.53,516.4,0.07941,0.05366,0.03873,0.02377,0.1829,0.05667,...,13.88,22.0,90.81,600.6,0.1097,0.1506,0.1764,0.08235,0.3024,0.06949
545,13.62,23.23,87.19,573.2,0.09246,0.06747,0.02974,0.02443,0.1664,0.05801,...,15.35,29.09,97.58,729.8,0.1216,0.1517,0.1049,0.07174,0.2642,0.06953


In [90]:
# Inspect the training target (y)
y_train.head()

Unnamed: 0,target
149,1
124,1
421,1
195,1
545,1


In [91]:
# Shapes of the four splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((398, 30), (171, 30), (398,), (171,))

<br>

### 의사결정나무 (Decision Tree)

In [94]:
# Machine learning (classification)
from sklearn.tree import DecisionTreeClassifier

# Choose the model; a fixed random_state makes the fitted tree, and therefore
# the predictions and the score, the same on every run
model = DecisionTreeClassifier(random_state=42)
# Train on the training split
model.fit(X_train, y_train)
# Predict labels for the test split
pred = model.predict(X_test)
pred

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1])

<br>

### 평가 (accuracy)

In [95]:
# Accuracy: accuracy_score(actual values, predicted values)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.9415204678362573

<br>

## 머신러닝: 회귀

### 검증 데이터 분리

- 당뇨병 데이터

In [110]:
# Hold out 30% of the diabetes rows as a test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    diabetes_df.drop(columns='target'),  # features: every column except the target
    diabetes_df['target'],               # target column
    test_size=0.3,
    random_state=42,                     # fixed seed so the split is repeatable
)

In [111]:
# Inspect the training features (X)
X_train.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
225,0.030811,0.05068,0.032595,0.049415,-0.040096,-0.043589,-0.069172,0.034309,0.063015,0.003064
412,0.074401,-0.044642,0.085408,0.063187,0.014942,0.013091,0.015505,-0.002592,0.006207,0.085907
118,-0.05637,0.05068,-0.010517,0.025315,0.023198,0.040022,-0.039719,0.034309,0.020609,0.056912
114,0.023546,-0.044642,0.110198,0.063187,0.013567,-0.032942,-0.024993,0.020655,0.099241,0.023775
364,0.001751,0.05068,-0.006206,-0.019442,-0.009825,0.004949,-0.039719,0.034309,0.014821,0.098333


In [112]:
# Inspect the training target (y)
y_train.head()

Unnamed: 0,target
225,208.0
412,261.0
118,179.0
114,258.0
364,262.0


In [113]:
# Shapes of the four splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((309, 10), (133, 10), (309,), (133,))

<br>

### 선형회귀 (LinearRegression)

In [115]:
# Machine learning (regression)
from sklearn.linear_model import LinearRegression

# Choose the model
model = LinearRegression()
# Train on the training split
model.fit(X_train, y_train)
# Predict target values for the test split
pred = model.predict(X_test)
pred

array([138.46970031, 181.10052342, 125.34400904, 292.75977277,
       123.8830531 ,  91.89830434, 257.26463123, 177.76169318,
        84.98549706, 109.15960992,  94.4855284 , 166.13235108,
        57.40742502, 206.13897354,  97.7811842 , 131.00472765,
       214.29789972, 252.52907661, 199.66656381, 219.49985634,
       203.23671317,  88.00656925,  70.65108459, 186.92233427,
       155.81266751, 162.81022205, 191.93135706, 184.72924276,
        46.62920829, 108.26566599, 178.14743952,  91.35065005,
       127.72125745, 184.04205666, 172.23799897, 189.51548863,
       121.83265708, 117.75339206, 145.67950306,  58.57563401,
        72.55421321, 107.27571105, 166.15280313, 153.29339984,
       165.19282154,  62.95752038,  73.58909449, 110.05656189,
        54.49723354, 164.88920632, 152.49472803,  63.8874565 ,
       111.4354561 , 108.34936269, 179.96973311, 158.70054112,
        95.04833555, 209.68990406, 118.37356519,  69.69946093,
       189.04680627, 204.99138626, 140.26840176, 105.75

<br>

### 평가 (MSE)

In [116]:
# Mean squared error: mean_squared_error(actual values, predicted values)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, pred)

2821.750981001311