# 1장 지도학습 모형

## 1절 데이터 분할

### 1. 홀드아웃

In [4]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the dataset once; return_X_y=True hands back the (data, target)
# arrays directly instead of calling the loader a second time.
data, target = load_breast_cancer(return_X_y=True)

# Plain hold-out split: 30% of the rows go to the test set, and the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.3, random_state=2022
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Stratified hold-out split: same sizes, but stratify=target splits the rows
# class by class so both parts keep the class ratio of `target`.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, stratify=target, test_size=0.3, random_state=2022
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(398, 30) (171, 30) (398,) (171,)
(398, 30) (171, 30) (398,) (171,)


### 2. K-fold

In [6]:
import numpy as np
from sklearn.model_selection import KFold

# Ten sample indices (0..9) to be partitioned.
X = np.arange(10)

# Five folds with every other KFold option left at its default.
kfold = KFold(n_splits=5)

# Show which indices form the training part and the evaluation part per fold.
for train_rows, test_rows in kfold.split(X):
    print("학습 :", train_rows, "평가 :", test_rows)

학습 : [2 3 4 5 6 7 8 9] 평가 : [0 1]
학습 : [0 1 4 5 6 7 8 9] 평가 : [2 3]
학습 : [0 1 2 3 6 7 8 9] 평가 : [4 5]
학습 : [0 1 2 3 4 5 8 9] 평가 : [6 7]
학습 : [0 1 2 3 4 5 6 7] 평가 : [8 9]


In [7]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Fifteen sample indices (0..14).
X = np.arange(15)
# Class labels in order: six 0s, three 1s, six 2s.
y = [label for label, count in ((0, 6), (1, 3), (2, 6)) for _ in range(count)]

# Three folds; the labels in y drive the stratification.
kfold = StratifiedKFold(n_splits=3)

# Show which indices form the training part and the evaluation part per fold.
for train_rows, test_rows in kfold.split(X, y):
    print("학습 :", train_rows, "평가 :", test_rows)

학습 : [ 2  3  4  5  7  8 11 12 13 14] 평가 : [ 0  1  6  9 10]
학습 : [ 0  1  4  5  6  8  9 10 13 14] 평가 : [ 2  3  7 11 12]
학습 : [ 0  1  2  3  6  7  9 10 11 12] 평가 : [ 4  5  8 13 14]


## 2절 성과분석

### 1. 분류 지표

In [11]:
from sklearn.metrics import confusion_matrix

# Binary classification with integer labels.
y_true = [0, 0, 0, 1, 1, 1]
y_pred = [0, 1, 0, 1, 1, 1]
cm_binary = confusion_matrix(y_true, y_pred)
print(cm_binary)

# Binary classification with string labels; `labels` fixes the row/column
# order of the matrix to A, then B.
y_true = ['A', 'A', 'A', 'B', 'B', 'B']
y_pred = ['A', 'B', 'A', 'B', 'B', 'B']
cm_named = confusion_matrix(y_true, y_pred, labels=['A', 'B'])
print(cm_named)

# Multiclass classification (labels: 0, 1, 2).
y_true = [0, 0, 0, 1, 1, 2, 2, 2, 2]
y_pred = [0, 1, 1, 1, 0, 0, 1, 2, 2]
cm_multi = confusion_matrix(y_true, y_pred)
print(cm_multi)

[[2 1]
 [0 3]]
[[2 1]
 [0 3]]
[[1 2 0]
 [1 1 0]
 [1 1 2]]


### 2. 예측 지표

In [6]:
import numpy as np
# Import only the metrics used below; a wildcard import hides where names
# come from and can silently shadow other notebook variables.
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
)

# Seed NumPy's global generator so the two random draws are reproducible.
np.random.seed(123)

# Five random "true" values, drawn first from the seeded generator.
y_true = np.random.random_sample(5)
print(y_true)

# Five random "predicted" values, drawn next from the same generator.
y_pred = np.random.random_sample(5)
print(y_pred)

# MSE (mean squared error)
mse = mean_squared_error(y_true, y_pred)
print(mse)

# MAE (mean absolute error)
mae = mean_absolute_error(y_true, y_pred)
print(mae)

# MAPE (mean absolute percentage error)
mape = mean_absolute_percentage_error(y_true, y_pred)
print(mape)

[0.69646919 0.28613933 0.22685145 0.55131477 0.71946897]
[0.42310646 0.9807642  0.68482974 0.4809319  0.39211752]
0.17581754220802784
0.36474003862364796
1.0843148337483364


## 3절 선형 모델

### 1. 사이킷런을 활용한 다중 선형 회귀분석

In [17]:
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

# Load the diabetes dataset and pull out the feature matrix and the target.
diabetes = load_diabetes()
data = diabetes.data
target = diabetes.target

# Wrap the features in a DataFrame so columns can be selected by name.
df = pd.DataFrame(data, columns=diabetes.feature_names)

# Use five of the features as predictors.
column = ['bmi', 'bp', 's1', 's2', 's3']
X = df.loc[:, column]
y = target

# Fit ordinary least squares on the selected predictors.
model = LinearRegression().fit(X, y)

# Fitted coefficients (one per predictor), then the intercept.
print(model.coef_)
print(model.intercept_)

# Score of the fitted model on the same data it was trained on.
print(model.score(X, y))

[ 608.94692667  301.1268683   990.86452444 -938.97359917 -597.46181621]
152.13348416289614
0.4772123190202695


In [21]:
from sklearn.linear_model import Ridge

# Same five predictors as the linear-regression cell; `df` and `target`
# must already exist from that cell.
column = ['bmi', 'bp', 's1', 's2', 's3']
X, y = df[column], target

# Ridge regression with regularization strength alpha=0.1.
model = Ridge(alpha=0.1).fit(X, y)

# Intercept first, then the coefficients.
print(model.intercept_)
print(model.coef_)

152.1334841628961
[ 595.99425538  339.08790294  397.33725338 -338.99514707 -406.34548455]


In [22]:
from sklearn.linear_model import Lasso

# Same five predictors as the linear-regression cell; `df` and `target`
# must already exist from that cell.
column = ['bmi', 'bp', 's1', 's2', 's3']
X, y = df[column], target

# Lasso regression with regularization strength alpha=0.5.
model = Lasso(alpha=0.5).fit(X, y)

# Intercept first, then the coefficients.
print(model.intercept_)
print(model.coef_)

152.13348416289608
[ 574.04562479  237.22854049    0.            0.         -165.17168117]


# 2장 군집 모형

## 1절 군집 평가

### 1. 실루엣계수

sklearn.metrics.silhouette_score(X, labels, ...) : 전체 개체에 대한 실루엣계수의 평균 계산  
sklearn.metrics.silhouette_samples(X, labels, ...) : 각 개체에 대한 실루엣계수 계산  

### 2. RI와 ARI

In [25]:
from sklearn.metrics.cluster import adjusted_rand_score, rand_score

# Reference grouping and the grouping being evaluated against it.
labels_true = [0, 0, 0, 1, 1, 1, 1, 2, 2]
labels_pred = [0, 0, 1, 1, 1, 1, 2, 2, 2]

# RI (Rand index) and ARI (adjusted Rand index) for the same pair of labelings.
ri = rand_score(labels_true, labels_pred)
ari = adjusted_rand_score(labels_true, labels_pred)

print(ri)
print(ari)

0.7222222222222222
0.3076923076923077


## 2절 계층적 군집분석

In [30]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import load_iris
from sklearn.metrics.cluster import adjusted_rand_score, rand_score

iris = load_iris()

# List the fields available on the loaded dataset.
print(iris.keys())

data = iris['data']
target = iris['target']

# Ward linkage (written out explicitly; it is also the estimator's default).
agg_ward = AgglomerativeClustering(n_clusters=3, linkage='ward')
pred_ward = agg_ward.fit_predict(data)

# Average linkage.
agg_avg = AgglomerativeClustering(n_clusters=3, linkage='average')
pred_avg = agg_avg.fit_predict(data)

# Complete linkage.
agg_comp = AgglomerativeClustering(n_clusters=3, linkage='complete')
pred_comp = agg_comp.fit_predict(data)

# Single linkage.
agg_sing = AgglomerativeClustering(n_clusters=3, linkage='single')
pred_sing = agg_sing.fit_predict(data)

# Fixed reporting order: ward, average, complete, single.
predictions = (pred_ward, pred_avg, pred_comp, pred_sing)

# RI of each clustering against the true iris labels.
for pred in predictions:
    print(rand_score(target, pred))

print('-' * 20)

# ARI of each clustering against the true iris labels.
for pred in predictions:
    print(adjusted_rand_score(target, pred))

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
0.8797315436241611
0.8922595078299776
0.8367785234899329
0.7766442953020134
--------------------
0.7311985567707746
0.7591987071071522
0.6422512518362898
0.5637510205230709


## 3절 k-means 군집분석

In [39]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.metrics import silhouette_samples, silhouette_score


def silhouette_report(features, labels):
    """Print the per-cluster and overall silhouette scores of one clustering.

    Args:
        features: Feature matrix the cluster labels refer to.
        labels: Cluster label assigned to each row of ``features``.

    Returns:
        A ``(per_sample, frame)`` tuple: the silhouette coefficient of every
        sample, and a DataFrame with ``labels`` and ``silhouette`` columns.
    """
    per_sample = silhouette_samples(features, labels)
    frame = pd.DataFrame({'labels': labels, 'silhouette': per_sample})
    # Mean silhouette coefficient within each cluster label.
    print(frame.groupby('labels').mean())
    # Mean silhouette coefficient over all samples.
    print(silhouette_score(features, labels))
    return per_sample, frame


iris = load_iris()
data = iris['data']

# n_init is pinned to 10, the default of the scikit-learn release that
# produced the recorded output. Leaving it unset triggers a FutureWarning
# there, and newer releases use a different default, which could change
# the resulting clusters.

# k-means clustering with k=2.
kmeans_k2 = KMeans(n_clusters=2, n_init=10, random_state=2022)
pred_k2 = kmeans_k2.fit_predict(data)

# k-means clustering with k=3.
kmeans_k3 = KMeans(n_clusters=3, n_init=10, random_state=2022)
pred_k3 = kmeans_k3.fit_predict(data)

# k-means clustering with k=4.
kmeans_k4 = KMeans(n_clusters=4, n_init=10, random_state=2022)
pred_k4 = kmeans_k4.fit_predict(data)

# Silhouette report for k=2, k=3 and k=4, in that order.
sil_k2, df_k2 = silhouette_report(data, pred_k2)
sil_k3, df_k3 = silhouette_report(data, pred_k3)
sil_k4, df_k4 = silhouette_report(data, pred_k4)

        silhouette
labels            
0         0.769526
1         0.632701
0.6810461692117462
        silhouette
labels            
0         0.798140
1         0.417320
2         0.451105
0.5528190123564095
        silhouette
labels            
0         0.419518
1         0.763176
2         0.321324
3         0.362998
0.49805050499728726


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
