# ◈ 데이터 분석 준비

## 1) Colab 환경 설정

In [None]:
# Install the Nanum fonts used for plot text and rebuild the system font cache
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
# Delete matplotlib's cache directory (presumably so the new fonts are picked up on the next import)
!rm ~/.cache/matplotlib -rf

# Treemap plots
!pip install squarify

# Factor analysis
!pip install pingouin factor_analyzer

# Association analysis
!pip install -U apyori mlxtend

## 2) 필수 라이브러리 로드

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import multiprocess as mp

from scipy import stats
from patsy import demo_data, dmatrix, dmatrices

## 3) 글로벌 환경 설정

In [None]:
#%precision 3 # fix the displayed numeric precision to 3 decimal places
%matplotlib inline
# Apply the seaborn theme first; the rcParams assignments below are set after it.
sns.set_theme()

# plt.rc('font', family='NanumBarunGothic')
# Disable the Unicode minus glyph on axes (presumably because the Korean font lacks it — confirm).
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.size'] = 12
plt.rcParams['figure.figsize'] = (10, 5)

# Pastel palette and a list of line styles, presumably for reuse in later plots.
sns_color = sns.color_palette('pastel')
plt_line = ['-', '--', ':', '-.']

## 4) 분석 데이터 로드

In [None]:
from sklearn.model_selection import train_test_split

# load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, where the
# import raises ImportError. Fall back to rebuilding the same dataset from its
# original source so this cell keeps working on current Colab images.
try:
    from sklearn.datasets import load_boston
except ImportError:
    from sklearn.utils import Bunch

    def load_boston():
        """Return the Boston housing data as a Bunch (data, target, feature_names).

        Drop-in stand-in for the removed ``sklearn.datasets.load_boston``.
        Each record in the source file spans two physical lines: 11 values on
        the first, then 2 more features and the target on the second.
        Requires network access.
        """
        raw = pd.read_csv(
            "http://lib.stat.cmu.edu/datasets/boston",
            sep=r"\s+",
            skiprows=22,  # skip the free-text header of the source file
            header=None,
        ).values
        return Bunch(
            data=np.hstack([raw[::2, :], raw[1::2, :2]]),
            target=raw[1::2, 2],
            feature_names=np.array([
                'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
            ]),
        )

boston = load_boston()
X = boston.data
y = boston.target

# 75% train / 25% test; fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.75, random_state=13)

# ◈ sklearn datasets
|데이터명|명령어|feature|target|row|
|------|------|------|------|------|
|보스톤 집값 데이터|load_boston|13|number|506|
|당뇨병 환자 데이터|load_diabetes|10|number|442|
|아이리스 붓꽃 데이터|load_iris|4|class(3)|150|
|손글씨 데이터|load_digits|64|class(10)|1797|
|와인 데이터|load_wine|13|class(3)|178|
|위스콘신 유방암 환자 데이터|load_breast_cancer|30|class(2)|569|
|multi-output regression 용|load_linnerud|3|number(3)|20|

# ◈ scikit-learn 주요 모듈
| 모듈 | 설명 |
|------|------|
| `sklearn.datasets` | 내장된 예제 데이터 세트 |
| `sklearn.preprocessing` | 다양한 데이터 전처리 기능 제공 (변환, 정규화, 스케일링 등) |
| `sklearn.feature_selection` | 특징(feature)를 선택할 수 있는 기능 제공 | 
| `sklearn.feature_extraction` | 특징(feature) 추출에 사용 |
| `sklearn.decomposition` | 차원 축소 관련 알고리즘 지원 (PCA, NMF, Truncated SVD 등) |
| `sklearn.model_selection` | 교차 검증을 위해 데이터를 학습/테스트용으로 분리, 최적 파라미터를 추출하는 API 제공 (GridSearch 등) |
| `sklearn.metrics` | 분류, 회귀, 클러스터링, Pairwise에 대한 다양한 성능 측정 방법 제공 (Accuracy, Precision, Recall, ROC-AUC, RMSE 등) |
| `sklearn.pipeline` | 특징 처리 등의 변환과 ML 알고리즘 학습, 예측 등을 묶어서 실행할 수 있는 유틸리티 제공 |
| `sklearn.linear_model` | 선형 회귀, 릿지(Ridge), 라쏘(Lasso), 로지스틱 회귀 등 회귀 관련 알고리즘과 SGD(Stochastic Gradient Descent) 알고리즘 제공 |
| `sklearn.svm` | 서포트 벡터 머신 알고리즘 제공 |
| `sklearn.neighbors` | 최근접 이웃 알고리즘 제공 (k-NN 등) |
| `sklearn.naive_bayes` | 나이브 베이즈 알고리즘 제공 (가우시안 NB, 다항 분포 NB 등) |
| `sklearn.tree` | 의사 결정 트리 알고리즘 제공 |
| `sklearn.ensemble` | 앙상블 알고리즘 제공 (Random Forest, AdaBoost, GradientBoost 등) |
| `sklearn.cluster` | 비지도 클러스터링 알고리즘 제공 (k-Means, 계층형 클러스터링, DBSCAN 등) |

# ◈ sklearn 고급 기능

## 1) Batch Work

### 가) make_pipeline
- 전처리, 모델 생성, 학습을 한 번에 처리할 수 있는 `Pipeline` 객체를 만들어 주는 함수

In [None]:
from sklearn import set_config
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import Ridge

# Chain median imputation -> standardization -> ridge regression into a
# single estimator, then fit the whole chain on the training split.
model = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
    Ridge(alpha=1),
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# score() is called on the pipeline for the train and the test split.
r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print(f'학습 정확도: {r2_train:.2f}')
print(f'예측 정확도: {r2_test:.2f}')

# Render estimators as diagrams; the trailing expression displays the pipeline.
set_config(display="diagram")
model

학습 정확도: 0.74
예측 정확도: 0.71


### 나) make_column_transformer

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder

# The toy frame gets its own name: rebinding X here would replace the Boston
# feature matrix that the cross-validation cells below pass together with y.
df_books = pd.DataFrame({
    'city': ['London', 'London', 'Paris', 'Sallisaw'],
    'title': ["His Last Bow", "How Watson Learned the Trick", "A Moveable Feast", "The Grapes of Wrath"],
    'expert_rating': [5, 3, 4, 5],
    'user_rating': [4, 5, 4, 3]})
df_books

# Ordinal-encode the text columns and standardize the rating columns; any
# column not listed would go through the `remainder` transformer.
encorder = make_column_transformer(
    (OrdinalEncoder(), ['city', 'title']),
    (StandardScaler(), ['expert_rating', 'user_rating']),
    remainder=StandardScaler()
    )

encorder.fit_transform(df_books)

## 2) Cross Validation
- Scoring
  - Category : 'accuracy', 'f1_micro', 'f1_weighted'
  - Numeric : 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_root_mean_squared_error', 'r2'

### 가) cross_validate
- 여러 Score 리턴

In [None]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of `model` on X and y, collecting several metrics
# per fold.
cv_score = cross_validate(
    model,
    X,
    y,
    cv=5,
    scoring=('neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'),
    # return_train_score=True,  # enable to also report training-fold scores
    n_jobs=-1,  # -1 = all processors; no need for multiprocess.cpu_count()
)

# One row per fold: fit/score times plus a test_<metric> column per scorer.
pd.DataFrame(data=cv_score)

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_r2
0,0.004967,0.001732,-2.60377,-12.313831,0.643441
1,0.003392,0.001071,-3.882676,-25.810719,0.71648
2,0.004111,0.001712,-4.380231,-33.004923,0.587888
3,0.003845,0.001617,-5.555077,-80.50284,0.08219
4,0.003694,0.005278,-4.729004,-32.884796,-0.236814


### 나) cross_val_score
- 하나의 Score만 리턴

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation returning only the R^2 score of each fold.
cv_score = cross_val_score(
    model,
    X,
    y,
    cv=5,
    scoring='r2',
    n_jobs=-1,  # -1 = all processors; no need for multiprocess.cpu_count()
)

cv_score

array([ 0.64344111,  0.71648023,  0.58788768,  0.08218971, -0.23681375])

### 다) GridSearchCV
- 교차검증 & 하이퍼파라미터 최적화 튜닝

In [None]:
# Baseline: Ridge with its default hyperparameters, for comparison with the
# grid-searched model in the following cells.
model = Ridge()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print(f'학습 정확도: {r2_train:.2f}')
print(f'예측 정확도: {r2_test:.2f}')

학습 정확도: 0.74
예측 정확도: 0.70


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# Candidate regularization strengths to compare.
params = {'alpha': [0.5, 0.8, 1, 1.2, 1.5]}
model = Ridge()

# 3-fold CV over the alpha grid, ranked by R^2; refit=True is passed so the
# fitted search can predict and score directly.
gridsc = GridSearchCV(model, param_grid=params, cv=3, scoring='r2', refit=True)
gridsc.fit(X_train, y_train)

y_pred = gridsc.predict(X_test)

r2_train = gridsc.score(X_train, y_train)
r2_test = gridsc.score(X_test, y_test)
print(f'학습 정확도: {r2_train:.2f}')
print(f'예측 정확도: {r2_test:.2f}')

학습 정확도: 0.74
예측 정확도: 0.71


- GridSearchCV 정보 출력

In [None]:
# Summarize the search: winning parameters and the best score.
print(f'best params : {gridsc.best_params_}')
print(f'best score : {gridsc.best_score_}')

# Full per-candidate results; show everything from the sixth column onward
# (presumably skipping the timing and per-parameter columns — confirm).
df_score = pd.DataFrame(gridsc.cv_results_)
df_score[df_score.columns[5:]]

best params : {'alpha': 0.5}
best score : 0.6768345888634606


Unnamed: 0,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,{'alpha': 0.5},0.633995,0.710867,0.685642,0.676835,0.031995,1
1,{'alpha': 0.8},0.632428,0.711785,0.68554,0.676585,0.03301,2
2,{'alpha': 1},0.63176,0.712078,0.685569,0.676469,0.033415,3
3,{'alpha': 1.2},0.631282,0.712226,0.685653,0.676387,0.033688,4
4,{'alpha': 1.5},0.630808,0.71228,0.685853,0.676314,0.033938,5


- Best Estimator 사용

In [None]:
# Take the winning estimator from the grid search and use it directly.
model = gridsc.best_estimator_

y_pred = model.predict(X_test)

r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print(f'학습 정확도: {r2_train:.2f}')
print(f'예측 정확도: {r2_test:.2f}')

학습 정확도: 0.74
예측 정확도: 0.71


- make_pipeline과 함께 사용

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# Candidate regularization strengths to compare.
params = {'alpha': [0.5, 0.8, 1, 1.2, 1.5]}

# The grid search is the final pipeline step, so it tunes Ridge on data that
# has already been imputed and standardized.
# NOTE(review): the imputer and scaler are fitted on all of X_train before the
# search's inner CV splits, so validation folds influence the preprocessing.
# Wrapping the whole pipeline in GridSearchCV would avoid that — confirm
# whether it matters for this demo.
ridge_search = GridSearchCV(Ridge(), param_grid=params, cv=3, scoring='r2', refit=True)
model = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
    ridge_search,
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print(f'학습 정확도: {r2_train:.2f}')
print(f'예측 정확도: {r2_test:.2f}')

# Step index 2 is the fitted GridSearchCV (imputer = 0, scaler = 1).
print(f'best params : {model[2].best_params_}')
print(f'best score : {model[2].best_score_}')

학습 정확도: 0.74
예측 정확도: 0.71
best params : {'alpha': 1.5}
best score : 0.6813947182664787
