## 작업형 2유형 최종정리
- 작업형1 : 3문제 (30점), 데이터 전처리
- `작업형2 : 1문제 (40점), 분류/회귀 예측 모델링`
- 작업형3 : 2문제 (30점), 가설 검정

## 주요 라이브러리
- palmerpenguins : 팔머펭귄 데이터셋의 목표는 iris 데이터셋의 대안으로 데이터 탐색 및 시각화를 위한 데이터셋 제공.
- scikit-learn : 머신러닝을 위한 라이브러리
- lightgbm : LightGBM은 Microsoft에서 개발한 오픈 소스 기계 학습 라이브러리로, 대용량 데이터셋에서 빠른 속도와 높은 성능을 제공하는 것이 특징

## 주의
- 각 코드에 대한 설명은 별도로 하지 않습니다.

## 데이터 파일 불러오기

In [1]:
import pandas as pd 
from palmerpenguins import load_penguins 

penguins = load_penguins()
penguins['ID'] = penguins.reset_index().index + 1
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year,ID
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007,2
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007,3
3,Adelie,Torgersen,,,,,,2007,4
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007,5


In [2]:
cols = penguins.columns.tolist()
cols = cols[-1:] + cols[:-1]
print(cols)

['ID', 'species', 'island', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex', 'year']


- 컬럼의 순서를 변경한다. ID가 가장 먼저 오도록 한다.

In [3]:
penguins = penguins[cols]
penguins

Unnamed: 0,ID,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,4,Adelie,Torgersen,,,,,,2007
4,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...,...
339,340,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,341,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,342,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,343,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


### 데이터 가공
- 지금까지 열린 대회에서는 결측치가 존재 하지 않았던 것으로 기억
    + 만약 잘못된 정보라면 알려주세요
    + 결측치를 제거한다.

In [4]:
penguins = penguins.dropna().reset_index(drop=True)
penguins

Unnamed: 0,ID,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
4,6,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007
...,...,...,...,...,...,...,...,...,...
328,340,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
329,341,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
330,342,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
331,343,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


## 데이터셋 분리
- 기사시험과 같이 데이터셋을 만들기 위해 데이터셋을 분리하고 저장한다. 

### 회귀모형을 위한 데이터셋 정리

In [5]:
from sklearn.model_selection import train_test_split

y = penguins['body_mass_g']
X = penguins.drop(['body_mass_g'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

X_train.to_csv("penguin_reg_X_train.csv", index=False)
X_test.to_csv("penguin_reg_X_test.csv", index=False)
y_train.to_csv("penguin_reg_y_train.csv", index=False)
y_test.to_csv("penguin_reg_y_test.csv", index=False)

## 회귀모형 만들기 정리
- 기본적으로 아래 데이터셋 불러오기는 제공된다. 

In [6]:
import pandas as pd 

X_train = pd.read_csv("penguin_reg_X_train.csv")
X_test = pd.read_csv("penguin_reg_X_test.csv")
y_train = pd.read_csv("penguin_reg_y_train.csv")

## ID 제거

In [7]:
X_train_id = X_train.pop("ID")
X_test_id = X_test.pop("ID")

### 데이터 확인

In [8]:
X_train.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,sex,year
0,Adelie,Biscoe,40.5,17.9,187.0,female,2007
1,Chinstrap,Dream,49.2,18.2,195.0,male,2007
2,Chinstrap,Dream,52.8,20.0,205.0,male,2008
3,Adelie,Biscoe,37.6,17.0,185.0,female,2008
4,Gentoo,Biscoe,47.3,15.3,222.0,male,2007


In [9]:
y_train.head()

Unnamed: 0,body_mass_g
0,3200.0
1,4400.0
2,4550.0
3,3600.0
4,5250.0


In [10]:
X_test.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,sex,year
0,Adelie,Dream,39.5,16.7,178.0,female,2007
1,Chinstrap,Dream,50.9,17.9,196.0,female,2009
2,Adelie,Torgersen,42.1,19.1,195.0,male,2008
3,Gentoo,Biscoe,46.6,14.2,210.0,female,2008
4,Adelie,Biscoe,41.1,18.2,192.0,male,2008


### 결측치 확인

In [11]:
X_train.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
sex                  0
year                 0
dtype: int64

In [12]:
X_test.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
sex                  0
year                 0
dtype: int64

In [13]:
y_train.isnull().sum()

body_mass_g    0
dtype: int64

In [14]:
y_train

Unnamed: 0,body_mass_g
0,3200.0
1,4400.0
2,4550.0
3,3600.0
4,5250.0
...,...
228,4750.0
229,3900.0
230,3200.0
231,3950.0


### 컬럼 분리
- 범주형 컬럼과 숫자형 컬럼으로 분리

In [15]:
import numpy as np

cat_cols = X_train.select_dtypes(exclude = np.number).columns.tolist()
num_cols = X_train.select_dtypes(include = np.number).columns.tolist()
print(cat_cols, num_cols)

['species', 'island', 'sex'] ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'year']


- year은 num_cols에서 제거한다.

In [16]:
num_cols.remove("year")
num_cols

['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']

### 데이터셋 분리

In [17]:
from sklearn.model_selection import train_test_split
# 과적합 검증을 하기 위해 val 빼내는거임.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train['body_mass_g'], 
    stratify = X_train['sex'], 
    test_size=0.3, 
    random_state=42
)

X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

((163, 7), (70, 7), (163,), (70,))

### 모형 만들기 
- pipeline을 활용하여 모형을 만들면, 매우 쉽게 작성할 수 있다.
- OrdinalEncoder를 추가할 때는 아래와 같이 작성한다.
  + `ord_encoder` 영역만 살펴본다. 
```python
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

column_transformer = ColumnTransformer([
    ("scaler", StandardScaler(), num_cols), 
    ("ohd_encoder", OneHotEncoder(), cat_cols)
    ("ord_encoder", OrdinalEncoder(categories=[["Adelie", "Gentoo", "Chinstrap"]]), ['species'])
], remainder="passthrough")
```

In [29]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 
from sklearn.metrics import make_scorer, mean_squared_error
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform
import numpy as np

def rmse(y_tr, y_val):
    return np.sqrt(mean_squared_error(y_tr, y_val))

param_grid = {
    "clf__learning_rate": loguniform(0.0001, 0.1), 
    "clf__n_estimators" : np.arange(30, 50), 
    "clf__max_depth" : np.arange(3, 30, 2), 
    "clf__num_leaves" : np.arange(30, 50), 
    "clf__min_split_gain" : np.arange(0, 1.1, 0.1), 
    "clf__subsample" : np.arange(0.6, 1.0, 0.1)
}

column_transformer = ColumnTransformer([
    ("scaler", StandardScaler(), num_cols), 
    ("ohd_encoder", OneHotEncoder(), cat_cols)
], remainder="passthrough")

pipeline = Pipeline([
    ("preprocessing", column_transformer), 
    ("clf", LGBMRegressor(random_state=42))
])

random_search = RandomizedSearchCV(
    estimator=pipeline, 
    param_distributions = param_grid, 
    n_iter = 10, 
    scoring = make_scorer(rmse, greater_is_better=False),
    cv=5, 
    verbose=0,
    n_jobs=-1
)

random_search.fit(X_tr, y_tr)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000271 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 149
[LightGBM] [Info] Number of data points in the train set: 163, number of used features: 12
[LightGBM] [Info] Start training from score 4278.220859


### 평가확인

In [30]:
def get_score(model, X_tr, X_val, y_tr, y_val):
    tr_pred = model.predict(X_tr) # 만약 확률로 구할시, predict_proba()[:, 1]
    val_pred = model.predict(X_val)
    tr_score = rmse(y_tr, tr_pred)
    val_score = rmse(y_val, val_pred)
    return f"train: {tr_score}, validation: {val_score}"

get_score(random_search, X_tr, X_val, y_tr, y_val) #차이가 적어야 과적합이 아니다.

'train: 263.43504943212, validation: 317.5889209387227'

## 평가 제출

In [None]:
final_preds = random_search.predict(X_test)
result = pd.DataFrame({
    "ID" : X_test_id, 
    "preds": final_preds.astype("int64")
})

result.head()

In [None]:
result.to_csv("A수험번호.csv", index=False)