In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from google.colab import drive
# Mount Google Drive so the CSV files stored under MyDrive can be read.
drive.mount('/content/drive')

# Folder (with trailing slash) that holds the Titanic CSV files.
DATA_PATH = "/content/drive/MyDrive/data/"

# Seed value
SEED = 42

# Load the data: resolve each CSV path first, then read it.
train_csv = f"{DATA_PATH}titanic_train.csv"
test_csv = f"{DATA_PATH}titanic_test.csv"
train = pd.read_csv(train_csv)  # training data
test = pd.read_csv(test_csv)  # test data

# Missing-value handling.
# Every fill value is derived from the training split only, so the test split
# never influences the imputation statistics.
age_mean = train["age"].mean()
fare_median = train["fare"].median()
cabin_unk = "UNK"  # placeholder label for an unknown cabin
embarked_mode = train["embarked"].mode()[0]  # first of the most frequent values

# Column -> value used to replace missing entries in that column.
fill_values = {
    "age": age_mean,
    "fare": fare_median,
    "cabin": cabin_unk,
    "embarked": embarked_mode,
}

# Apply the same four fills to BOTH splits. Previously "fare" and "embarked"
# were filled on the test split only, so any gap in those training columns
# would have been handled differently from the test data.
for frame in (train, test):
    for column, fill_value in fill_values.items():
        frame[column] = frame[column].fillna(fill_value)

# Select the columns used as model features.
feature_cols = ["age","sibsp","parch","fare","pclass","gender","embarked"]
train_ft = train[feature_cols].copy()
test_ft = test[feature_cols].copy()

# One-hot encode the categorical columns.
# The encoder is fitted on the training features only and reused for the test
# features; it is created with handle_unknown='ignore'.
cols = ['gender','embarked']
enc = OneHotEncoder(handle_unknown = 'ignore')
enc.fit(train_ft[cols])


def _one_hot(frame):
    """Return a copy of `frame` with the `cols` columns replaced by one-hot columns.

    The encoded columns are built with `frame`'s own index so that the
    column-wise concat lines rows up by label; without it the encoded frame
    would get a fresh 0..n-1 index and silently misalign whenever `frame`
    is not indexed 0..n-1.

    Parameters
    ----------
    frame : pd.DataFrame
        Feature frame that contains every column listed in `cols`.

    Returns
    -------
    pd.DataFrame
        The remaining columns of `frame` followed by the encoder's output
        columns, named by `enc.get_feature_names_out()`.
    """
    encoded = pd.DataFrame(
        enc.transform(frame[cols]).toarray(),
        columns = enc.get_feature_names_out(),
        index = frame.index,
    )
    return pd.concat([frame.drop(columns=cols), encoded], axis=1)


train_ft = _one_hot(train_ft)
test_ft = _one_hot(test_ft)

# Min-Max scaling: fit the scaler on the training features, then apply that
# same fitted scaler to both feature frames.
scaler = MinMaxScaler().fit(train_ft)
for features in (train_ft, test_ft):
    features[features.columns] = scaler.transform(features)

# Target (label) data
target = train["survived"]

# Display the shapes of the two feature frames and of the target.
tuple(obj.shape for obj in (train_ft, test_ft, target))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


((916, 10), (393, 10), (916,))

# AutoML(Automated machine learning)
- 시간 소모적이고 반복적인 기계 학습 모델 개발 작업을 자동화하는 프로세스
- 데이터 과학자, 분석가 및 개발자는 모델 품질을 유지하면서 확장성, 효율성 및 생산성이 높은 ML 모델을 빌드할 수 있다.

## FLAML(A Fast Library for Automated Machine Learning & Tuning)
- 머신러닝 학습 및 하이퍼파라미터 튜닝을 자동화해 주는 라이브러리
- https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML
- flaml 설치하기
    ```bash
    pip install flaml
    ```

In [3]:
%pip install flaml

Collecting flaml
  Downloading FLAML-2.3.1-py3-none-any.whl.metadata (16 kB)
Downloading FLAML-2.3.1-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.3/313.3 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: flaml
Successfully installed flaml-2.3.1


- `AutoML` 클래스의 `fit` 메서드 주요 파라미터
    - metric
        - 평가지표
        - ex) 'roc_auc'
    - task
        - 작업 유형
        - ex) 'classification'
    - estimator_list
        - FLAML에서 제공해주는 모델들의 별칭을 리스트에 넣어주면 된다.
        - 생략시 자동으로 모델들이 선택된다.
        - ex) ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
    - time_budget
        - 탐색에 허용할 최대 시간(시간 예산)
            - 초 단위
    - ensemble
        - 튜닝 후 스태킹 앙상블 여부
        - False(기본값) : 최상의 모델을 선택해서 학습
        - True: 스태킹하여 앙상블
        - `dict` 예시
            - {'final_estimator' : 사이킷런 모델 객체 }
    - n_splits
        - 교차검증 폴드 수(기본 5)
    - seed
        - 시드값
    - early_stop
        - True or False(기본값)
        - 튜닝이 수렴할 경우 조기 중지여부

In [6]:
from flaml import AutoML

# Search settings, forwarded to AutoML.fit as keyword arguments.
params = dict(
    metric="roc_auc",        # evaluation metric
    task="classification",   # type of task
    time_budget=180,         # in seconds
    seed=SEED,               # seed value
    early_stop=True,         # stop early once the tuning has converged
)

auto_ml = AutoML()
auto_ml.fit(train_ft, target, **params)

[flaml.automl.logger: 10-30 10:11:34] {1728} INFO - task = classification
[flaml.automl.logger: 10-30 10:11:34] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 10-30 10:11:34] {1838} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 10-30 10:11:34] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'lrl1']
[flaml.automl.logger: 10-30 10:11:34] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 10-30 10:11:34] {2393} INFO - Estimated sufficient time budget=4317s. Estimated necessary time budget=100s.
[flaml.automl.logger: 10-30 10:11:34] {2442} INFO -  at 0.5s,	estimator lgbm's best error=0.1071,	best estimator lgbm's best error=0.1071
[flaml.automl.logger: 10-30 10:11:34] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 10-30 10:11:35] {2442} INFO -  at 1.6s,	estimator lgbm's best error=0.1071,	best estimator lgbm's best error=0.1071
[flaml.automl.logger: 10-30

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 10-30 10:11:36] {2442} INFO -  at 2.3s,	estimator sgd's best error=0.1151,	best estimator lgbm's best error=0.1071
[flaml.automl.logger: 10-30 10:11:36] {2258} INFO - iteration 3, current learner sgd
[flaml.automl.logger: 10-30 10:11:36] {2442} INFO -  at 2.5s,	estimator sgd's best error=0.1151,	best estimator lgbm's best error=0.1071
[flaml.automl.logger: 10-30 10:11:36] {2258} INFO - iteration 4, current learner lgbm
[flaml.automl.logger: 10-30 10:11:37] {2442} INFO -  at 3.8s,	estimator lgbm's best error=0.0964,	best estimator lgbm's best error=0.0964
[flaml.automl.logger: 10-30 10:11:37] {2258} INFO - iteration 5, current learner sgd
[flaml.automl.logger: 10-30 10:11:38] {2442} INFO -  at 3.9s,	estimator sgd's best error=0.1128,	best estimator lgbm's best error=0.0964
[flaml.automl.logger: 10-30 10:11:38] {2258} INFO - iteration 6, current learner xgboost
[flaml.automl.logger: 10-30 10:11:38] {2442} INFO -  at 4.7s,	estimator xgboost's best error=0.1109,	best 

- 선택된 모델 객체 확인

In [7]:
# Display the estimator object of the model selected by the search.
auto_ml.model.estimator

- 튜닝된 하이퍼파라미터

In [8]:
# Display the tuned hyperparameters of the selected model.
auto_ml.best_config

{'n_estimators': 37,
 'max_leaves': 18,
 'min_child_weight': 0.04366188742154608,
 'learning_rate': 0.0049612654242581775,
 'subsample': 0.6834574344907128,
 'colsample_bylevel': 0.9421179471135136,
 'colsample_bytree': 0.787068707168211,
 'reg_alpha': 0.0009765625,
 'reg_lambda': 0.3588430756461608}

- 선택된 모델의 cv 점수 확인하기

In [9]:
# CV score of the selected model: the fit log reports the minimized error as
# 1-roc_auc, so subtracting best_loss from 1 gives back the ROC AUC.
1-auto_ml.best_loss

0.9118604482220043

- 예측

In [19]:
# Predict class probabilities for the test features and keep only the second
# column (presumably the probability of survived == 1 - confirm class order).
test_proba = auto_ml.predict_proba(test_ft)
pred = test_proba[:,1]
pred.shape

(393,)

- 앙상블 해보기

In [10]:
# Search settings with stacking enabled.
params = dict(
    metric="roc_auc",
    task="classification",
    time_budget=180,
    seed=SEED,
    early_stop=True,
    # Build a stacking ensemble after tuning.
    # NOTE(review): the original note here said the meta-model is logistic
    # regression; that cannot be verified from this notebook - confirm which
    # final estimator FLAML uses when ensemble=True.
    ensemble=True,
)

auto_ml_ens = AutoML()
auto_ml_ens.fit(train_ft, target, **params)

[flaml.automl.logger: 10-30 10:16:51] {1728} INFO - task = classification
[flaml.automl.logger: 10-30 10:16:51] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 10-30 10:16:51] {1838} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 10-30 10:16:51] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'lrl1']
[flaml.automl.logger: 10-30 10:16:51] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 10-30 10:16:51] {2393} INFO - Estimated sufficient time budget=1755s. Estimated necessary time budget=41s.
[flaml.automl.logger: 10-30 10:16:51] {2442} INFO -  at 0.2s,	estimator lgbm's best error=0.1071,	best estimator lgbm's best error=0.1071
[flaml.automl.logger: 10-30 10:16:51] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 10-30 10:16:51] {2442} INFO -  at 0.5s,	estimator lgbm's best error=0.1071,	best estimator lgbm's best error=0.1071
[flaml.automl.logger: 10-30 

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 10-30 10:16:52] {2442} INFO -  at 0.9s,	estimator sgd's best error=0.1150,	best estimator lgbm's best error=0.1071
[flaml.automl.logger: 10-30 10:16:52] {2258} INFO - iteration 3, current learner sgd
[flaml.automl.logger: 10-30 10:16:52] {2442} INFO -  at 1.1s,	estimator sgd's best error=0.1150,	best estimator lgbm's best error=0.1071
[flaml.automl.logger: 10-30 10:16:52] {2258} INFO - iteration 4, current learner lgbm
[flaml.automl.logger: 10-30 10:16:53] {2442} INFO -  at 1.8s,	estimator lgbm's best error=0.0964,	best estimator lgbm's best error=0.0964
[flaml.automl.logger: 10-30 10:16:53] {2258} INFO - iteration 5, current learner xgboost
[flaml.automl.logger: 10-30 10:16:54] {2442} INFO -  at 3.1s,	estimator xgboost's best error=0.1109,	best estimator lgbm's best error=0.0964
[flaml.automl.logger: 10-30 10:16:54] {2258} INFO - iteration 6, current learner extra_tree
[flaml.automl.logger: 10-30 10:16:54] {2442} INFO -  at 3.3s,	estimator extra_tree's best error

In [11]:
# Predict class probabilities with the ensemble and keep only the second column.
ens_proba = auto_ml_ens.predict_proba(test_ft)
pred_ens = ens_proba[:,1]
pred_ens.shape

(393,)

In [16]:
from lightgbm import LGBMClassifier

# Meta-model (final estimator) used when overriding the stacker's default.
meta_model = LGBMClassifier(random_state=SEED)

# Note: this run optimizes accuracy and uses a 1800-second budget.
params = {
    "metric": "accuracy",
    "task": "classification",
    "time_budget": 1800,
    "seed": SEED,
    "early_stop": True,
    "ensemble": {'final_estimator': meta_model},
}

auto_ml_ens = AutoML()
auto_ml_ens.fit(train_ft, target, **params)

[flaml.automl.logger: 10-30 10:50:36] {1728} INFO - task = classification
[flaml.automl.logger: 10-30 10:50:36] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 10-30 10:50:36] {1838} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 10-30 10:50:36] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'catboost', 'lrl1']
[flaml.automl.logger: 10-30 10:50:36] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 10-30 10:50:36] {2393} INFO - Estimated sufficient time budget=570s. Estimated necessary time budget=14s.
[flaml.automl.logger: 10-30 10:50:36] {2442} INFO -  at 0.1s,	estimator lgbm's best error=0.1910,	best estimator lgbm's best error=0.1910
[flaml.automl.logger: 10-30 10:50:36] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 10-30 10:50:36] {2442} INFO -  at 0.1s,	estimator lgbm's best error=0.1910,	best estimator lgbm's best error=0.1910
[flaml.automl.lo

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 10-30 10:50:36] {2442} INFO -  at 0.5s,	estimator sgd's best error=0.1463,	best estimator sgd's best error=0.1463
[flaml.automl.logger: 10-30 10:50:36] {2258} INFO - iteration 4, current learner xgboost
[flaml.automl.logger: 10-30 10:50:36] {2442} INFO -  at 0.6s,	estimator xgboost's best error=0.1910,	best estimator sgd's best error=0.1463
[flaml.automl.logger: 10-30 10:50:36] {2258} INFO - iteration 5, current learner lgbm
[flaml.automl.logger: 10-30 10:50:37] {2442} INFO -  at 0.7s,	estimator lgbm's best error=0.1321,	best estimator lgbm's best error=0.1321
[flaml.automl.logger: 10-30 10:50:37] {2258} INFO - iteration 6, current learner lgbm
[flaml.automl.logger: 10-30 10:50:37] {2442} INFO -  at 0.7s,	estimator lgbm's best error=0.1321,	best estimator lgbm's best error=0.1321
[flaml.automl.logger: 10-30 10:50:37] {2258} INFO - iteration 7, current learner lgbm
[flaml.automl.logger: 10-30 10:50:37] {2442} INFO -  at 0.8s,	estimator lgbm's best error=0.1321,	bes

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[flaml.automl.logger: 10-30 11:09:13] {2258} INFO - iteration 3549, current learner sgd
[flaml.automl.logger: 10-30 11:09:13] {2442} INFO -  at 1117.1s,	estimator sgd's best error=0.1408,	best estimator rf's best error=0.1245
[flaml.automl.logger: 10-30 11:09:13] {2258} INFO - iteration 3550, current learner catboost
[flaml.automl.logger: 10-30 11:09:14] {2442} INFO -  at 1117.8s,	estimator catboost's best error=0.1343,	best estimator rf's best error=0.1245
[flaml.automl.logger: 10-30 11:09:14] {2258} INFO - iteration 3551, current learner sgd
[flaml.automl.logger: 10-30 11:09:14] {2442} INFO -  at 1117.9s,	estimator sgd's best error=0.1408,	best estimator rf's best error=0.1245
[flaml.automl.logger: 10-30 11:09:14] {2258} INFO - iteration 3552, current learner sgd
[flaml.automl.logger: 10-30 11:09:14] {2442} INFO -  at 1118.1s,	estimator sgd's best error=0.1408,	best estimator rf's best error=0.1245
[flaml.automl.logger: 10-30 11:09:14

In [13]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [23]:
# Learners (FLAML aliases) the search is restricted to, CatBoost included.
learners = ['catboost', 'lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth']

# Stacking run over the explicit learner list with a 60-second budget.
params = dict(
    metric="roc_auc",
    task="classification",
    time_budget=60,
    seed=SEED,
    ensemble=True,
    early_stop=True,
    estimator_list=learners,
)

auto_ml_ens = AutoML()
auto_ml_ens.fit(train_ft, target, **params)

[flaml.automl.logger: 10-30 11:25:00] {1728} INFO - task = classification
[flaml.automl.logger: 10-30 11:25:00] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 10-30 11:25:00] {1838} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 10-30 11:25:00] {1955} INFO - List of ML learners in AutoML Run: ['catboost', 'lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl.logger: 10-30 11:25:00] {2258} INFO - iteration 0, current learner catboost
[flaml.automl.logger: 10-30 11:25:00] {2393} INFO - Estimated sufficient time budget=6049s. Estimated necessary time budget=6s.
[flaml.automl.logger: 10-30 11:25:00] {2442} INFO -  at 0.7s,	estimator catboost's best error=0.0975,	best estimator catboost's best error=0.0975
[flaml.automl.logger: 10-30 11:25:00] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 10-30 11:25:01] {2442} INFO -  at 1.3s,	estimator lgbm's best error=0.1071,	best estimator catboost's best error=0.0975
[flaml.automl.lo