#### MLflow를 활용한 BPED 과제 적용 예제
##### 1. 클라이언트 & Experiments 설정
>    * 클라이언트 & Experiments 선택
##### 2. 모델 생성
>    * Experiments & Run name 설정
>    * 데이터 불러오기 & 전처리
>    * 데이터 분리
>    * 모델 파라미터 설정
>    * MLflow Run context 작성
    

<hr/>

***

In [1]:

from pprint import pprint

# 모델 학습 관련 라이브러리
import mlflow
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from mlflow.models import infer_signature
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
## Config info
# Central settings for this notebook: tracking server, experiment, and model.

# [Client server path] URI of the MLflow tracking server.
# Alternative server kept from the original note: "http://192.168.25.154:5000"
tracking_uri_path = "http://127.0.0.1:8080"

# [Experiment] Name of the MLflow experiment that runs are recorded under.
experiment_name_info = "Bped_442T06_Li_test"

# [Model] Run name, feature scaler, and XGBoost hyperparameters.
run_name_info = "xgboost_model"
scaler_info = StandardScaler()
params = {
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "max_depth": 1,
    "min_child_weight": 2,
    "subsample": 0.8,
    "gamma": 1,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
model_type_info = "regressor"

# Free-text tag attached to the run. It is derived from the scaler and the
# hyperparameters above so the tag cannot drift out of sync when they change;
# with the current settings it is 'xgboost/standardscaler/n_estimators=1000'.
training_info = (
    f"xgboost/{type(scaler_info).__name__.lower()}"
    f"/n_estimators={params['n_estimators']}"
)

#### 클라이언트 & Experiments 선택

In [6]:
# Point the MLflow client at the tracking server chosen in the config cell.
mlflow.set_tracking_uri(tracking_uri_path)


In [7]:
# Select the active experiment by name; the call is the last expression of
# the cell, so the returned Experiment is displayed as the cell output.
mlflow.set_experiment(experiment_name_info)

<Experiment: artifact_location='mlflow-artifacts:/173616641003583631', creation_time=1708588090996, experiment_id='173616641003583631', last_update_time=1708588090996, lifecycle_stage='active', name='Bped_442T06_Li_test', tags={'mlflow.note.content': '프로젝트 명 : ML Flow를 활용한 BPED 개발 Tutorial\n'
                        '개발 모델 명 : 442T06_Li Regression 모델\n'
                        '개발자 : 안광혁\n'
                        '개발 기간 : 2024.02.16 ~\n',
 'project_name': 'MLflow Tutorial test',
 'project_quarter': 'Q1-2024',
 'store_dept': 'AI사업부',
 'team': 'Data Analyasis'}>

* * *

#### 모델 생성

In [8]:
# The tracking URI and the experiment are already selected in the cells
# above, so only the run name is set here.

# Name under which the MLflow run is recorded (taken from the config cell).
run_name = run_name_info


In [9]:
## Load data

# Feature metadata: keep the rows flagged 'Y' for the 442T06_Li model and
# retain only the INDEX / FEATURE columns.  (original author note: year '23)
df_features = pd.read_json('model_features_v1.2.json')
COL_442T06_Li = df_features[df_features['442T06_Li'] == 'Y'][['INDEX', 'FEATURE']]

# Features and target are taken from the same CSV, so parse it only once.
# Y_raw is an independent copy so the two frames can still be modified
# separately, exactly as when the file was read twice.
X_raw = pd.read_csv('data/train_dataset.csv')
Y_raw = X_raw.copy()

# Independent variables (the selected feature columns) and dependent variable.
X = X_raw[COL_442T06_Li['FEATURE'].to_list()]
Y = Y_raw['442T06_Li']

In [10]:
## Build the model pipeline, train it, and record the run in MLflow

# Scaler setup: every selected feature column is passed through the scaler.
numeric_columns = COL_442T06_Li['FEATURE'].to_list()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler_info, numeric_columns),
    ],
    remainder='passthrough'
)

# Model hyperparameters come from `params` defined in the config cell.
xgbr = xgb.XGBRegressor(**params)

# Train / validation split. shuffle=False keeps the row order because the
# evaluation is done on time-series data.
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, shuffle=False)

print("=======[1]데이터 셋 구분 통과=======")

# Full pipeline: preprocessing followed by the regressor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb_regressor', xgbr)
])

# Train the model.
pipeline.fit(X_train, y_train)

print("=======[2]모델 핏 통과=======")

# Predictions for the validation set.
y_pred = pipeline.predict(X_val)

# Preprocessing-only pipeline, used to produce the transformed arrays that
# are handed to the bare regressor for the signature and the evaluation.
pipeline_for_eval = Pipeline(steps=[
    ('preprocessor', preprocessor)
])
pipeline_for_eval.fit(X_train)
X_train_val = pipeline_for_eval.transform(X_train)  # transformed training features
X_val_eval = pipeline_for_eval.transform(X_val)     # transformed validation features

print("=======[3]run context 시작=======")

# MLflow run context: everything logged inside belongs to this run.
with mlflow.start_run(run_name=run_name) as run:

    print("=======[4]log_input 시작=======")
    # Record the full feature matrix and the target vector as run datasets.
    mlflow.log_input(mlflow.data.from_numpy(np.array(X)), context='features')
    mlflow.log_input(mlflow.data.from_numpy(np.array(Y)), context='labels')

    # Log the parameters used for the model fit
    mlflow.log_params(params)

    print("=======[5]set_tag 시작=======")
    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", training_info)

    print("=======[6]infer_signature 시작=======")
    # Infer the model signature from transformed inputs and the regressor's
    # predictions. This relies on pipeline.fit above having fitted this same
    # `xgbr` instance.
    # Original author note: the signature is reflected in the Web UI and its
    # data format also affects the `data` / `targets` given to mlflow.evaluate.
    signature = infer_signature(X_train_val, xgbr.predict(X_train_val))

    print("=======[7]log_model 시작=======")
    # Log an instance of the trained model for later use.
    # NOTE: only the regressor is logged (not the preprocessing pipeline), so
    # the stored model expects already-transformed features.
    # input_example is limited to a few rows: it is meant to be a small sample
    # of valid input, and it is uploaded as input_example.json, which is the
    # request that failed in this notebook's recorded error output.
    mlflow.sklearn.log_model(
        sk_model=xgbr,
        input_example=X_val_eval[:5],
        artifact_path='model',
        signature=signature
        # registered_model_name="<name to register>"  # uncomment to register the model
    )

    print("=======[8]get_artifact_uri 시작=======")
    model_uri = mlflow.get_artifact_uri("model")
    print('model_uri :', model_uri)

    # Evaluate the logged model on the transformed validation set.
    eval_metrics = mlflow.evaluate(
        model_uri,
        data=X_val_eval,
        targets=np.array(y_val),
        model_type=model_type_info,
        evaluators=["default"]
    )

# Print the metric values themselves rather than the EvaluationResult object.
print("Evaluation Metrics:", eval_metrics.metrics)





MlflowException: API request to http://192.168.25.154:5000/api/2.0/mlflow-artifacts/artifacts/173616641003583631/590dc0f7d5a0469081fa09fa1efda24f/artifacts/model/input_example.json failed with exception HTTPConnectionPool(host='192.168.25.154', port=5000): Max retries exceeded with url: /api/2.0/mlflow-artifacts/artifacts/173616641003583631/590dc0f7d5a0469081fa09fa1efda24f/artifacts/model/input_example.json (Caused by ProtocolError('Connection aborted.', ConnectionAbortedError(10053, '현재 연결은 사용자의 호스트 시스템의 소프트웨어의 의해 중단되었습니다', None, 10053, None)))