# MLflow 튜토리얼 커스텀하게 바꿔보기

In [4]:
import os
os.getcwd()
os.chdir('C:/Users/canmanmo/Desktop/mlflow_custom')
os.getcwd()
os.listdir()

['train.py']

## 타이타닉 데이터 및 모델로 교체

In [None]:
# The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality
# P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
# Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.

import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LogisticRegression
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn

import logging

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)


def eval_metrics(actual, pred):
    score = model.score(actual, pred)
    return score


if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Read the wine-quality csv file from the URL
    csv_url = (
        "https://raw.githubusercontent.com/TeamLab/machine_learning_from_scratch_with_python/master/code/ch12/titanic/train.csv"
    )
    try:
        data = pd.read_csv(csv_url)
    except Exception as e:
        logger.exception(
            "Unable to download training & test CSV, check your internet connection. Error: %s", e
        )

    # Removing unnecessary columns 
    data.drop(columns=["Cabin", "Ticket", "Name", "Embarked"], inplace=True)
    # Change Sex from male to 1 and female to 0
    data["Sex"] = (data["Sex"].str[0]=="m").replace("m", 1).astype(int)
    # forward filling those columns which have NaN values
    data["Age"] = data["Age"].ffill()
    data["Fare"] = data["Fare"].ffill()    
    
    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "Survived" which is a scalar from [3, 9]
    train_x = train.drop(["Survived"], axis=1)
    test_x = test.drop(["Survived"], axis=1)
    train_y = train[["Survived"]]
    test_y = test[["Survived"]]
    
    max_iter = 100

    with mlflow.start_run():
        model = LogisticRegression(max_iter=max_iter, random_state=42)
        model.fit(train_x, train_y)

#         predicted_survived = model.predict(test_x)

        (score) = eval_metrics(test_x, test_y)

        print("LogisticRegression model (max_iter=%d):" % (max_iter))
        print("  Score: %s" % score)


        mlflow.log_param("max_iter", max_iter)
        mlflow.log_metric("score", score)

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        # Model registry does not work with file store
        if tracking_url_type_store != "file":

            # Register the model
            # There are other ways to use the Model Registry, which depends on the use case,
            # please refer to the doc for more information:
            # https://mlflow.org/docs/latest/model-registry.html#api-workflow
            mlflow.sklearn.log_model(model, "model", registered_model_name="LogisticTitanicModel")
        else:
            mlflow.sklearn.log_model(model, "model")

## 실행 결과

In [12]:
!python train.py

LogisticRegression model (max_iter=100):
  Score: 0.7847533632286996


## 실패 
- 하이퍼파라미터 변경해서 모델이 적용되지 않는다 
- 해결해야 함

In [None]:
!python Titanic/train.py 500

# 레퍼런스

- 구글 검색어
1. mlflow hyperparameter tuning
2. mlflow mlproject file custom

- URL
1. https://www.phdata.io/blog/bayesian-hyperparameter-optimization-with-mlflow/
2. https://stackoverflow.com/questions/57550351/mlflow-1-2-0-define-mlproject-file 
3. https://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics(sklearn 평가지표)
4. https://mlflow.org/docs/latest/tutorials-and-examples/tutorial.html(MLflow 튜토리얼)
