# Modeler and Model with Iris dataset

次のtutorialを実行する。
https://h1st.readthedocs.io/en/latest/tutorials/examples/modeler-model.html

このチュートリアルでは、例としてアイリス・データセットを使用して、ModelerとModelの作り方を学びます。

まず、アイリス・データセットをロードするために、load_dataメソッドを持つMLModelerを作ります。そして、LogisticRegressionを学習するために学習データを作ります。H1stフレームワークはload_dataをコールして、あなたが定義した対応するMLModelを訓練するためのメソッドをビルドします。

This tutorial shows how you can create Modeler and Model using the iris dataset as an example. Firstly, let’s create an MLModeler with load_data to load the iris dataset and generate training data to train a LogisticRegression base model in train. The h1st framework provides the build method which calls load_data and train and produces the corresponding MLModel which you needs to define.

In [1]:
import os
from typing import Any, Dict
import tempfile
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from h1st.model.model import Model
from h1st.model.ml_modeler import MLModeler
from h1st.model.ml_model import MLModel

class MyMLModeler(MLModeler):
    def __init__(self):
        self.stats = {}
        self.example_test_data_ratio = 0.2

    def load_data(self) -> Dict:
        df_raw = datasets.load_iris(as_frame=True).frame
        return self.generate_training_data({'df_raw': df_raw})

    def preprocess(self, data):
        self.stats['scaler'] = StandardScaler()
        return self.stats['scaler'].fit_transform(data)

    def generate_training_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        df_raw = data['df_raw']
        df_raw.columns = ['sepal_length','sepal_width','petal_length','petal_width', 'species']

        self.stats['targets'] = ['Setosa', 'Versicolour', 'Virginica']
        self.stats['targets_dict'] = {k: v for v, k in enumerate(self.stats['targets'])}

        # Shuffle all the df_raw
        df_raw = df_raw.sample(frac=1, random_state=5).reset_index(drop=True)

        # Preprocess data
        df_raw.loc[:, 'sepal_length':'petal_width'] = self.preprocess(
            df_raw.loc[:, 'sepal_length':'petal_width'])

        # Split to training and testing data
        n = df_raw.shape[0]
        n_test = int(n * self.example_test_data_ratio)
        training_data = df_raw.iloc[n_test:, :].reset_index(drop=True)
        test_data = df_raw.iloc[:n_test, :].reset_index(drop=True)

        # Split the data to features and labels
        train_data_x = training_data.loc[:, 'sepal_length':'petal_width']
        train_data_y = training_data['species']
        test_data_x = test_data.loc[:, 'sepal_length':'petal_width']
        test_data_y = test_data['species']

        # When returning many variables, it is a good practice to give them names:
        return {
            'train_x':train_data_x,
            'train_y':train_data_y,
            'test_x':test_data_x,
            'test_y':test_data_y,
        }

    def train_base_model(self, data: Dict[str, Any]) -> Any:
        X, y = data['train_x'], data['train_y']
        model = LogisticRegression(random_state=0)
        model.fit(X, y)
        return model

    def evaluate_model(self, data: Dict, model: MLModel) -> Dict:
        super().evaluate_model(data, model)
        X, y_true = data['test_x'], data['test_y']
        y_pred = pd.Series(model.predict({'X': X, 'y': y_true})['species']).map(model.stats['targets_dict'])
        return {'micro_f1_score': f1_score(y_true, y_pred, average='micro')}


  from .autonotebook import tqdm as notebook_tqdm
2023-02-17 17:14:02.851947: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-17 17:14:03.333955: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-17 17:14:03.334006: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-17 17:14:04.274054: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open s

In [3]:
# このセルはコード理解のために追加したもの

# MyMLModelerの動きをテストする
m = MyMLModeler()
iris_data = m.load_data()

# irisの元データは150個だが、ここで既に学習用120、テスト用30に分割されている。
print(type(iris_data))
print(iris_data['train_x'].shape)
print(iris_data['train_y'].shape)
print(iris_data['test_x'].shape)
print(iris_data['test_y'].shape)

# 全体を表示してみる
print('---------------------------')
print(iris_data['train_x'])
print('---------------------------')
print(iris_data['train_y'])
print('---------------------------')
print(iris_data['test_x'])
print('---------------------------')
print(iris_data['test_y'])
print('---------------------------')
print(iris_data)

<class 'dict'>
(120, 4)
(120,)
(30, 4)
(30,)
---------------------------
     sepal_length  sepal_width  petal_length  petal_width
0       -0.900681     0.788808     -1.283389    -1.315444
1       -0.416010    -1.743357      0.137547     0.132510
2       -0.173674    -1.052767     -0.146641    -0.262387
3       -0.537178     1.479398     -1.283389    -1.315444
4        1.401508     0.328414      0.535409     0.264142
..            ...          ...           ...          ...
115     -1.748856    -0.362176     -1.340227    -1.315444
116      0.310998    -0.592373      0.535409     0.000878
117      1.038005     0.558611      1.103783     1.712096
118      2.249683    -1.052767      1.785832     1.448832
119     -0.173674    -0.592373      0.194384     0.132510

[120 rows x 4 columns]
---------------------------
0      0
1      1
2      1
3      0
4      1
      ..
115    0
116    1
117    2
118    2
119    1
Name: species, Length: 120, dtype: int64
---------------------------
    sepal_l

In [5]:
class MyMLModel(MLModel):
    def preprocess(self, data: Dict[str, Any]) -> Dict[str, Any]:
        raw_data = data['X']
        return {
            'X': self.stats['scaler'].transform(raw_data)
        }

    def predict(self, input_data: dict) -> dict:
        preprocess_data = self.preprocess(input_data)
        y = self.base_model.predict(preprocess_data['X'])
        return {'species': [self.stats['targets'][item] for item in y]}

In [6]:
my_ml_modeler = MyMLModeler()
my_ml_modeler.model_class = MyMLModel

my_ml_model = my_ml_modeler.build_model()

print(my_ml_model.metrics)

{'micro_f1_score': 0.3}


X does not have valid feature names, but LogisticRegression was fitted with feature names


In [9]:
# 学習済みモデルに、2セットの特徴量を渡して分類を試みている
prediction = my_ml_model.predict({
    'X': pd.DataFrame(
        [[5.1, 3.5, 1.5, 0.2],
        [7.1, 3.5, 1.5, 0.6]],
        columns=['sepal_length','sepal_width','petal_length','petal_width'])
})
prediction

X does not have valid feature names, but LogisticRegression was fitted with feature names


{'species': ['Setosa', 'Versicolour']}