<font color="#CA3532"><h1 align="left">Master Data Analytics. EDEM.</h1></font>
<font color="#6E6E6E"><h2 align="left">Herramientas MLOps.</h2></font> 
<font color="#6E6E6E"><h2 align="left">Tarea 1. Pipeline entrenamiento de modelos.</h2></font> 
#### Daniel Ruiz Riquelme
Reference: [Metaflow basics](https://docs.metaflow.org/metaflow/basics)

## Install dependencies

In [None]:
#!pip install metaflow
#!pip install scikit-learn
#!pip install pandas

## Set username

In [1]:
# Pin the USERNAME environment variable for the workflow runs started from this notebook.
# NOTE(review): the recorded run output further down reports `user:maccharlie`,
# so confirm this value is actually the one picked up by the workflow tool.
import os

os.environ.update({"USERNAME": "carlos"})

In [8]:
%%writefile metaflow_trainingflow.py
from metaflow import FlowSpec, Parameter, step
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

class TrainingFlow(FlowSpec):
    """Linear Metaflow pipeline that trains a random forest on the Iris dataset.

    Steps run in this order::

        start -> ingest_data -> split_data -> train
              -> show_metrics -> register_model -> end

    Values assigned to ``self`` inside a step are carried over to the
    following steps (Metaflow persists them as artifacts of the run).

    Parameters (settable from the command line, e.g. ``--max_depth 3``):
        max_depth: Maximum depth of each tree in the forest.
        n_estimators: Number of trees in the forest.
        random_state: Seed used for BOTH the train/test split and the forest.
        test_size: Fraction of the samples held out for evaluation.
        model_path: File the fitted model is pickled to.
    """

    max_depth = Parameter('max_depth', default=2, help='Max depth of the random forest classifier')
    n_estimators = Parameter('n_estimators', default=100, help='Number of estimators for the random forest classifier')
    # The seed feeds train_test_split as well as the classifier, so the help text says so.
    random_state = Parameter('random_state', default=0, help='Random state for the train/test split and the random forest classifier')
    # Previously hard-coded values; the defaults reproduce the original behaviour.
    test_size = Parameter('test_size', default=0.2, help='Fraction of the dataset held out as the test set')
    model_path = Parameter('model_path', default='random_forest_model.pkl', help='Path of the file the trained model is pickled to')

    @step
    def start(self):
        """Entry point of the flow; does no work and hands over to ``ingest_data``."""
        self.next(self.ingest_data)

    @step
    def ingest_data(self):
        """Load the Iris dataset and store features in ``self.X`` and labels in ``self.y``."""
        from sklearn.datasets import load_iris

        iris = load_iris()

        self.X = iris.data
        self.y = iris.target

        self.next(self.split_data)

    @step
    def split_data(self):
        """Split ``X``/``y`` into train and test sets.

        The held-out fraction is ``test_size`` and the shuffle is seeded with
        ``random_state`` so the split is reproducible between runs.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=self.test_size, random_state=self.random_state)
        self.next(self.train)

    @step
    def train(self):
        """Fit a ``RandomForestClassifier`` on the training split and keep it in ``self.model``."""
        self.model = RandomForestClassifier(
            max_depth=self.max_depth,
            n_estimators=self.n_estimators,
            random_state=self.random_state
        )
        self.model.fit(self.X_train, self.y_train)
        self.next(self.show_metrics)

    @step
    def show_metrics(self):
        """Evaluate the model on the test split.

        Stores the predictions, the accuracy and the text classification
        report on ``self`` and prints the last two.
        """
        self.y_pred = self.model.predict(self.X_test)
        self.accuracy = accuracy_score(self.y_test, self.y_pred)
        self.report = classification_report(self.y_test, self.y_pred)
        print(f"Accuracy: {self.accuracy}")
        print(f"Classification Report:\n{self.report}")
        self.next(self.register_model)

    @step
    def register_model(self):
        """Pickle the fitted model to ``model_path``.

        A relative path is resolved against the working directory of the
        process running the step. An existing file is overwritten.
        """
        with open(self.model_path, 'wb') as f:
            pickle.dump(self.model, f)
        print(f"Model saved as {self.model_path}")
        self.next(self.end)

    @step
    def end(self):
        """Terminal step required by Metaflow; nothing left to do."""
        pass
    
# When this file is executed as a script (e.g. `python metaflow_trainingflow.py run ...`),
# instantiate the flow; the class is not instantiated when the module is merely imported.
if __name__ == '__main__':
    TrainingFlow()


Overwriting metaflow_trainingflow.py


In [10]:
# Execute the flow file written above with the `run` command, passing the three
# flow parameters explicitly (the values shown equal the defaults declared in the flow).
!python metaflow_trainingflow.py run --max_depth 2 --n_estimators 100 --random_state 0

[35m[1mMetaflow 2.12.5[0m[35m[22m executing [0m[31m[1mTrainingFlow[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:maccharlie[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m[22mRunning pylint...[K[0m[35m[22m[0m
[32m[22m    Pylint not found, so extra checks are disabled.[K[0m[32m[22m[0m
[35m2024-06-22 12:43:32.382 [0m[1mWorkflow starting (run-id 1719053012381214):[0m
[35m2024-06-22 12:43:32.391 [0m[32m[1719053012381214/start/1 (pid 64709)] [0m[1mTask is starting.[0m
[35m2024-06-22 12:43:32.820 [0m[32m[1719053012381214/start/1 (pid 64709)] [0m[1mTask finished successfully.[0m
[35m2024-06-22 12:43:32.824 [0m[32m[1719053012381214/ingest_data/2 (pid 64712)] [0m[1mTask is starting.[0m
[35m2024-06-22 12:43:33.283 [0m[32m[1719053012381214/ingest_data/2 (pid 64712)] [0m[1mTask finished successfully.[0m
[35m2024-06-22 12:43:33.288 [0m[32m[17