# Vertex AI - KubeFlow Pipelines (KFP)

A lo largo del pipeline, se utilizan componentes custom que son creados a partir de código Python y cuentan con una serie de objetos propios de KFP, que son definidos en esta notebook y almacenados como objetos yaml para ser leídos en la notebook donde se ejecuta el pipeline en sí.

Cabe mencionar que también se pueden realizar pipelines -y componentes custom- utilizando TensorFlow Extended (TFX).

Throughout the pipeline, we use custom components created from Python code, together with some objects that come from KFP. These are defined within this notebook, stored as YAML files, and then read by the main notebook where the pipeline itself is executed.

It's worth mentioning that these pipelines -and custom components as well- can also be built using TensorFlow Extended (TFX).

In [1]:
from kfp.v2 import compiler, dsl
from kfp.v2.dsl import pipeline, component, Artifact, Dataset, Input, Metrics, Model, Output, InputPath, OutputPath
from typing import NamedTuple

In [2]:
@component(
    packages_to_install=[
        "pandas",
        "pyarrow",
        "scikit-learn==1.0.0",
        "google-cloud-bigquery",
        "google-cloud-bigquery-storage",
        "pandas-gbq",
        "google-cloud-aiplatform",
    ],
    base_image="python:3.9",
    output_component_file="bq_current_raw_to_stage.yaml"
)
def bq_current_raw_to_stage_ml(
    project: str,
    region: str,
    bq_current_raw_url: str,
    bq_current_stage_url: str,
    stage_data_bucket: str,
    gcs_predict_source: OutputPath(str)
):
    '''
    Stage the "current" BigQuery data so it can be used for prediction.

    Downloads the raw table, keeps the feature columns, one-hot encodes
    ``payment_type``, replaces the stage table with the result and uploads a
    timestamped CSV copy of the same data to the stage bucket.

    Args:
        project: GCP project id used for the BigQuery client and the upload.
        region: Location passed to the BigQuery client and aiplatform.init.
        bq_current_raw_url: Table reference string of the raw source table.
        bq_current_stage_url: Destination table for the processed data.
        stage_data_bucket: Name of the GCS bucket (without ``gs://``) that
            receives the CSV copy.
        gcs_predict_source: Output file path; the ``gs://`` URI of the
            uploaded CSV is written into it.
    '''
    from datetime import datetime

    import pandas as pd
    import pandas_gbq
    from google.cloud import aiplatform
    from google.cloud import bigquery
    from google.cloud import storage

    aiplatform.init(project=project, location=region)

    # --- download the raw table ---
    bqclient = bigquery.Client(project=project, location=region)
    table = bigquery.TableReference.from_string(bq_current_raw_url)
    data = bqclient.list_rows(table).to_dataframe(
        create_bqstorage_client=True,  # download through the BigQuery Storage API
    )

    # --- process ---
    df = data[['trip_month', 'trip_day', 'trip_day_of_week', 'trip_hour', 'trip_seconds', 'trip_miles', 'payment_type', 'euclidean']]

    # dtype=int keeps the dummy columns numeric on every pandas version
    # (pandas >= 2.0 would otherwise emit booleans, which neither match the
    # INTEGER schema below nor serialize as 0/1 in the CSV).
    df2 = pd.get_dummies(df, columns=['payment_type'], drop_first=True, dtype=int)
    df2.columns = df2.columns.str.replace(' ', '_')

    # NOTE(review): the dummy columns depend on the payment types present in
    # the data; the schema below assumes the usual six non-baseline categories
    # are all there - confirm for small/filtered tables.
    # No 'target' column here: the label is dropped on purpose to simulate
    # real, unlabeled data.
    schema_columns = [
        ('trip_month', 'INTEGER'),
        ('trip_day', 'INTEGER'),
        ('trip_day_of_week', 'INTEGER'),
        ('trip_hour', 'INTEGER'),
        ('trip_seconds', 'INTEGER'),
        ('trip_miles', 'FLOAT'),
        ('euclidean', 'FLOAT'),
        ('payment_type_Credit_Card', 'INTEGER'),
        ('payment_type_Dispute', 'INTEGER'),
        ('payment_type_Mobile', 'INTEGER'),
        ('payment_type_No_Charge', 'INTEGER'),
        ('payment_type_Prcard', 'INTEGER'),
        ('payment_type_Unknown', 'INTEGER'),
    ]
    table_schema = [{'name': name, 'type': bq_type} for name, bq_type in schema_columns]

    # --- upload to BigQuery ---
    # pandas_gbq.to_gbq is the supported entry point (DataFrame.to_gbq is
    # deprecated in recent pandas releases and merely forwards to it).
    pandas_gbq.to_gbq(
        df2,
        bq_current_stage_url,
        project_id=project,
        chunksize=None,
        if_exists='replace',  # the default ('fail') raises; here we always want to replace
        table_schema=table_schema,
    )

    # --- upload a timestamped CSV copy to the stage bucket ---
    TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
    DATA_PATH = f"predicted_data_{TIMESTAMP}.csv"
    df2.to_csv(DATA_PATH, index=False)

    gcsclient = storage.Client()
    bucket = gcsclient.get_bucket(stage_data_bucket)
    bucket.blob(DATA_PATH).upload_from_filename(DATA_PATH)

    # Hand the CSV location to downstream components.
    GCS_PREDICT_SOURCE = f"gs://{stage_data_bucket}/{DATA_PATH}"
    with open(gcs_predict_source, 'w') as f:
        f.write(GCS_PREDICT_SOURCE)

In [3]:
@component(
    packages_to_install=[
        "pandas",
        "pyarrow",
        "scikit-learn==1.0.0",
        "google-cloud-bigquery",
        "google-cloud-bigquery-storage",
        "pandas-gbq",
        "google-cloud-aiplatform",
    ],
    base_image="python:3.9",
    output_component_file="bq_historic_raw_to_stage.yaml"
)
def bq_historic_raw_to_stage_ml(
    project: str,
    region: str,
    bq_historic_raw_url: str,
    bq_historic_stage_url: str,

) -> str:
    '''
    Stage the historic BigQuery data used for training.

    Downloads the raw table, keeps the feature columns plus ``tip_bin``
    (renamed to ``target``), one-hot encodes ``payment_type`` and replaces the
    stage table with the result.

    Args:
        project: GCP project id used for the BigQuery client and the upload.
        region: Location passed to the BigQuery client and aiplatform.init.
        bq_historic_raw_url: Table reference string of the raw source table.
        bq_historic_stage_url: Destination table for the processed data.

    Returns:
        ``bq_historic_stage_url`` unchanged, so downstream components can be
        chained on this step's output.
    '''
    import pandas as pd
    import pandas_gbq
    from google.cloud import aiplatform
    from google.cloud import bigquery

    aiplatform.init(project=project, location=region)

    # --- download the raw table ---
    bqclient = bigquery.Client(project=project, location=region)
    table = bigquery.TableReference.from_string(bq_historic_raw_url)
    data = bqclient.list_rows(table).to_dataframe(
        create_bqstorage_client=True,  # download through the BigQuery Storage API
    )

    # --- process ---
    df = data[['trip_month', 'trip_day', 'trip_day_of_week', 'trip_hour', 'trip_seconds', 'trip_miles', 'payment_type', 'euclidean', 'tip_bin']]
    df = df.rename(columns={'tip_bin': 'target'})

    # dtype=int keeps the dummy columns numeric on every pandas version
    # (pandas >= 2.0 would otherwise emit booleans, which do not match the
    # INTEGER schema below).
    df2 = pd.get_dummies(df, columns=['payment_type'], drop_first=True, dtype=int)
    df2.columns = df2.columns.str.replace(' ', '_')

    schema_columns = [
        ('trip_month', 'INTEGER'),
        ('trip_day', 'INTEGER'),
        ('trip_day_of_week', 'INTEGER'),
        ('trip_hour', 'INTEGER'),
        ('trip_seconds', 'INTEGER'),
        ('trip_miles', 'FLOAT'),
        ('euclidean', 'FLOAT'),
        ('target', 'INTEGER'),
        ('payment_type_Credit_Card', 'INTEGER'),
        ('payment_type_Dispute', 'INTEGER'),
        ('payment_type_Mobile', 'INTEGER'),
        ('payment_type_No_Charge', 'INTEGER'),
        ('payment_type_Prcard', 'INTEGER'),
        ('payment_type_Unknown', 'INTEGER'),
    ]
    table_schema = [{'name': name, 'type': bq_type} for name, bq_type in schema_columns]

    # --- upload to BigQuery ---
    # pandas_gbq.to_gbq is the supported entry point (DataFrame.to_gbq is
    # deprecated in recent pandas releases and merely forwards to it).
    pandas_gbq.to_gbq(
        df2,
        bq_historic_stage_url,
        project_id=project,
        chunksize=None,  # per the original author, a single big chunk ran fastest
        if_exists='replace',  # the default ('fail') raises; we do not want that here
        table_schema=table_schema,
    )

    return bq_historic_stage_url

In [4]:
@component(
    packages_to_install=[
        "pandas",
        "pyarrow",
        "scikit-learn==1.0.0",
        "google-cloud-bigquery",
        "google-cloud-bigquery-storage",
        # Imported below; declared explicitly instead of relying on another
        # package pulling it in transitively.
        "google-cloud-storage",
    ],
    base_image="python:3.9",
    output_component_file="get_chicago_data.yaml"
)
def get_chicago_data(
    project: str,
    region: str,
    bq_source_url: str,
    stage_data_bucket: str,
    dataset_train: Output[Dataset],
    dataset_val: Output[Dataset],
    dataset_test: Output[Dataset]
):
    '''
    Split the historic BigQuery table into train, validation and test sets.

    The table is downloaded, split 70/30 into train+validation and test, and
    the first part is split again 80/20 into train and validation. Each
    partition is uploaded as a CSV to the stage bucket and also written to the
    corresponding output artifact path (with a ``.csv`` suffix).

    Args:
        project: GCP project id used for the BigQuery client.
        region: Location passed to the BigQuery client.
        bq_source_url: Table reference string of the table to download.
        stage_data_bucket: Name of the GCS bucket receiving the CSV copies.
        dataset_train: Output artifact for the training partition.
        dataset_val: Output artifact for the validation partition.
        dataset_test: Output artifact for the test partition.
    '''
    from sklearn.model_selection import train_test_split as tts
    from google.cloud import bigquery
    from google.cloud import storage

    TRAIN_DATA_PATH = 'chicago_taxi_train.csv'
    VAL_DATA_PATH = 'chicago_taxi_val.csv'
    TEST_DATA_PATH = 'chicago_taxi_test.csv'

    # --- download the source table ---
    bqclient = bigquery.Client(project=project, location=region)
    table = bigquery.TableReference.from_string(bq_source_url)
    data = bqclient.list_rows(table).to_dataframe(
        create_bqstorage_client=True,  # download through the BigQuery Storage API
    )

    # --- split into train, validation and test ---
    # NOTE(review): no random_state is set, so the partitions differ per run.
    train, test = tts(data, test_size=0.3)
    train_data, val_data = tts(train, test_size=0.2)

    # Local copies destined for the bucket. These keep the DataFrame index as
    # a leading unnamed column, exactly as before.
    train_data.to_csv(TRAIN_DATA_PATH)
    val_data.to_csv(VAL_DATA_PATH)
    test.to_csv(TEST_DATA_PATH)

    # --- upload the local CSVs to the stage bucket ---
    gcsclient = storage.Client()
    bucket = gcsclient.get_bucket(stage_data_bucket)
    for local_path in (TRAIN_DATA_PATH, VAL_DATA_PATH, TEST_DATA_PATH):
        bucket.blob(local_path).upload_from_filename(local_path)

    # --- write the pipeline artifacts (no index column) ---
    train_data.to_csv(dataset_train.path + ".csv", index=False, encoding='utf-8-sig')
    val_data.to_csv(dataset_val.path + ".csv", index=False, encoding='utf-8-sig')
    test.to_csv(dataset_test.path + ".csv", index=False, encoding='utf-8-sig')

In [5]:
@component(
    packages_to_install=[
        "pandas",
        # "sklearn" is a deprecated PyPI stub whose installation now fails;
        # the real distribution is "scikit-learn".
        "scikit-learn",
    ], base_image="python:3.9",
    output_component_file="train_rf_chicago.yaml"
)
def train_rf_chicago(
    dataset:  Input[Dataset],
    model: Output[Model],
):
    '''
    Train a Random Forest classifier on the training partition.

    Reads ``<dataset.path>.csv``, fits a default ``RandomForestClassifier``
    using the ``target`` column as label and every other column as feature,
    and pickles the fitted model to ``<model.path>.pkl``.

    Args:
        dataset: Input artifact pointing at the training CSV.
        model: Output artifact; receives the pickle and ``framework`` metadata.
    '''
    from sklearn.ensemble import RandomForestClassifier
    import pandas as pd
    import pickle

    data = pd.read_csv(dataset.path + ".csv")

    model_rf = RandomForestClassifier()
    model_rf.fit(
        data.drop(columns=["target"]),
        data.target,
    )

    model.metadata["framework"] = "RF"

    # NOTE: the pickle must be loaded with a compatible scikit-learn version.
    file_name = model.path + ".pkl"
    with open(file_name, 'wb') as file:
        pickle.dump(model_rf, file)

In [6]:
@component(
    packages_to_install=[
        "pandas",
        # "sklearn" is a deprecated PyPI stub whose installation now fails;
        # the real distribution is "scikit-learn".
        "scikit-learn",
    ], base_image="python:3.9",
    output_component_file="train_lr_chicago.yaml"
)
def train_lr_chicago(
    dataset:  Input[Dataset],
    model: Output[Model],
):
    '''
    Train a Logistic Regression classifier on the training partition.

    Reads ``<dataset.path>.csv``, fits a default ``LogisticRegression`` using
    the ``target`` column as label and every other column as feature, and
    pickles the fitted model to ``<model.path>.pkl``.

    Args:
        dataset: Input artifact pointing at the training CSV.
        model: Output artifact; receives the pickle and ``framework`` metadata.
    '''
    from sklearn.linear_model import LogisticRegression
    import pandas as pd
    import pickle

    data = pd.read_csv(dataset.path + ".csv")

    model_lr = LogisticRegression()
    model_lr.fit(
        data.drop(columns=["target"]),
        data.target,
    )

    model.metadata["framework"] = "LR"

    # NOTE: the pickle must be loaded with a compatible scikit-learn version.
    file_name = model.path + ".pkl"
    with open(file_name, 'wb') as file:
        pickle.dump(model_lr, file)

In [7]:
@component(
    packages_to_install=[
        "pandas",
        # "sklearn" is a deprecated PyPI stub whose installation now fails;
        # the real distribution is "scikit-learn".
        "scikit-learn",
    ], base_image="python:3.9",
    output_component_file="model_evaluation.yaml"
)
def model_evaluation(
    val_set:  Input[Dataset],
    lr_chicago_model: Input[Model],
    rf_chicago_model: Input[Model],
    lr_kpi: Output[Metrics],
    rf_kpi: Output[Metrics],
    winning_model_name: Output[Artifact],
):
    '''
    Compare the two trained models on the validation set using F1 score.

    Loads both pickled models, scores them on ``<val_set.path>.csv``, logs
    each F1 score and publishes the class name of the winner (NOT the model
    itself) as artifact metadata under the ``model`` key. On a tie the
    Logistic Regression model wins.

    Args:
        val_set: Input artifact pointing at the validation CSV.
        lr_chicago_model: Input artifact of the pickled Logistic Regression.
        rf_chicago_model: Input artifact of the pickled Random Forest.
        lr_kpi: Output metrics receiving the Logistic Regression F1 score.
        rf_kpi: Output metrics receiving the Random Forest F1 score.
        winning_model_name: Output artifact whose metadata names the winner.
    '''
    import pickle

    import pandas as pd
    from sklearn.metrics import f1_score

    def load_pickled_model(path):
        # NOTE: pickle executes code on load; only artifacts produced by this
        # pipeline should ever be read here.
        with open(path, 'rb') as file:
            return pickle.load(file)

    rf_model = load_pickled_model(rf_chicago_model.path + ".pkl")
    lr_model = load_pickled_model(lr_chicago_model.path + ".pkl")

    data = pd.read_csv(val_set.path + ".csv")
    features = data.drop(columns=["target"])
    target = data.target

    y_pred_rf = rf_model.predict(features)
    y_pred_lr = lr_model.predict(features)

    # --- model selection ---
    rf_f1 = f1_score(target, y_pred_rf.round())
    lr_f1 = f1_score(target, y_pred_lr.round())

    # Compare the scores directly. Looking the winner up in a dict keyed by
    # the float scores breaks on ties, because equal scores collapse into a
    # single key.
    best_model = lr_model if lr_f1 >= rf_f1 else rf_model

    rf_kpi.log_metric("f1_score", float(rf_f1))
    lr_kpi.log_metric("f1_score", float(lr_f1))

    winning_model_name.metadata = {'model': type(best_model).__name__}

In [8]:
@component(
    packages_to_install=["pandas", "scikit-learn==1.0.0", "google-cloud-bigquery", "google-cloud-bigquery-storage", "google-cloud-aiplatform"],
    base_image="python:3.9",
    output_component_file="best_model_hp_tuning.yaml"
)
def best_model_hp_tuning(
    project: str,
    region: str,
    stage_data_bucket: str,
    winning_model_name: Input[Artifact],
    model_spec: Output[Artifact],
    trials: int,
    parallel_trials: int,
    kpi: Output[Metrics],
    model_name: Output[Metrics]
):
    '''
    Run a Vertex AI hyperparameter tuning job for the winning model.

    Picks the training image and search space that match the winning model
    name, runs the tuning job, selects the trial with the highest
    ``f1_score`` and publishes its hyperparameters as ``model_spec`` metadata.

    Args:
        project: GCP project id passed to aiplatform.init.
        region: Location passed to aiplatform.init.
        stage_data_bucket: Bucket name used as the custom job staging bucket.
        winning_model_name: Input artifact whose ``model`` metadata entry
            names the winning estimator class.
        model_spec: Output artifact receiving the best hyperparameters.
        trials: Maximum number of trials.
        parallel_trials: Number of trials run in parallel.
        kpi: Output metrics receiving the best trial's F1 score.
        model_name: Output metrics receiving the winning model name.

    Raises:
        ValueError: If the winning model name has no tuning configuration.
        RuntimeError: If no trial reported an ``f1_score``.
    '''
    from datetime import datetime

    from google.cloud import aiplatform
    from google.cloud.aiplatform import hyperparameter_tuning as hpt
    from google.protobuf.json_format import MessageToDict
    import pandas as pd

    aiplatform.init(project=project, location=region)

    # Training images, one per supported model.
    RF_HP_IMAGE = "gcr.io/vertex-testing-327520/rf_hp_job:v1"
    LR_HP_IMAGE = "gcr.io/vertex-testing-327520/lr_hp_job:v1"

    # Search spaces. Each key is the parameter_id, which is passed into the
    # training job as a command line argument; each value is its spec.
    lr_parameter_spec = {
        "penalty": hpt.CategoricalParameterSpec(values=['l1', 'l2']),
        "C": hpt.DoubleParameterSpec(min=0.001, max=1, scale="log"),
        "solver": hpt.CategoricalParameterSpec(values=['saga', 'liblinear'])
    }
    rf_parameter_spec = {
        "max_leaf_nodes": hpt.DiscreteParameterSpec(values=[4, 8, 10], scale=None),
        "max_depth": hpt.DiscreteParameterSpec(values=[4, 8, 10], scale=None),
        "n_estimators": hpt.DiscreteParameterSpec(values=[5, 7, 9], scale=None)
    }

    tuning_setups = {
        'LogisticRegression': (LR_HP_IMAGE, lr_parameter_spec),
        'RandomForestClassifier': (RF_HP_IMAGE, rf_parameter_spec),
    }

    WINNING_MODEL_NAME = winning_model_name.metadata.get('model')

    # Fail early with a clear message rather than submitting a job with a
    # missing image and search space.
    if WINNING_MODEL_NAME not in tuning_setups:
        raise ValueError(
            f"Unsupported winning model {WINNING_MODEL_NAME!r}; "
            f"expected one of {sorted(tuning_setups)}"
        )
    WINNING_MODEL_IMAGE, parameter_spec = tuning_setups[WINNING_MODEL_NAME]

    TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
    DISPLAY_NAME = f"{WINNING_MODEL_NAME}-{TIMESTAMP}"

    worker_pool_specs = [{
        "machine_spec": {
            "machine_type": "n1-standard-8",
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": WINNING_MODEL_IMAGE
        }
    }]

    metric_spec = {'f1_score': 'maximize'}

    hp_custom_job = aiplatform.CustomJob(display_name=DISPLAY_NAME,
                                         worker_pool_specs=worker_pool_specs,
                                         staging_bucket=f'gs://{stage_data_bucket}')

    hp_job = aiplatform.HyperparameterTuningJob(
        display_name=DISPLAY_NAME,
        custom_job=hp_custom_job,
        metric_spec=metric_spec,
        parameter_spec=parameter_spec,
        max_trial_count=trials,
        parallel_trial_count=parallel_trials
    )

    hp_job.run()

    def get_trials_as_df(trials):
        # Flatten the trial messages into one row per trial: id/state/times,
        # the parameter values and, for succeeded trials, the final metrics.
        results = []
        for trial in trials:
            t = MessageToDict(trial._pb)
            row = {
                "Trial ID": t["id"],
                "Status": t["state"],
                "Start time": t["startTime"],
                "End time": t.get("endTime", None),
            }

            for param in t["parameters"]:
                row[param["parameterId"]] = param["value"]

            if t["state"] == "SUCCEEDED":
                row["Training step"] = t["finalMeasurement"]["stepCount"]
                for metric in t["finalMeasurement"]["metrics"]:
                    row[metric["metricId"]] = metric["value"]
            results.append(row)

        return pd.DataFrame(results)

    df_trials = get_trials_as_df(hp_job.trials)

    # The metric column only exists when at least one trial succeeded.
    if "f1_score" not in df_trials.columns or df_trials["f1_score"].isna().all():
        raise RuntimeError("No hyperparameter tuning trial reported an f1_score metric.")

    # Trial id of the best run, then the row(s) that describe it.
    best_trial_id = df_trials.loc[df_trials["f1_score"].idxmax()]["Trial ID"]
    best_run = df_trials[df_trials['Trial ID'] == best_trial_id]

    # Keep only the hyperparameters tuned in this run. The full orient name
    # is required: the 'r' shorthand was removed in pandas 2.0.
    param_names = list(parameter_spec)
    best_run_parameters = best_run[param_names].to_dict('records')[0]

    model_spec.metadata = best_run_parameters

    # .iloc[0] instead of float(Series), which is deprecated in recent pandas.
    kpi_acc = float(best_run['f1_score'].iloc[0])

    kpi.log_metric("f1_score", kpi_acc)
    model_name.log_metric('model', WINNING_MODEL_NAME)

In [9]:
@component(
    packages_to_install=[
        "pandas",
        # "sklearn" is a deprecated PyPI stub whose installation now fails;
        # the real distribution is "scikit-learn".
        "scikit-learn",
    ], base_image="python:3.9",
    output_component_file="train_best_model.yaml"
)
def train_best_model(
    dataset_train:  Input[Dataset],
    dataset_val: Input[Dataset],
    model: Output[Model],
    parameters: Input[Artifact],
    winning_model_name: Input[Artifact],
):
    '''
    Train the selected model with the tuned hyperparameters.

    Instantiates the winning estimator with the hyperparameters stored in
    ``parameters.metadata``, fits it on the concatenation of the train and
    validation partitions and pickles it to ``<model.path>.pkl``.

    Args:
        dataset_train: Input artifact pointing at the training CSV.
        dataset_val: Input artifact pointing at the validation CSV.
        model: Output artifact; receives the pickle and ``framework`` metadata.
        parameters: Input artifact whose metadata holds the hyperparameters.
        winning_model_name: Input artifact whose ``model`` metadata entry
            names the estimator class to train.

    Raises:
        ValueError: If the winning model name is not a supported estimator.
    '''
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    import pandas as pd
    import pickle

    estimators = {
        'LogisticRegression': LogisticRegression,
        'RandomForestClassifier': RandomForestClassifier,
    }

    # --- model name and hyperparameters ---
    WINNING_MODEL_NAME = winning_model_name.metadata.get('model')
    if WINNING_MODEL_NAME not in estimators:
        # Fail here with a clear message rather than later on `None.fit`.
        raise ValueError(
            f"Unsupported winning model {WINNING_MODEL_NAME!r}; "
            f"expected one of {sorted(estimators)}"
        )

    best_parameters = dict(parameters.metadata)

    # Integer hyperparameters may come back as floats (e.g. 4.0) after the
    # round trip through the tuning service / artifact metadata, and
    # scikit-learn rejects non-integral types for them.
    # NOTE(review): defensive - confirm how the values actually arrive.
    for name in ("n_estimators", "max_depth", "max_leaf_nodes"):
        value = best_parameters.get(name)
        if isinstance(value, float) and value.is_integer():
            best_parameters[name] = int(value)

    best_model = estimators[WINNING_MODEL_NAME](**best_parameters)

    # --- data: train and validation partitions together ---
    data_train = pd.read_csv(dataset_train.path + ".csv")
    data_val = pd.read_csv(dataset_val.path + ".csv")
    data = pd.concat([data_train, data_val])

    # --- train ---
    best_model.fit(
        data.drop(columns=["target"]),
        data.target,
    )
    model.metadata["framework"] = WINNING_MODEL_NAME

    # NOTE: the pickle must be loaded with a compatible scikit-learn version.
    file_name = model.path + ".pkl"
    with open(file_name, 'wb') as file:
        pickle.dump(best_model, file)
    

In [10]:
@component(
    packages_to_install=[
        "pandas",
        # "sklearn" is a deprecated PyPI stub whose installation now fails;
        # the real distribution is "scikit-learn".
        "scikit-learn",
    ], base_image="python:3.9",
    output_component_file="best_model_evaluation.yaml"
)
def best_model_evaluation(
    test_set:  Input[Dataset],
    winning_model_name: Input[Artifact],
    best_model: Input[Model],
    best_model_kpi: Output[Metrics],
    threshold: float
)-> NamedTuple("Outputs", [("dep_decision", str)]):
    '''
    Evaluate the final model on the test set and decide whether to deploy.

    Loads the pickled model from ``<best_model.path>.pkl``, computes its F1
    score on ``<test_set.path>.csv`` and compares it with ``threshold``.

    Args:
        test_set: Input artifact pointing at the test CSV.
        winning_model_name: Input artifact naming the winning estimator. It is
            not needed to load the pickle and is kept for interface
            compatibility.
        best_model: Input artifact of the pickled, tuned model.
        best_model_kpi: Output metrics receiving the test F1 score.
        threshold: Minimum F1 score required to approve deployment.

    Returns:
        A one-element tuple ``(dep_decision,)`` where ``dep_decision`` is
        ``'true'`` when the F1 score reaches the threshold, else ``'false'``.
    '''
    import pickle

    import pandas as pd
    from sklearn.metrics import f1_score

    # The pickle already carries the estimator class, so no instance needs to
    # be created beforehand.
    # NOTE: pickle executes code on load; only artifacts produced by this
    # pipeline should ever be read here.
    file_name = best_model.path + ".pkl"
    with open(file_name, 'rb') as file:
        model = pickle.load(file)

    data = pd.read_csv(test_set.path + ".csv")
    features = data.drop(columns=["target"])

    y_pred = model.predict(features)

    # --- model evaluation ---
    f1_value = f1_score(data.target, y_pred.round())

    # --- deployment decision ---
    dep_decision = 'true' if f1_value >= threshold else 'false'

    # --- store the metric ---
    best_model_kpi.log_metric("f1_score", float(f1_value))

    return (dep_decision, )

In [11]:
@component(
    packages_to_install=["pandas", "scikit-learn==1.0.0", "google-cloud-bigquery", "google-cloud-bigquery-storage", "google-cloud-aiplatform"],
    base_image="python:3.9",
    output_component_file="upload_model_to_vertex_and_batch_prediction.yaml"
)
def upload_model_to_vertex_and_batch_prediction(
    project: str,
    region: str,
    serving_container: str,
    trained_model: Input[Model],
    winning_model_name: Input[Artifact],
    gcs_predict_source: str,
    gcs_predict_dest: str

):
    '''
    Upload the trained model to Vertex AI and run a batch prediction job.

    The directory that contains the pickled model is registered as a Vertex
    managed model, which is then used to run a CSV batch prediction.

    Args:
        project: GCP project id passed to aiplatform.init.
        region: Location passed to aiplatform.init.
        serving_container: URI of the serving container image for the model.
        trained_model: Input artifact of the trained model; its ``uri`` is
            used to locate the model directory.
        winning_model_name: Input artifact whose ``model`` metadata entry is
            used to build the display name.
        gcs_predict_source: GCS URI of the CSV with the instances to predict.
        gcs_predict_dest: GCS prefix where the predictions are written.
    '''
    from datetime import datetime
    from typing import Optional

    from google.cloud import aiplatform

    WINNING_MODEL_NAME = winning_model_name.metadata.get('model')
    TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
    DISPLAY_NAME = WINNING_MODEL_NAME + '-' + TIMESTAMP

    # Vertex expects the directory holding the model file, not the artifact
    # itself, so the last path segment of the URI is dropped and the trailing
    # slash kept.
    MODEL_URI = trained_model.uri
    MODEL_PATH = MODEL_URI.rsplit('/', 1)[0] + '/'

    def upload_model_sample(
        project: str,
        location: str,
        display_name: str,
        serving_container_image_uri: str,
        artifact_uri: Optional[str] = None,
        sync: bool = True,
    ):
        # Register the artifacts as a Vertex managed model and wait for it.
        aiplatform.init(project=project, location=location)

        model = aiplatform.Model.upload(
            display_name=display_name,
            artifact_uri=artifact_uri,
            # Use the helper's own argument, not the enclosing variable.
            serving_container_image_uri=serving_container_image_uri,
            sync=sync,
        )

        model.wait()

        print(model.display_name)
        print(model.resource_name)
        return model

    model_test = upload_model_sample(
        project=project,
        location=region,
        display_name=DISPLAY_NAME,
        serving_container_image_uri=serving_container,
        artifact_uri=MODEL_PATH
    )

    batch_job = model_test.batch_predict(
        job_display_name=DISPLAY_NAME,
        gcs_source=gcs_predict_source,
        instances_format="csv",
        gcs_destination_prefix=gcs_predict_dest,
        machine_type='n1-standard-16'
    )