# Faza 1: Stworzenie komponentów do pipeline'ów

Ten notebook ma na celu implementację poszczególnych komponentów potoku na podstawie notebooka Jupyter.
    

In [18]:
import kfp
from kfp import dsl
from kfp.dsl import (Artifact,
                        Dataset,
                        Input,
                        InputPath,
                        Model,
                        Output,
                        OutputPath,
                        component,
                        Metrics)
from kfp import compiler
from google.cloud import aiplatform
from typing import NamedTuple
import logging

In [19]:

# --- Component 1: Data preparation ---
@component(
    base_image="python:3.9",
    packages_to_install=["pandas==2.2.2", "gcsfs", "fsspec", "pyarrow"],
)
def get_data(gcs_input_path: str, input_data: Output[Dataset]):
    """Read a CSV file and store it unchanged in the output dataset artifact.

    Args:
        gcs_input_path: Location of the source CSV; passed directly to
            ``pandas.read_csv``.
        input_data: Output dataset artifact. The loaded frame is written to
            its ``path`` as CSV, without the index column.
    """
    # pandas is imported inside the function body; it is provided to the
    # component through ``packages_to_install`` above.
    import pandas as pd

    # Load the source data and persist it as the component's output.
    raw_frame = pd.read_csv(gcs_input_path)
    raw_frame.to_csv(input_data.path, index=False)


Definicja komponentów i stworzenie pipeline'u

In [None]:


# --- Main Vertex AI pipeline definition ---
@kfp.dsl.pipeline(
    name="test",
    description="Potok trenujący i rejestrujący model SVC.",
    pipeline_root="gs://vertex-ai-bucket-s25537",
)
def penguin_pipeline(
    gcs_data_path: str = "gs://data-s25537/penguins.csv",
    project_id: str = "mlops-on-gcp-s25537",
    region: str = "us-central1",
    model_name: str = "puffin",
    test_split_ratio: float = 0.3,
    min_accuracy_threshold: float = 95.0,
):
    """Define the pipeline workflow with conditional model registration.

    The steps are chained as: get_data -> preprocess_data -> train_svc_model
    -> evaluate_svc_model, followed by register_model, which only runs when
    the evaluated accuracy reaches the threshold.

    Args:
        gcs_data_path: Forwarded to ``get_data`` as ``gcs_input_path``.
        project_id: Forwarded to ``register_model``.
        region: Forwarded to ``register_model``.
        model_name: Forwarded to ``register_model`` as ``model_display_name``.
        test_split_ratio: Forwarded to ``preprocess_data``.
        min_accuracy_threshold: Minimum value of the ``accuracy`` output of
            ``evaluate_svc_model`` required for registration.
    """
    # Step 1: load the raw data.
    ingest_step = get_data(gcs_input_path=gcs_data_path)

    # Step 2: preprocess the data and produce the train/test datasets.
    preprocess_step = preprocess_data(
        input_data=ingest_step.outputs["input_data"],
        test_split_ratio=test_split_ratio,
    )

    # Step 3: train the model on the training dataset.
    training_step = train_svc_model(
        train_dataset=preprocess_step.outputs["train_dataset"],
    )

    # Step 4: evaluate the trained model on the test dataset.
    evaluation_step = evaluate_svc_model(
        test_dataset=preprocess_step.outputs["test_dataset"],
        model=training_step.outputs["model"],
    )

    # Condition: register the model only if the accuracy is high enough.
    # NOTE(review): the 95.0 default suggests accuracy is reported on a
    # 0-100 scale -- confirm against evaluate_svc_model's output.
    accuracy_is_sufficient = (
        evaluation_step.outputs["accuracy"] >= min_accuracy_threshold
    )
    with dsl.If(accuracy_is_sufficient, name="accuracy-check"):
        register_model(
            project_id=project_id,
            region=region,
            model_display_name=model_name,
            model=training_step.outputs["model"],
        )


In [25]:
# Compile the pipeline function to a local spec file, then execute that spec
# as a Vertex AI PipelineJob.
pipeline_package_path = "penguin_svc_pipeline_with_registry.json"

pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=penguin_pipeline,
    package_path=pipeline_package_path,
)

# Point the Vertex AI SDK at the target project and region.
aiplatform.init(project="mlops-on-gcp-s25537", location="us-central1")

# The job uses the spec file compiled above as its template.
job = aiplatform.PipelineJob(
    display_name="penguin-svc-with-registry-run",
    template_path=pipeline_package_path,
    pipeline_root="gs://vertex-ai-bucket-s25537",
    enable_caching=True,
)

print("Uruchamianie potoku w Vertex AI...")
# NOTE(review): per the SDK documentation, run() is synchronous by default,
# so the message below is printed only once the run has ended. If the intent
# is fire-and-forget, job.submit() may be the better fit -- confirm.
job.run()
print("Potok został uruchomiony. Sprawdź postęp w konsoli Google Cloud.")

Uruchamianie potoku w Vertex AI...
Creating PipelineJob
PipelineJob created. Resource name: projects/739663413023/locations/us-central1/pipelineJobs/penguin-svc-classification-with-registry-20250705184555
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/739663413023/locations/us-central1/pipelineJobs/penguin-svc-classification-with-registry-20250705184555')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/penguin-svc-classification-with-registry-20250705184555?project=739663413023
PipelineJob projects/739663413023/locations/us-central1/pipelineJobs/penguin-svc-classification-with-registry-20250705184555 current state:
3
PipelineJob projects/739663413023/locations/us-central1/pipelineJobs/penguin-svc-classification-with-registry-20250705184555 current state:
3
PipelineJob projects/739663413023/locations/us-central1/pipelineJobs/penguin-svc-classification-with-registry-20250705184555 current 