# Faza 1: Stworzenie komponentów do pipeline'ów

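Ten notebook ma na celu zdefiniowanie komponentów Kubeflow Pipelines (pobranie danych, przetwarzanie wstępne, trenowanie i ocena modelu SVC, rejestracja modelu) oraz złożenie ich w potok uruchamiany w Vertex AI.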
    

In [18]:
import kfp
from kfp import dsl
from kfp.dsl import (Artifact,
                        Dataset,
                        Input,
                        InputPath,
                        Model,
                        Output,
                        OutputPath,
                        component,
                        Metrics)
from kfp import compiler
from google.cloud import aiplatform
from typing import NamedTuple
import logging

In [19]:

# --- Component 1: Data preparation ---
@component(
    base_image="python:3.9",
    packages_to_install=["pandas==2.2.2", "gcsfs", "fsspec", "pyarrow"],
)
def get_data(gcs_input_path: str, input_data: Output[Dataset]):
    """Copy the source CSV into a pipeline Dataset artifact.

    Args:
        gcs_input_path: Location of the CSV file; passed unchanged to
            ``pandas.read_csv``.
        input_data: Output artifact. The table is written as CSV, without
            the index column, to ``input_data.path``.
    """
    import pandas as pd

    # Load the raw table and persist it unchanged for the downstream steps.
    raw_frame = pd.read_csv(gcs_input_path)
    raw_frame.to_csv(input_data.path, index=False)


Definicja komponentów i stworzenie pipeline'u

In [20]:
@component(
    packages_to_install=["pandas==2.2.2", "scikit-learn==1.5.0", "pyarrow"],
    base_image="python:3.9"
)
def preprocess_data(
    input_data: Input[Dataset],
    train_dataset: Output[Dataset],
    test_dataset: Output[Dataset],
    test_split_ratio: float,
):
    """Clean, impute, encode and split the data into train/test artifacts.

    Args:
        input_data: CSV artifact containing the columns ``sex``, ``island``,
            ``species`` and the four numerical measurement columns listed in
            ``numerical_cols`` below.
        train_dataset: Output artifact receiving the training rows as CSV.
        test_dataset: Output artifact receiving the test rows as CSV.
        test_split_ratio: Passed to ``train_test_split`` as ``test_size``.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split

    df = pd.read_csv(input_data.path)

    # Hard-coded correction of a single record. Guard the label first:
    # assigning through .loc to a label that does not exist would silently
    # append a new row that is NaN everywhere else (including 'species').
    if 336 in df.index:
        df.loc[336, 'sex'] = 'FEMALE'

    # Numerical gaps are filled with the column median.
    numerical_cols = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']
    for col in numerical_cols:
        df[col] = df[col].fillna(df[col].median())

    # Any label other than MALE/FEMALE is treated as missing so that it gets
    # imputed with the mode instead of turning into NaN during the mapping.
    sex_codes = {'MALE': 0, 'FEMALE': 1}
    df['sex'] = df['sex'].where(df['sex'].isin(list(sex_codes)))
    df['sex'] = df['sex'].fillna(df['sex'].mode()[0])
    df['sex'] = df['sex'].map(sex_codes)

    # One-hot encode the island, dropping the first level.
    df_processed = pd.get_dummies(df, columns=['island'], drop_first=True)

    X = df_processed.drop('species', axis=1)
    y = df_processed['species']
    # Stratify on the target so both splits keep the class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_split_ratio, random_state=42, stratify=y
    )
    train_df = pd.concat([X_train, y_train], axis=1)
    test_df = pd.concat([X_test, y_test], axis=1)

    # Write the splits to the output artifacts.
    train_df.to_csv(train_dataset.path, index=False)
    test_df.to_csv(test_dataset.path, index=False)

In [21]:
# --- Component 3: SVC model training ---
@component(
    packages_to_install=[
        "kfp",
        "pandas==2.2.2",
        "pyarrow",
        "scikit-learn==1.5.0",
        "gcsfs==2024.6.0",
        "fsspec",
        "click==8.1.7",
        "docstring-parser==0.16",
        "urllib3",
        "protobuf",
    ],
    base_image="python:3.9",
)
def train_svc_model(
    train_dataset: Input[Dataset],
    model: Output[Model],
):
    """Train an SVC classifier and save it as a pickle file.

    Args:
        train_dataset: CSV artifact with the feature columns and a
            ``species`` target column.
        model: Output artifact. The fitted pipeline is pickled to
            ``model.path`` with a ``.pkl`` suffix appended.
    """
    import pickle

    import pandas as pd
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC

    print(f"train_data : {train_dataset}")
    print(f"model : {model}")

    training_frame = pd.read_csv(train_dataset.path)
    labels = training_frame['species']
    features = training_frame.drop('species', axis=1)

    # Scaling and the classifier are bundled into one estimator so that the
    # pickled object applies both steps at prediction time.
    scaler_step = ('scaler', StandardScaler())
    svc_step = ('svc', SVC(kernel='rbf', probability=True, random_state=42))
    classifier = Pipeline([scaler_step, svc_step])
    classifier.fit(features, labels)

    # The evaluation step looks for the file under this exact name.
    pickle_path = f"{model.path}.pkl"
    with open(pickle_path, 'wb') as handle:
        pickle.dump(classifier, handle)


In [22]:
@component(
    packages_to_install=[
        "pandas==2.2.2",
        "scikit-learn==1.5.0",
        "joblib==1.4.2",
        "gcsfs==2024.6.0",
        "fsspec",
    ],
    base_image="python:3.9",
)
def evaluate_svc_model(
    test_dataset: Input[Dataset],
    model: Input[Model],
    metrics: Output[Metrics],
) -> NamedTuple("Outputs", [("accuracy", float)]):
    """Evaluate the model, log its metrics and return the accuracy.

    Args:
        test_dataset: CSV artifact with the feature columns and a
            ``species`` target column.
        model: Model artifact; the pickled estimator is read from
            ``model.path`` with a ``.pkl`` suffix appended.
        metrics: Output artifact receiving the overall accuracy (in percent)
            and every entry of the classification report that is a dict.

    Returns:
        A one-element tuple holding the accuracy multiplied by 100.
    """
    import pickle

    import pandas as pd
    from sklearn.metrics import accuracy_score, classification_report

    print(f"model.path : {model.path}")
    pickle_path = f"{model.path}.pkl"
    print(f"file_name : {pickle_path}")

    # NOTE: pickle.load executes arbitrary code; only use it on files
    # written by a trusted producer.
    with open(pickle_path, 'rb') as handle:
        classifier = pickle.load(handle)

    holdout = pd.read_csv(test_dataset.path)
    y_true = holdout['species']
    y_pred = classifier.predict(holdout.drop('species', axis=1))

    accuracy = accuracy_score(y_true, y_pred)
    accuracy_pct = accuracy * 100.0
    metrics.log_metric("accuracy", accuracy_pct)
    print(f"Accuracy: {accuracy}")

    report = classification_report(y_true, y_pred, output_dict=True)
    # Scalar entries of the report are skipped; only dict-valued entries
    # are flattened into "<label>_<metric>" metric names.
    nested_entries = {
        label: values for label, values in report.items() if isinstance(values, dict)
    }
    for class_label, class_metrics in nested_entries.items():
        for metric_name, metric_value in class_metrics.items():
            metrics.log_metric(f"{class_label}_{metric_name}", metric_value)

    return (accuracy_pct,)

In [23]:

# --- Component 5: Model registration in the Vertex AI Model Registry ---
@component(
    base_image="python:3.9",
    packages_to_install=["pandas", "google-cloud-aiplatform==1.55.0", "gcsfs==2024.6.0", "fsspec", "pyarrow", "scikit-learn==1.5.0" ],
)
def register_model(
    model: Input[Model],
    project_id: str,
    region: str,
    model_display_name: str,
):
    """Register the model in the Vertex AI Model Registry.

    Args:
        model: Model artifact; the directory containing ``model.uri`` is
            uploaded as the model's artifact location.
        project_id: Google Cloud project used to initialise the SDK.
        region: Location used to initialise the SDK.
        model_display_name: Display name given to the uploaded model.
    """
    from google.cloud import aiplatform

    print(f"project_id : {project_id}")
    print(f"region : {region}")
    print(f"model : {model}")

    aiplatform.init(project=project_id, location=region)
    serving_container_image = "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-5:latest"

    # Upload the parent directory of the artifact URI, not the URI itself.
    # NOTE(review): this presumably relies on the training step having
    # written "<artifact>.pkl" into that same directory - confirm the file
    # name matches what the serving container expects.
    model_path = model.uri.rsplit('/', 1)[0]

    # Upload and register the model; sync=True waits for the upload to finish.
    registered_model = aiplatform.Model.upload(
        display_name=model_display_name,
        artifact_uri=model_path,
        serving_container_image_uri=serving_container_image,
        sync=True,
        labels={"model_type": "svc", "framework": "scikit-learn"},
    )
    print(f"Zarejestrowano model: {registered_model.resource_name}")

In [24]:


# --- Main Vertex AI pipeline definition ---
@kfp.dsl.pipeline(
    name="penguin-svc-classification-with-registry",
    description="Potok trenujący i rejestrujący model SVC.",
    pipeline_root="gs://vertex-ai-bucket-s25537",
)
def penguin_pipeline(
    gcs_data_path: str = "gs://data-s25537/penguins.csv",
    project_id: str = "mlops-on-gcp-s25537",
    region: str = "us-central1",
    model_name: str = "puffin",
    test_split_ratio: float = 0.3,
    min_accuracy_threshold: float = 95.0,
):
    """Wire the components into a workflow with conditional registration.

    Args:
        gcs_data_path: CSV location handed to ``get_data``.
        project_id: Project handed to ``register_model``.
        region: Location handed to ``register_model``.
        model_name: Display name handed to ``register_model``.
        test_split_ratio: Test fraction handed to ``preprocess_data``.
        min_accuracy_threshold: Minimum value of the evaluation step's
            ``accuracy`` output required for the model to be registered.
    """
    ingest_step = get_data(gcs_input_path=gcs_data_path)

    split_step = preprocess_data(
        input_data=ingest_step.outputs["input_data"],
        test_split_ratio=test_split_ratio,
    )

    fit_step = train_svc_model(
        train_dataset=split_step.outputs["train_dataset"],
    )
    trained_model = fit_step.outputs["model"]

    evaluate_step = evaluate_svc_model(
        test_dataset=split_step.outputs["test_dataset"],
        model=trained_model,
    )

    # Gate: register the model only when the accuracy is high enough.
    accuracy_is_sufficient = (
        evaluate_step.outputs["accuracy"] >= min_accuracy_threshold
    )
    with dsl.If(accuracy_is_sufficient, name="accuracy-check"):
        register_model(
            model=trained_model,
            project_id=project_id,
            region=region,
            model_display_name=model_name,
        )


In [25]:
# Single source of truth for the compiled pipeline definition file, which is
# written by the compiler and then read back by the PipelineJob.
PIPELINE_PACKAGE_PATH = "penguin_svc_pipeline_with_registry.json"

# Compile the pipeline function into a job template on local disk.
compiler.Compiler().compile(
    pipeline_func=penguin_pipeline,
    package_path=PIPELINE_PACKAGE_PATH,
)

aiplatform.init(project="mlops-on-gcp-s25537", location='us-central1')

# Configure the job from the freshly compiled template.
job = aiplatform.PipelineJob(
    template_path=PIPELINE_PACKAGE_PATH,
    display_name="penguin-svc-with-registry-run",
    pipeline_root="gs://vertex-ai-bucket-s25537",
    enable_caching=True,
)

print("Uruchamianie potoku w Vertex AI...")
# NOTE(review): run() appears to keep logging the job state while it waits
# (see the cell output) - confirm whether a non-blocking submit is intended.
job.run()
print("Potok został uruchomiony. Sprawdź postęp w konsoli Google Cloud.")

Uruchamianie potoku w Vertex AI...
Creating PipelineJob
PipelineJob created. Resource name: projects/739663413023/locations/us-central1/pipelineJobs/penguin-svc-classification-with-registry-20250705184555
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/739663413023/locations/us-central1/pipelineJobs/penguin-svc-classification-with-registry-20250705184555')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/penguin-svc-classification-with-registry-20250705184555?project=739663413023
PipelineJob projects/739663413023/locations/us-central1/pipelineJobs/penguin-svc-classification-with-registry-20250705184555 current state:
3
PipelineJob projects/739663413023/locations/us-central1/pipelineJobs/penguin-svc-classification-with-registry-20250705184555 current state:
3
PipelineJob projects/739663413023/locations/us-central1/pipelineJobs/penguin-svc-classification-with-registry-20250705184555 current 