In [None]:
from clearml import Dataset, PipelineController

# Pipeline controller for the "Amazon reviews" project.
pipe = PipelineController(
    project="Amazon reviews",
    name="Pipeline Controller",
    version="1.0.0",
)

# Pipeline-level parameters, registered in declaration order. Each mapping is
# forwarded unchanged as keyword arguments to ``pipe.add_parameter``.
_PIPELINE_PARAMETERS = (
    {
        "name": "dataset_name",
        "description": "ClearML dataset name",
        "default": "Amazon reviews dataset",
    },
    {
        "name": "dataset_project",
        "description": "ClearML project",
        "default": "Amazon reviews",
    },
    {
        "name": "dataset_version",
        "description": "ClearML dataset version",
        "default": "1.2",
    },
    {
        "name": "test_size",
        "description": "Test ratio size",
        "default": 0.2,
        "param_type": "float",
    },
    {
        "name": "random_state",
        "description": "Random state",
        "default": 42,
        "param_type": "int",
    },
)
for _parameter in _PIPELINE_PARAMETERS:
    pipe.add_parameter(**_parameter)

from pathlib import Path

import pandas as pd
import polars as pl
from sklearn.model_selection import train_test_split


def dataset_train_test_split(
    dataset_name, dataset_project, dataset_version, test_size, random_state
):
    """Split a ClearML dataset into train/test CSV files and upload them.

    Fetches the requested dataset version, reads every file in its local copy
    as CSV, concatenates the rows, splits them with scikit-learn's
    ``train_test_split`` and writes ``train.csv`` / ``test.csv`` under
    ``data/prepared/split``. The split directory is then added to the dataset
    version ``"<dataset_version>.1"``, uploaded and finalized.

    Args:
        dataset_name: Name of the ClearML dataset to read.
        dataset_project: ClearML project that holds the dataset.
        dataset_version: Version of the source dataset; the prepared data is
            stored under this version with ``".1"`` appended.
        test_size: Forwarded to ``train_test_split`` as ``test_size``. A
            string value is converted with ``float`` first.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.
            A string value is converted with ``int`` first.

    Returns:
        A ``(train, test)`` tuple of pandas DataFrames.
    """
    # Imports live inside the function so it stays self-contained when
    # ClearML packages it as a standalone pipeline step.
    from pathlib import Path

    import polars as pl
    from clearml import Dataset
    from sklearn.model_selection import train_test_split

    # NOTE(review): values resolved from "${pipeline.*}" placeholders may
    # arrive as strings; coerce only in that case so numeric callers are
    # unaffected - confirm against the ClearML version in use.
    if isinstance(test_size, str):
        test_size = float(test_size)
    if isinstance(random_state, str):
        random_state = int(random_state)

    source_dataset = Dataset.get(
        dataset_name=dataset_name,
        dataset_project=dataset_project,
        dataset_version=dataset_version,
    )
    dataset_path = Path(source_dataset.get_local_copy())

    # Sort the files: iterdir() order is arbitrary, and the row order feeds
    # the seeded split, so an unstable order would make it irreproducible.
    frames = pl.concat(
        [pl.read_csv(data_file) for data_file in sorted(dataset_path.iterdir())]
    )
    train, test = train_test_split(
        frames.to_pandas(), test_size=test_size, random_state=random_state
    )

    # Create the "split" directory itself (not just its parent); to_csv does
    # not create missing directories.
    split_path = Path("data/prepared/") / "split"
    split_path.mkdir(exist_ok=True, parents=True)
    train.to_csv(split_path / "train.csv")
    test.to_csv(split_path / "test.csv")

    # NOTE(review): this looks up an existing "<version>.1" dataset and adds
    # files to it, which presumably requires that version to exist and still
    # be writable - confirm whether Dataset.create(..., parent_datasets=...)
    # was intended instead.
    prepared_dataset = Dataset.get(
        dataset_name=dataset_name,
        dataset_project=dataset_project,
        dataset_version=f"{dataset_version}.1",
    )
    prepared_dataset.add_files(split_path)
    prepared_dataset.upload()
    prepared_dataset.finalize()
    return train, test


# Each function argument is bound to the pipeline parameter of the same name
# through a "${pipeline.<name>}" placeholder.
_split_step_kwargs = {
    "dataset_name": "${pipeline.dataset_name}",
    "dataset_project": "${pipeline.dataset_project}",
    "dataset_version": "${pipeline.dataset_version}",
    "test_size": "${pipeline.test_size}",
    "random_state": "${pipeline.random_state}",
}
# Names given to the two values returned by the step function, in order.
_split_step_returns = ["train_dataframe", "test_dataframe"]

# Register the train/test split as a pipeline step.
pipe.add_function_step(
    name="train_test_split",
    function=dataset_train_test_split,
    function_kwargs=_split_step_kwargs,
    function_return=_split_step_returns,
    cache_executed_step=True,
)

ClearML Task: created new task id=8e4954b9e976410cbb39742914f95ad7
ClearML results page: http://89.169.184.62:30080/projects/3ad78869b7994342ad2f93dba10a2825/experiments/8e4954b9e976410cbb39742914f95ad7/output/log
CLEARML-SERVER new package available: UPGRADE to v2.1.0 is recommended!
Release Notes:
### New Features
- New UI task creation options
- Support bash as well as python scripts
- Support file upload
- Add per project UI scalar view configuration ([clearml #1377](https://github.com/clearml/clearml/issues/1377))
- Add support for custom x-axis label in UI Task scalars
- Add global search bar to all UI pages
- Add filter to UI Model Endpoints table 
- Add clicking UI breadcrumbs project name of full-screen task opens the project’s task table ([clearml #1376](https://github.com/clearml/clearml/issues/1376))
- Improve UI task debug sample viewer:
- Zoom setting persists when navigating between samples ([clearml #1390](https://github.com/clearml/clearml/issues/1390))
- Zoom focuses 