
Updated code snippets and screenshots to match the latest XGBoost sample code (#1431)

* Initial commit of XGBoost sample fixes.

* Fixes first screenshot.

* Updated some screenshots.

* Updated more screenshots.
sarahmaddox authored and k8s-ci-robot committed Dec 3, 2019
Below is an extract from the Python code that defines the pipeline. The full
code is available on
[GitHub](https://github.com/kubeflow/pipelines/tree/master/samples/core/xgboost_training_cm).

```python
@dsl.pipeline(
    name='XGBoost Trainer',
    description='A trainer that does end-to-end distributed training for XGBoost models.'
)
def xgb_train_pipeline(
    output='gs://your-gcs-bucket',
    project='your-gcp-project',
    cluster_name='xgb-%s' % dsl.RUN_ID_PLACEHOLDER,
    region='us-central1',
    train_data='gs://ml-pipeline-playground/sfpd/train.csv',
    eval_data='gs://ml-pipeline-playground/sfpd/eval.csv',
    schema='gs://ml-pipeline-playground/sfpd/schema.json',
    target='resolution',
    rounds=200,
    workers=2,
    true_label='ACTION',
):
    output_template = str(output) + '/' + dsl.RUN_ID_PLACEHOLDER + '/data'

    # The current GCP pyspark/spark ops do not provide outputs as return
    # values; instead, the output URIs are passed around as strings.
    analyze_output = output_template
    transform_output_train = os.path.join(output_template, 'train', 'part-*')
    transform_output_eval = os.path.join(output_template, 'eval', 'part-*')
    train_output = os.path.join(output_template, 'train_output')
    predict_output = os.path.join(output_template, 'predict_output')

    with dsl.ExitHandler(exit_op=dataproc_delete_cluster_op(
        project_id=project,
        region=region,
        name=cluster_name
    )):
        _create_cluster_op = dataproc_create_cluster_op(
            project_id=project,
            region=region,
            name=cluster_name,
            initialization_actions=[
                os.path.join(_PYSRC_PREFIX, 'initialization_actions.sh'),
            ],
            image_version='1.2'
        )

        _analyze_op = dataproc_analyze_op(
            project=project,
            region=region,
            cluster_name=cluster_name,
            schema=schema,
            train_data=train_data,
            output=output_template
        ).after(_create_cluster_op).set_display_name('Analyzer')

        _transform_op = dataproc_transform_op(
            project=project,
            region=region,
            cluster_name=cluster_name,
            train_data=train_data,
            eval_data=eval_data,
            target=target,
            analysis=analyze_output,
            output=output_template
        ).after(_analyze_op).set_display_name('Transformer')

        _train_op = dataproc_train_op(
            project=project,
            region=region,
            cluster_name=cluster_name,
            train_data=transform_output_train,
            eval_data=transform_output_eval,
            target=target,
            analysis=analyze_output,
            workers=workers,
            rounds=rounds,
            output=train_output
        ).after(_transform_op).set_display_name('Trainer')

        _predict_op = dataproc_predict_op(
            project=project,
            region=region,
            cluster_name=cluster_name,
            data=transform_output_eval,
            model=train_output,
            target=target,
            analysis=analyze_output,
            output=predict_output
        ).after(_train_op).set_display_name('Predictor')

        _cm_op = confusion_matrix_op(
            predictions=os.path.join(predict_output, 'part-*.csv'),
            output_dir=output_template
        ).after(_predict_op)

        _roc_op = roc_op(
            predictions_dir=os.path.join(predict_output, 'part-*.csv'),
            true_class=true_label,
            true_score_column=true_label,
            output_dir=output_template
        ).after(_predict_op)

    dsl.get_pipeline_conf().add_op_transformer(
        gcp.use_gcp_secret('user-gcp-sa'))
```
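The extract refers to names defined earlier in the sample file, such as the
`dataproc_*_op` components, the `_PYSRC_PREFIX` constant, and the `os`, `dsl`,
and `gcp` imports. As a rough sketch of how such Dataproc ops are typically
obtained with the `kfp` SDK (the component URL and the constant below are
illustrative stand-ins, not the sample's exact definitions):

```python
import os

from kfp import components, dsl, gcp

# Illustrative URL: the sample pins a specific release of this component
# definition; the master branch is referenced here only for demonstration.
dataproc_create_cluster_op = components.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/master/'
    'components/gcp/dataproc/create_cluster/component.yaml')

# Hypothetical stand-in for the sample's constant: a Cloud Storage prefix
# holding helper scripts such as initialization_actions.sh.
_PYSRC_PREFIX = 'gs://your-gcs-bucket/xgboost-helpers'
```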

### Pipeline input data on the Kubeflow Pipelines UI

The partial screenshot below shows the Kubeflow Pipelines UI for kicking off a
run of the pipeline. The pipeline definition in your code determines which
parameters appear in the UI form. The pipeline definition can also set default
values for the parameters:

<img src="/docs/images/pipelines-start-xgboost-run.png"
alt="Starting the XGBoost run on the pipelines UI"
class="mt-3 mb-3 border border-info rounded">
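
As a minimal sketch of that mapping (a hypothetical pipeline, not part of the
sample): each parameter of the decorated function becomes a field in the run
form, and a Python default value pre-populates that field.

```python
from kfp import dsl

@dsl.pipeline(
    name='Form Demo',
    description='Hypothetical pipeline showing how parameters map to the UI.'
)
def form_demo_pipeline(
    output,        # no default: the form field starts empty
    rounds=200,    # default: the form field is pre-filled with 200
):
    # Body omitted; a real pipeline would create ops here.
    pass
```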

You must supply values for the `output` and `project` parameters when you
start a run, because their defaults are only placeholders. The sample supplies
the values for the other parameters:

* region: The GCP geographical region in which the training and evaluation
  data is stored.
* train-data: Cloud Storage path to the training data.
* eval-data: Cloud Storage path to the evaluation data.
* schema: Cloud Storage path to a JSON file describing the format of the
  training and evaluation data (see the illustrative sketch after this list).
* target: Column to be used as the prediction target.
* rounds: The number of rounds for the XGBoost training.
* workers: The number of workers used for distributed XGBoost training.
* true-label: Column to be used for text representation of the label output
by the model.
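
For orientation only, here is a hypothetical sketch of the general shape such
a schema file can take: a JSON list of column descriptors. The column names
and types below are invented; consult the sample's actual `schema.json` for
the real format.

```python
import json

# Hypothetical schema: each entry names a CSV column and its type.
schema = [
    {'name': 'ACTION', 'type': 'CATEGORY'},
    {'name': 'resolution', 'type': 'CATEGORY'},
    {'name': 'address', 'type': 'TEXT'},
]

with open('schema.json', 'w') as f:
    json.dump(schema, f, indent=2)
```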

The following partial screenshot shows the run parameters, including the
two parameters that you must supply:
<img src="/docs/images/pipelines-start-xgboost-run.png"
alt="Starting the XGBoost run on the pipelines UI"
class="mt-3 mb-3 border border-info rounded">

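As an alternative to the UI form, you can start a run from the `kfp` SDK and
supply the same two parameters there. A minimal sketch, assuming the sample's
pipeline function is importable and that `<your-kfp-endpoint>` is a
placeholder for your Kubeflow Pipelines API endpoint:

```python
import kfp

# Hypothetical endpoint; the host depends on your Kubeflow deployment.
client = kfp.Client(host='<your-kfp-endpoint>')

client.create_run_from_pipeline_func(
    xgb_train_pipeline,
    arguments={
        'output': 'gs://your-gcs-bucket',   # must supply: pipeline outputs
        'project': 'your-gcp-project',      # must supply: GCP project ID
    })
```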