Skip to content

Commit

Permalink
Migrate additional examples from xgboost-operator (#1461)
Browse files Browse the repository at this point in the history
Signed-off-by: terrytangyuan <terrytangyuan@gmail.com>
  • Loading branch information
terrytangyuan committed Nov 9, 2021
1 parent 06f4325 commit c41956d
Show file tree
Hide file tree
Showing 29 changed files with 3,304 additions and 0 deletions.
44 changes: 44 additions & 0 deletions examples/xgboost/lightgbm-dist/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Image for distributed LightGBM training under the Kubeflow XGBoost operator.
# NOTE(review): ubuntu:16.04 is end-of-life; consider a newer base image.
FROM ubuntu:16.04

# Miniconda is installed here and put on PATH for all later layers.
ARG CONDA_DIR=/opt/conda
ENV PATH $CONDA_DIR/bin:$PATH

# Single layer: build toolchain + Miniconda + LightGBM (CLI binary and the
# python package), then clean caches/sources to keep the image small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    ca-certificates \
    cmake \
    build-essential \
    gcc \
    g++ \
    git \
    curl && \
    # python environment
    curl -sL https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -o conda.sh && \
    /bin/bash conda.sh -f -b -p $CONDA_DIR && \
    export PATH="$CONDA_DIR/bin:$PATH" && \
    conda config --set always_yes yes --set changeps1 no && \
    # lightgbm
    conda install -q -y numpy==1.20.3 scipy==1.6.2 scikit-learn==0.24.2 pandas==1.3.0 && \
    git clone --recursive --branch stable --depth 1 https://github.com/Microsoft/LightGBM && \
    mkdir LightGBM/build && \
    cd LightGBM/build && \
    cmake .. && \
    make -j4 && \
    make install && \
    cd ../python-package && \
    python setup.py install_lib && \
    # clean
    apt-get autoremove -y && apt-get clean && \
    conda clean -a -y && \
    rm -rf /usr/local/src/* && \
    rm -rf /LightGBM

WORKDIR /app

# Download the example data
# (binary classification sample from LightGBM's parallel_learning example)
RUN mkdir data
ADD https://raw.githubusercontent.com/microsoft/LightGBM/stable/examples/parallel_learning/binary.train data/.
ADD https://raw.githubusercontent.com/microsoft/LightGBM/stable/examples/parallel_learning/binary.test data/.
# main.py / train.py / utils.py from the build context.
COPY *.py ./

ENTRYPOINT [ "python", "/app/main.py" ]
210 changes: 210 additions & 0 deletions examples/xgboost/lightgbm-dist/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
### Distributed LightGBM Training Job

This folder contains the Dockerfile and Python scripts to run a distributed LightGBM training job using the XGBoost operator.
The code is based on this [example](https://github.com/microsoft/LightGBM/tree/master/examples/parallel_learning) in the official GitHub repository of the library.


**Build image**

The default image name and tag are `kubeflow/lightgbm-dist-py-test` and `1.0` respectively.

```shell
docker build -f Dockerfile -t kubeflow/lightgbm-dist-py-test:1.0 ./
```

**Start the training**

```
kubectl create -f xgboostjob_v1_lightgbm_dist_training.yaml
```

**Look at the job status**
```
kubectl get -o yaml XGBoostJob/lightgbm-dist-train-test
```
Here is a sample of the output when the job is running:

```
apiVersion: xgboostjob.kubeflow.org/v1
kind: XGBoostJob
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"xgboostjob.kubeflow.org/v1","kind":"XGBoostJob","metadata":{"annotations":{},"name":"lightgbm-dist-train-test","namespace":"default"},"spec":{"xgbReplicaSpecs":{"Master":{"replicas":1,"restartPolicy":"Never","template":{"apiVersion":"v1","kind":"Pod","spec":{"containers":[{"args":["--job_type=Train","--boosting_type=gbdt","--objective=binary","--metric=binary_logloss,auc","--metric_freq=1","--is_training_metric=true","--max_bin=255","--data=data/binary.train","--valid_data=data/binary.test","--num_trees=100","--learning_rate=01","--num_leaves=63","--tree_learner=feature","--feature_fraction=0.8","--bagging_freq=5","--bagging_fraction=0.8","--min_data_in_leaf=50","--min_sum_hessian_in_leaf=50","--is_enable_sparse=true","--use_two_round_loading=false","--is_save_binary_file=false"],"image":"kubeflow/lightgbm-dist-py-test:1.0","imagePullPolicy":"Never","name":"xgboostjob","ports":[{"containerPort":9991,"name":"xgboostjob-port"}]}]}}},"Worker":{"replicas":2,"restartPolicy":"ExitCode","template":{"apiVersion":"v1","kind":"Pod","spec":{"containers":[{"args":["--job_type=Train","--boosting_type=gbdt","--objective=binary","--metric=binary_logloss,auc","--metric_freq=1","--is_training_metric=true","--max_bin=255","--data=data/binary.train","--valid_data=data/binary.test","--num_trees=100","--learning_rate=01","--num_leaves=63","--tree_learner=feature","--feature_fraction=0.8","--bagging_freq=5","--bagging_fraction=0.8","--min_data_in_leaf=50","--min_sum_hessian_in_leaf=50","--is_enable_sparse=true","--use_two_round_loading=false","--is_save_binary_file=false"],"image":"kubeflow/lightgbm-dist-py-test:1.0","imagePullPolicy":"Never","name":"xgboostjob","ports":[{"containerPort":9991,"name":"xgboostjob-port"}]}]}}}}}}
creationTimestamp: "2020-10-14T15:31:23Z"
generation: 7
managedFields:
- apiVersion: xgboostjob.kubeflow.org/v1
fieldsType: FieldsV1
fieldsV1:
f:metadata:
f:annotations:
.: {}
f:kubectl.kubernetes.io/last-applied-configuration: {}
f:spec:
.: {}
f:xgbReplicaSpecs:
.: {}
f:Master:
.: {}
f:replicas: {}
f:restartPolicy: {}
f:template:
.: {}
f:spec: {}
f:Worker:
.: {}
f:replicas: {}
f:restartPolicy: {}
f:template:
.: {}
f:spec: {}
manager: kubectl-client-side-apply
operation: Update
time: "2020-10-14T15:31:23Z"
- apiVersion: xgboostjob.kubeflow.org/v1
fieldsType: FieldsV1
fieldsV1:
f:spec:
f:RunPolicy:
.: {}
f:cleanPodPolicy: {}
f:xgbReplicaSpecs:
f:Master:
f:template:
f:metadata:
.: {}
f:creationTimestamp: {}
f:spec:
f:containers: {}
f:Worker:
f:template:
f:metadata:
.: {}
f:creationTimestamp: {}
f:spec:
f:containers: {}
f:status:
.: {}
f:completionTime: {}
f:conditions: {}
f:replicaStatuses:
.: {}
f:Master:
.: {}
f:succeeded: {}
f:Worker:
.: {}
f:succeeded: {}
manager: main
operation: Update
time: "2020-10-14T15:34:44Z"
name: lightgbm-dist-train-test
namespace: default
resourceVersion: "38923"
selfLink: /apis/xgboostjob.kubeflow.org/v1/namespaces/default/xgboostjobs/lightgbm-dist-train-test
uid: b2b887d0-445b-498b-8852-26c8edc98dc7
spec:
RunPolicy:
cleanPodPolicy: None
xgbReplicaSpecs:
Master:
replicas: 1
restartPolicy: Never
template:
metadata:
creationTimestamp: null
spec:
containers:
- args:
- --job_type=Train
- --boosting_type=gbdt
- --objective=binary
- --metric=binary_logloss,auc
- --metric_freq=1
- --is_training_metric=true
- --max_bin=255
- --data=data/binary.train
- --valid_data=data/binary.test
- --num_trees=100
- --learning_rate=01
- --num_leaves=63
- --tree_learner=feature
- --feature_fraction=0.8
- --bagging_freq=5
- --bagging_fraction=0.8
- --min_data_in_leaf=50
- --min_sum_hessian_in_leaf=50
- --is_enable_sparse=true
- --use_two_round_loading=false
- --is_save_binary_file=false
image: kubeflow/lightgbm-dist-py-test:1.0
imagePullPolicy: Never
name: xgboostjob
ports:
- containerPort: 9991
name: xgboostjob-port
resources: {}
Worker:
replicas: 2
restartPolicy: ExitCode
template:
metadata:
creationTimestamp: null
spec:
containers:
- args:
- --job_type=Train
- --boosting_type=gbdt
- --objective=binary
- --metric=binary_logloss,auc
- --metric_freq=1
- --is_training_metric=true
- --max_bin=255
- --data=data/binary.train
- --valid_data=data/binary.test
- --num_trees=100
- --learning_rate=01
- --num_leaves=63
- --tree_learner=feature
- --feature_fraction=0.8
- --bagging_freq=5
- --bagging_fraction=0.8
- --min_data_in_leaf=50
- --min_sum_hessian_in_leaf=50
- --is_enable_sparse=true
- --use_two_round_loading=false
- --is_save_binary_file=false
image: kubeflow/lightgbm-dist-py-test:1.0
imagePullPolicy: Never
name: xgboostjob
ports:
- containerPort: 9991
name: xgboostjob-port
resources: {}
status:
completionTime: "2020-10-14T15:34:44Z"
conditions:
- lastTransitionTime: "2020-10-14T15:31:23Z"
lastUpdateTime: "2020-10-14T15:31:23Z"
message: xgboostJob lightgbm-dist-train-test is created.
reason: XGBoostJobCreated
status: "True"
type: Created
- lastTransitionTime: "2020-10-14T15:31:23Z"
lastUpdateTime: "2020-10-14T15:31:23Z"
message: XGBoostJob lightgbm-dist-train-test is running.
reason: XGBoostJobRunning
status: "False"
type: Running
- lastTransitionTime: "2020-10-14T15:34:44Z"
lastUpdateTime: "2020-10-14T15:34:44Z"
message: XGBoostJob lightgbm-dist-train-test is successfully completed.
reason: XGBoostJobSucceeded
status: "True"
type: Succeeded
replicaStatuses:
Master:
succeeded: 1
Worker:
succeeded: 2
```
80 changes: 80 additions & 0 deletions examples/xgboost/lightgbm-dist/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import logging
import argparse

from train import train

from utils import generate_machine_list_file, generate_train_conf_file


logger = logging.getLogger(__name__)


def main(args, extra_args):
    """Run one member (master or worker) of a distributed LightGBM job.

    The cluster topology is read from environment variables injected by the
    XGBoost operator: MASTER_ADDR, MASTER_PORT, WORKER_ADDRS, WORKER_PORT,
    WORLD_SIZE and RANK.

    Args:
        args: parsed namespace; ``args.job_type`` is "Train" or "Predict".
        extra_args: unrecognized CLI flags, forwarded verbatim into the
            generated LightGBM training config file.

    Raises:
        KeyError: if any required cluster environment variable is missing.
    """
    master_addr = os.environ["MASTER_ADDR"]
    master_port = os.environ["MASTER_PORT"]
    worker_addrs = os.environ["WORKER_ADDRS"]
    worker_port = os.environ["WORKER_PORT"]
    world_size = int(os.environ["WORLD_SIZE"])
    rank = int(os.environ["RANK"])

    logger.info(
        "extract cluster info from env variables \n"
        f"master_addr: {master_addr} \n"
        f"master_port: {master_port} \n"
        f"worker_addrs: {worker_addrs} \n"
        f"worker_port: {worker_port} \n"
        f"world_size: {world_size} \n"
        f"rank: {rank} \n"
    )

    if args.job_type == "Predict":
        # NOTE(review): prediction is not implemented yet; this branch only logs.
        logger.info("starting the predict job")

    elif args.job_type == "Train":
        logger.info("starting the train job")
        logger.info(f"extra args:\n {extra_args}")
        machine_list_filepath = generate_machine_list_file(
            master_addr, master_port, worker_addrs, worker_port
        )
        logger.info(f"machine list generated in: {machine_list_filepath}")
        # Master is rank 0 and listens on its own port; workers use theirs.
        local_port = worker_port if rank else master_port
        config_file = generate_train_conf_file(
            machine_list_file=machine_list_filepath,
            world_size=world_size,
            output_model="model.txt",
            local_port=local_port,
            extra_args=extra_args,
        )
        logger.info(f"config generated in: {config_file}")
        train(config_file)

    # Use the module logger consistently (the original mixed root-`logging`
    # calls with the module-level `logger`).
    logger.info("Finish distributed job")


if __name__ == "__main__":
    # Configure root logging before any records are emitted.
    logging.basicConfig(format="%(message)s")
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--job_type",
        required=True,
        choices=["Train", "Predict"],
        help="Job type to execute",
    )
    # Unknown flags are collected and forwarded into the LightGBM config.
    args, extra_args = parser.parse_known_args()
    main(args, extra_args)
26 changes: 26 additions & 0 deletions examples/xgboost/lightgbm-dist/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import logging
import subprocess

logger = logging.getLogger(__name__)


def train(train_config_filepath: str) -> None:
    """Run the LightGBM CLI with the given config file, streaming its output.

    Each stdout line from the ``lightgbm`` process is forwarded to the module
    logger so training progress shows up in the pod logs.

    Args:
        train_config_filepath: path to the generated LightGBM config file.

    Raises:
        FileNotFoundError: if the ``lightgbm`` binary is not on PATH.
        subprocess.CalledProcessError: if the training process exits non-zero.
    """
    cmd = ["lightgbm", f"config={train_config_filepath}"]
    # Context manager closes the pipe and waits on the child, so we never
    # leave a zombie process behind (the original never called wait()).
    with subprocess.Popen(cmd, stdout=subprocess.PIPE) as proc:
        for raw_line in proc.stdout:
            logger.info(raw_line.decode("utf-8").strip())
    # The original ignored the exit status, so a failed training run looked
    # successful; surface the failure to the caller instead.
    if proc.returncode != 0:
        raise subprocess.CalledProcessError(proc.returncode, cmd)
Loading

0 comments on commit c41956d

Please sign in to comment.