Skip to content

Commit

Permalink
Migrate additional examples from xgboost-operator (#1461)
Browse files Browse the repository at this point in the history
Signed-off-by: terrytangyuan <terrytangyuan@gmail.com>
  • Loading branch information
terrytangyuan committed Nov 9, 2021
1 parent 06f4325 commit c41956d
Show file tree
Hide file tree
Showing 29 changed files with 3,304 additions and 0 deletions.
44 changes: 44 additions & 0 deletions examples/xgboost/lightgbm-dist/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Image for distributed LightGBM training under the Kubeflow XGBoost operator.
# NOTE(review): ubuntu:16.04 is end-of-life; consider a newer base image.
FROM ubuntu:16.04

# Miniconda is installed here and put on PATH for all later layers.
ARG CONDA_DIR=/opt/conda
ENV PATH $CONDA_DIR/bin:$PATH

# Single layer: build toolchain + Miniconda + LightGBM (CLI binary and the
# python package), then clean caches/sources to keep the image small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    ca-certificates \
    cmake \
    build-essential \
    gcc \
    g++ \
    git \
    curl && \
    # python environment
    curl -sL https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -o conda.sh && \
    /bin/bash conda.sh -f -b -p $CONDA_DIR && \
    export PATH="$CONDA_DIR/bin:$PATH" && \
    conda config --set always_yes yes --set changeps1 no && \
    # lightgbm
    conda install -q -y numpy==1.20.3 scipy==1.6.2 scikit-learn==0.24.2 pandas==1.3.0 && \
    git clone --recursive --branch stable --depth 1 https://github.com/Microsoft/LightGBM && \
    mkdir LightGBM/build && \
    cd LightGBM/build && \
    cmake .. && \
    make -j4 && \
    make install && \
    cd ../python-package && \
    python setup.py install_lib && \
    # clean
    apt-get autoremove -y && apt-get clean && \
    conda clean -a -y && \
    rm -rf /usr/local/src/* && \
    rm -rf /LightGBM

WORKDIR /app

# Download the example data
# (binary classification sample from LightGBM's parallel_learning example)
RUN mkdir data
ADD https://raw.githubusercontent.com/microsoft/LightGBM/stable/examples/parallel_learning/binary.train data/.
ADD https://raw.githubusercontent.com/microsoft/LightGBM/stable/examples/parallel_learning/binary.test data/.
# main.py / train.py / utils.py from the build context.
COPY *.py ./

ENTRYPOINT [ "python", "/app/main.py" ]
210 changes: 210 additions & 0 deletions examples/xgboost/lightgbm-dist/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
### Distributed LightGBM Training Job

This folder contains the Dockerfile and Python scripts to run a distributed LightGBM training job using the XGBoost operator.
The code is based on this [example](https://github.com/microsoft/LightGBM/tree/master/examples/parallel_learning) in the official GitHub repository of the library.


**Build image**

The default image name and tag are `kubeflow/lightgbm-dist-py-test` and `1.0` respectively.

```shell
docker build -f Dockerfile -t kubeflow/lightgbm-dist-py-test:1.0 ./
```

**Start the training**

```
kubectl create -f xgboostjob_v1_lightgbm_dist_training.yaml
```

**Look at the job status**
```
kubectl get -o yaml XGBoostJob/lightgbm-dist-train-test
```
Here is a sample of the output when the job is running:

```
apiVersion: xgboostjob.kubeflow.org/v1
kind: XGBoostJob
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"xgboostjob.kubeflow.org/v1","kind":"XGBoostJob","metadata":{"annotations":{},"name":"lightgbm-dist-train-test","namespace":"default"},"spec":{"xgbReplicaSpecs":{"Master":{"replicas":1,"restartPolicy":"Never","template":{"apiVersion":"v1","kind":"Pod","spec":{"containers":[{"args":["--job_type=Train","--boosting_type=gbdt","--objective=binary","--metric=binary_logloss,auc","--metric_freq=1","--is_training_metric=true","--max_bin=255","--data=data/binary.train","--valid_data=data/binary.test","--num_trees=100","--learning_rate=01","--num_leaves=63","--tree_learner=feature","--feature_fraction=0.8","--bagging_freq=5","--bagging_fraction=0.8","--min_data_in_leaf=50","--min_sum_hessian_in_leaf=50","--is_enable_sparse=true","--use_two_round_loading=false","--is_save_binary_file=false"],"image":"kubeflow/lightgbm-dist-py-test:1.0","imagePullPolicy":"Never","name":"xgboostjob","ports":[{"containerPort":9991,"name":"xgboostjob-port"}]}]}}},"Worker":{"replicas":2,"restartPolicy":"ExitCode","template":{"apiVersion":"v1","kind":"Pod","spec":{"containers":[{"args":["--job_type=Train","--boosting_type=gbdt","--objective=binary","--metric=binary_logloss,auc","--metric_freq=1","--is_training_metric=true","--max_bin=255","--data=data/binary.train","--valid_data=data/binary.test","--num_trees=100","--learning_rate=01","--num_leaves=63","--tree_learner=feature","--feature_fraction=0.8","--bagging_freq=5","--bagging_fraction=0.8","--min_data_in_leaf=50","--min_sum_hessian_in_leaf=50","--is_enable_sparse=true","--use_two_round_loading=false","--is_save_binary_file=false"],"image":"kubeflow/lightgbm-dist-py-test:1.0","imagePullPolicy":"Never","name":"xgboostjob","ports":[{"containerPort":9991,"name":"xgboostjob-port"}]}]}}}}}}
creationTimestamp: "2020-10-14T15:31:23Z"
generation: 7
managedFields:
- apiVersion: xgboostjob.kubeflow.org/v1
fieldsType: FieldsV1
fieldsV1:
f:metadata:
f:annotations:
.: {}
f:kubectl.kubernetes.io/last-applied-configuration: {}
f:spec:
.: {}
f:xgbReplicaSpecs:
.: {}
f:Master:
.: {}
f:replicas: {}
f:restartPolicy: {}
f:template:
.: {}
f:spec: {}
f:Worker:
.: {}
f:replicas: {}
f:restartPolicy: {}
f:template:
.: {}
f:spec: {}
manager: kubectl-client-side-apply
operation: Update
time: "2020-10-14T15:31:23Z"
- apiVersion: xgboostjob.kubeflow.org/v1
fieldsType: FieldsV1
fieldsV1:
f:spec:
f:RunPolicy:
.: {}
f:cleanPodPolicy: {}
f:xgbReplicaSpecs:
f:Master:
f:template:
f:metadata:
.: {}
f:creationTimestamp: {}
f:spec:
f:containers: {}
f:Worker:
f:template:
f:metadata:
.: {}
f:creationTimestamp: {}
f:spec:
f:containers: {}
f:status:
.: {}
f:completionTime: {}
f:conditions: {}
f:replicaStatuses:
.: {}
f:Master:
.: {}
f:succeeded: {}
f:Worker:
.: {}
f:succeeded: {}
manager: main
operation: Update
time: "2020-10-14T15:34:44Z"
name: lightgbm-dist-train-test
namespace: default
resourceVersion: "38923"
selfLink: /apis/xgboostjob.kubeflow.org/v1/namespaces/default/xgboostjobs/lightgbm-dist-train-test
uid: b2b887d0-445b-498b-8852-26c8edc98dc7
spec:
RunPolicy:
cleanPodPolicy: None
xgbReplicaSpecs:
Master:
replicas: 1
restartPolicy: Never
template:
metadata:
creationTimestamp: null
spec:
containers:
- args:
- --job_type=Train
- --boosting_type=gbdt
- --objective=binary
- --metric=binary_logloss,auc
- --metric_freq=1
- --is_training_metric=true
- --max_bin=255
- --data=data/binary.train
- --valid_data=data/binary.test
- --num_trees=100
- --learning_rate=01
- --num_leaves=63
- --tree_learner=feature
- --feature_fraction=0.8
- --bagging_freq=5
- --bagging_fraction=0.8
- --min_data_in_leaf=50
- --min_sum_hessian_in_leaf=50
- --is_enable_sparse=true
- --use_two_round_loading=false
- --is_save_binary_file=false
image: kubeflow/lightgbm-dist-py-test:1.0
imagePullPolicy: Never
name: xgboostjob
ports:
- containerPort: 9991
name: xgboostjob-port
resources: {}
Worker:
replicas: 2
restartPolicy: ExitCode
template:
metadata:
creationTimestamp: null
spec:
containers:
- args:
- --job_type=Train
- --boosting_type=gbdt
- --objective=binary
- --metric=binary_logloss,auc
- --metric_freq=1
- --is_training_metric=true
- --max_bin=255
- --data=data/binary.train
- --valid_data=data/binary.test
- --num_trees=100
- --learning_rate=01
- --num_leaves=63
- --tree_learner=feature
- --feature_fraction=0.8
- --bagging_freq=5
- --bagging_fraction=0.8
- --min_data_in_leaf=50
- --min_sum_hessian_in_leaf=50
- --is_enable_sparse=true
- --use_two_round_loading=false
- --is_save_binary_file=false
image: kubeflow/lightgbm-dist-py-test:1.0
imagePullPolicy: Never
name: xgboostjob
ports:
- containerPort: 9991
name: xgboostjob-port
resources: {}
status:
completionTime: "2020-10-14T15:34:44Z"
conditions:
- lastTransitionTime: "2020-10-14T15:31:23Z"
lastUpdateTime: "2020-10-14T15:31:23Z"
message: xgboostJob lightgbm-dist-train-test is created.
reason: XGBoostJobCreated
status: "True"
type: Created
- lastTransitionTime: "2020-10-14T15:31:23Z"
lastUpdateTime: "2020-10-14T15:31:23Z"
message: XGBoostJob lightgbm-dist-train-test is running.
reason: XGBoostJobRunning
status: "False"
type: Running
- lastTransitionTime: "2020-10-14T15:34:44Z"
lastUpdateTime: "2020-10-14T15:34:44Z"
message: XGBoostJob lightgbm-dist-train-test is successfully completed.
reason: XGBoostJobSucceeded
status: "True"
type: Succeeded
replicaStatuses:
Master:
succeeded: 1
Worker:
succeeded: 2
```
80 changes: 80 additions & 0 deletions examples/xgboost/lightgbm-dist/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import logging
import argparse

from train import train

from utils import generate_machine_list_file, generate_train_conf_file


logger = logging.getLogger(__name__)


def main(args, extra_args):
    """Run one member (master or worker) of a distributed LightGBM job.

    The cluster topology is read from environment variables injected by the
    XGBoost operator: MASTER_ADDR, MASTER_PORT, WORKER_ADDRS, WORKER_PORT,
    WORLD_SIZE and RANK.

    Args:
        args: parsed namespace; ``args.job_type`` is "Train" or "Predict".
        extra_args: unrecognized CLI flags, forwarded verbatim into the
            generated LightGBM training config file.

    Raises:
        KeyError: if any required cluster environment variable is missing.
    """
    master_addr = os.environ["MASTER_ADDR"]
    master_port = os.environ["MASTER_PORT"]
    worker_addrs = os.environ["WORKER_ADDRS"]
    worker_port = os.environ["WORKER_PORT"]
    world_size = int(os.environ["WORLD_SIZE"])
    rank = int(os.environ["RANK"])

    logger.info(
        "extract cluster info from env variables \n"
        f"master_addr: {master_addr} \n"
        f"master_port: {master_port} \n"
        f"worker_addrs: {worker_addrs} \n"
        f"worker_port: {worker_port} \n"
        f"world_size: {world_size} \n"
        f"rank: {rank} \n"
    )

    if args.job_type == "Predict":
        # NOTE(review): prediction is not implemented yet; this branch only logs.
        logger.info("starting the predict job")

    elif args.job_type == "Train":
        logger.info("starting the train job")
        logger.info(f"extra args:\n {extra_args}")
        machine_list_filepath = generate_machine_list_file(
            master_addr, master_port, worker_addrs, worker_port
        )
        logger.info(f"machine list generated in: {machine_list_filepath}")
        # Master is rank 0 and listens on its own port; workers use theirs.
        local_port = worker_port if rank else master_port
        config_file = generate_train_conf_file(
            machine_list_file=machine_list_filepath,
            world_size=world_size,
            output_model="model.txt",
            local_port=local_port,
            extra_args=extra_args,
        )
        logger.info(f"config generated in: {config_file}")
        train(config_file)

    # Use the module logger consistently (the original mixed root-`logging`
    # calls with the module-level `logger`).
    logger.info("Finish distributed job")


if __name__ == "__main__":
    # Configure root logging before any records are emitted.
    logging.basicConfig(format="%(message)s")
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--job_type",
        required=True,
        choices=["Train", "Predict"],
        help="Job type to execute",
    )
    # Unknown flags are collected and forwarded into the LightGBM config.
    args, extra_args = parser.parse_known_args()
    main(args, extra_args)
26 changes: 26 additions & 0 deletions examples/xgboost/lightgbm-dist/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import logging
import subprocess

logger = logging.getLogger(__name__)


def train(train_config_filepath: str) -> None:
    """Run the LightGBM CLI with the given config file, streaming its output.

    Each stdout line from the ``lightgbm`` process is forwarded to the module
    logger so training progress shows up in the pod logs.

    Args:
        train_config_filepath: path to the generated LightGBM config file.

    Raises:
        FileNotFoundError: if the ``lightgbm`` binary is not on PATH.
        subprocess.CalledProcessError: if the training process exits non-zero.
    """
    cmd = ["lightgbm", f"config={train_config_filepath}"]
    # Context manager closes the pipe and waits on the child, so we never
    # leave a zombie process behind (the original never called wait()).
    with subprocess.Popen(cmd, stdout=subprocess.PIPE) as proc:
        for raw_line in proc.stdout:
            logger.info(raw_line.decode("utf-8").strip())
    # The original ignored the exit status, so a failed training run looked
    # successful; surface the failure to the caller instead.
    if proc.returncode != 0:
        raise subprocess.CalledProcessError(proc.returncode, cmd)
Loading

0 comments on commit c41956d

Please sign in to comment.