diff --git a/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml b/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml
index d3d96a0e1..8445a8fd5 100644
--- a/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml
+++ b/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml
@@ -14,7 +14,7 @@ ipex:
   enabled: false
   precision: bf16
 model_description:
-  model_id_or_path: meta-llama/Llama-2-7b-chat-hf
-  tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf
+  model_id_or_path: NousResearch/Llama-2-7b-chat-hf
+  tokenizer_name_or_path: NousResearch/Llama-2-7b-chat-hf
   config:
     use_auth_token: ''
diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml
index 8b0347de1..e993011cb 100644
--- a/.github/workflows/workflow_finetune.yml
+++ b/.github/workflows/workflow_finetune.yml
@@ -11,10 +11,10 @@ on:
         default: '10.1.2.13:5000/llmray-build'
       http_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       https_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       runner_config_path:
         type: string
         default: '/home/ci/llm-ray-actions-runner'
@@ -34,7 +34,7 @@ jobs:
     name: finetune
     strategy:
       matrix:
-        model: [ EleutherAI/gpt-j-6b, meta-llama/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, meta-llama/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, google/gemma-2b]
+        model: [ EleutherAI/gpt-j-6b, NousResearch/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, NousResearch/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, google/gemma-2b]
         isPR:
           - ${{inputs.ci_type == 'pr'}}

@@ -42,7 +42,7 @@
         - { isPR: true }
       include:
         - { model: "EleutherAI/gpt-j-6b"}
-        - { model: "meta-llama/Llama-2-7b-chat-hf"}
+        - { model: "NousResearch/Llama-2-7b-chat-hf"}
         - { model: "mistralai/Mistral-7B-v0.1"}
         - { model: "google/gemma-2b"}

@@ -65,9 +65,6 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4

-      - name: Load environment variables
-        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
-
       - name: Build Docker Image
         run: |
           DF_SUFFIX=".cpu_and_deepspeed"
@@ -83,7 +80,7 @@
           model_cache_path=${{ inputs.model_cache_path }}
           USE_PROXY="1"
           source dev/scripts/ci-functions.sh
-          start_docker ${TARGET} ${code_checkout_path} ${model_cache_path} ${USE_PROXY} ${{env.HF_ACCESS_TOKEN}}
+          start_docker ${TARGET} ${code_checkout_path} ${model_cache_path} ${USE_PROXY}

       - name: Run Finetune Test
         run: |
diff --git a/.github/workflows/workflow_finetune_gpu.yml b/.github/workflows/workflow_finetune_gpu.yml
index 2114b66db..37e612324 100644
--- a/.github/workflows/workflow_finetune_gpu.yml
+++ b/.github/workflows/workflow_finetune_gpu.yml
@@ -8,17 +8,17 @@ on:
         default: '10.1.2.13:5000/llmray-build'
       http_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       https_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'

 jobs:
   finetune-gpu:
     name: finetune-gpu
     strategy:
       matrix:
-        model: [ meta-llama/Llama-2-7b-chat-hf ]
+        model: [ NousResearch/Llama-2-7b-chat-hf ]
     runs-on: self-hosted

     defaults:
diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml
index 61f458bcd..ca57affac 100644
--- a/.github/workflows/workflow_inference.yml
+++ b/.github/workflows/workflow_inference.yml
@@ -11,10 +11,10 @@ on:
         default: '10.1.2.13:5000/llmray-build'
       http_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       https_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       runner_config_path:
         type: string
         default: '/home/ci/llm-ray-actions-runner'
@@ -67,9 +67,6 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4

-      - name: Load environment variables
-        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
-
       - name: Determine Target
         id: "target"
         run: |
@@ -94,7 +91,7 @@
           model_cache_path=${{ inputs.model_cache_path }}
           USE_PROXY="1"
           source dev/scripts/ci-functions.sh
-          start_docker ${TARGET} ${code_checkout_path} ${model_cache_path} ${USE_PROXY} ${{env.HF_ACCESS_TOKEN}}
+          start_docker ${TARGET} ${code_checkout_path} ${model_cache_path} ${USE_PROXY}

       - name: Start Ray Cluster
         run: |
diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index 588e8dab0..dedeb4154 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -73,9 +73,6 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4

-      - name: Load environment variables
-        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
-
       - name: Build Docker Image
         run: |
           DF_SUFFIX=".gaudi2"
@@ -98,7 +95,6 @@
           cid=$(docker ps -a -q --filter "name=${TARGET}")
           if [[ ! -z "$cid" ]]; then docker rm $cid; fi
           docker run -tid --name="${TARGET}" --hostname="${TARGET}-container" --runtime=habana -v /home/yizhong/Model-References:/root/Model-References -v ${{ inputs.code_checkout_path }}:/root/llm-on-ray -v ${{ inputs.model_cache_path }}:/root/.cache/huggingface/hub/ -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --cap-add sys_ptrace --net=host --ipc=host ${TARGET}:habana
-
       - name: Start Ray Cluster
         run: |
           TARGET=${{steps.target.outputs.target}}
@@ -117,7 +113,6 @@
           conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml"
           with open(conf_path, encoding="utf-8") as reader:
               result = yaml.load(reader, Loader=yaml.FullLoader)
-          result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
           with open(conf_path, 'w') as output:
               yaml.dump(result, output, sort_keys=False)
           EOF
@@ -128,7 +123,6 @@
           elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then
             docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml --keep_serve_terminal"
           elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
-            docker exec "${TARGET}" bash -c "huggingface-cli login --token ${{ env.HF_ACCESS_TOKEN }}"
             docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml --keep_serve_terminal"
           fi
           echo Streaming query:
diff --git a/.github/workflows/workflow_test_benchmark.yml b/.github/workflows/workflow_test_benchmark.yml
index 2f78c997d..ee8f7f43b 100644
--- a/.github/workflows/workflow_test_benchmark.yml
+++ b/.github/workflows/workflow_test_benchmark.yml
@@ -11,10 +11,10 @@ on:
         default: '10.1.2.13:5000/llmray-build'
       http_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       https_proxy:
         type: string
-        default: 'http://10.24.221.169:911'
+        default: 'http://10.24.221.169:912'
       runner_config_path:
         type: string
         default: '/home/ci/llm-ray-actions-runner'
diff --git a/dev/scripts/ci-functions.sh b/dev/scripts/ci-functions.sh
index ec790ffc7..738154fd8 100644
--- a/dev/scripts/ci-functions.sh
+++ b/dev/scripts/ci-functions.sh
@@ -1,8 +1,8 @@
 #!/usr/bin/env bash
 set -eo pipefail

-HTTP_PROXY='http://10.24.221.169:911'
-HTTPS_PROXY='http://10.24.221.169:911'
+HTTP_PROXY='http://10.24.221.169:912'
+HTTPS_PROXY='http://10.24.221.169:912'
 MODEL_CACHE_PATH_LOACL='/root/.cache/huggingface/hub'
 CODE_CHECKOUT_PATH_LOCAL='/root/llm-on-ray'

@@ -39,7 +39,6 @@ start_docker() {
     local code_checkout_path=$2
     local model_cache_path=$3
    local USE_PROXY=$4
-    local HF_TOKEN=$5

     cid=$(docker ps -q --filter "name=${TARGET}")
     if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
@@ -66,12 +65,7 @@ start_docker() {
     fi

     echo "docker run -tid "${docker_args[@]}" "${TARGET}:latest""
-    docker run -tid "${docker_args[@]}" "${TARGET}:latest"
-    if [ -z "$HF_TOKEN" ]; then
-        echo "no hf token"
-    else
-        docker exec "${TARGET}" bash -c "huggingface-cli login --token ${HF_TOKEN}"
-    fi
+    docker run -tid "${docker_args[@]}" "${TARGET}:latest"
 }

 install_dependencies(){
diff --git a/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml
index ab411ff0e..4ecf45cd0 100644
--- a/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml
@@ -8,7 +8,7 @@ deepspeed: true
 workers_per_group: 8
 device: hpu
 model_description:
-  model_id_or_path: meta-llama/Llama-2-70b-chat-hf
-  tokenizer_name_or_path: meta-llama/Llama-2-70b-chat-hf
+  model_id_or_path: NousResearch/Llama-2-70b-chat-hf
+  tokenizer_name_or_path: NousResearch/Llama-2-70b-chat-hf
   config:
     use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml
index b7b19f02a..cb57f2768 100644
--- a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml
@@ -6,7 +6,7 @@ cpus_per_worker: 8
 hpus_per_worker: 1
 device: hpu
 model_description:
-  model_id_or_path: meta-llama/Llama-2-7b-chat-hf
-  tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf
+  model_id_or_path: NousResearch/Llama-2-7b-chat-hf
+  tokenizer_name_or_path: NousResearch/Llama-2-7b-chat-hf
   config:
     use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml
index 869f41d7a..a9d10ccbd 100644
--- a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml
@@ -16,7 +16,7 @@ ipex:
   enabled: false
   precision: bf16
 model_description:
-  model_id_or_path: meta-llama/Llama-2-7b-chat-hf
-  tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf
+  model_id_or_path: NousResearch/Llama-2-7b-chat-hf
+  tokenizer_name_or_path: NousResearch/Llama-2-7b-chat-hf
   config:
     use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml
index 32cf9bb4e..eb3bab468 100644
--- a/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml
@@ -7,7 +7,7 @@ deepspeed: true
 workers_per_group: 8
 device: hpu
 model_description:
-  model_id_or_path: meta-llama/Meta-Llama-3-70b-Instruct
-  tokenizer_name_or_path: meta-llama/Meta-Llama-3-70b-Instruct
+  model_id_or_path: NousResearch/Meta-Llama-3-70B-Instruct
+  tokenizer_name_or_path: NousResearch/Meta-Llama-3-70B-Instruct
   config:
     use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml
index d57ffcc22..3789ab984 100644
--- a/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml
@@ -6,7 +6,7 @@ cpus_per_worker: 8
 hpus_per_worker: 1
 device: hpu
 model_description:
-  model_id_or_path: meta-llama/Meta-Llama-3-8b-Instruct
-  tokenizer_name_or_path: meta-llama/Meta-Llama-3-8b-Instruct
+  model_id_or_path: NousResearch/Meta-Llama-3-8B-Instruct
+  tokenizer_name_or_path: NousResearch/Meta-Llama-3-8B-Instruct
   config:
     use_auth_token: ''
diff --git a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml
index 1f648d857..81cb74d98 100644
--- a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml
+++ b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml
@@ -12,7 +12,7 @@ ipex:
   enabled: false
   precision: bf16
 model_description:
-  model_id_or_path: meta-llama/Llama-2-7b-chat-hf
-  tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf
+  model_id_or_path: NousResearch/Llama-2-7b-chat-hf
+  tokenizer_name_or_path: NousResearch/Llama-2-7b-chat-hf
   config:
     use_auth_token: ''
diff --git a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm-autoscaling.yaml b/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm-autoscaling.yaml
index 207466a63..ba32990a6 100644
--- a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm-autoscaling.yaml
+++ b/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm-autoscaling.yaml
@@ -22,7 +22,7 @@ ipex:
   enabled: false
   precision: bf16
 model_description:
-  model_id_or_path: meta-llama/Llama-2-7b-chat-hf
-  tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf
+  model_id_or_path: NousResearch/Llama-2-7b-chat-hf
+  tokenizer_name_or_path: NousResearch/Llama-2-7b-chat-hf
   config:
     use_auth_token: ''
diff --git a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml b/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml
index 5db264c9e..29d562aa9 100644
--- a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml
+++ b/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml
@@ -15,7 +15,7 @@ ipex:
   enabled: false
   precision: bf16
 model_description:
-  model_id_or_path: meta-llama/Llama-2-7b-chat-hf
-  tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf
+  model_id_or_path: NousResearch/Llama-2-7b-chat-hf
+  tokenizer_name_or_path: NousResearch/Llama-2-7b-chat-hf
   config:
     use_auth_token: ''
diff --git a/llm_on_ray/inference/predictors/deepspeed_predictor.py b/llm_on_ray/inference/predictors/deepspeed_predictor.py
index e35fedbf0..2508286a7 100644
--- a/llm_on_ray/inference/predictors/deepspeed_predictor.py
+++ b/llm_on_ray/inference/predictors/deepspeed_predictor.py
@@ -53,11 +53,15 @@ def __init__(self, infer_conf: InferenceConfig, pad_token_id, stopping_criteria)
         model_desc = infer_conf.model_description
         model_config = model_desc.config

+        if infer_conf.model_description.config.use_auth_token:
+            auth_token = infer_conf.model_description.config.use_auth_token
+        else:
+            auth_token = None
         hf_config = AutoConfig.from_pretrained(
             model_desc.model_id_or_path,
             torchscript=True,
             trust_remote_code=model_config.trust_remote_code,
-            use_auth_token=infer_conf.model_description.config.use_auth_token,
+            use_auth_token=auth_token,
         )

         # decide correct torch type for loading HF model
@@ -75,7 +79,7 @@ def __init__(self, infer_conf: InferenceConfig, pad_token_id, stopping_criteria)
             self.model = PeftModel.from_pretrained(
                 self.model,
                 model_desc.peft_model_id_or_path,
-                use_auth_token=infer_conf.model_description.config.use_auth_token,
+                use_auth_token=auth_token,
             )
             self.model = self.model.merge_and_unload()

diff --git a/llm_on_ray/inference/predictors/hpu_predictor.py b/llm_on_ray/inference/predictors/hpu_predictor.py
index 4710e0bf9..5e19c8733 100644
--- a/llm_on_ray/inference/predictors/hpu_predictor.py
+++ b/llm_on_ray/inference/predictors/hpu_predictor.py
@@ -314,11 +314,15 @@ def load_model(self):
                 model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype)

             checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w")
+            if model_desc.config.use_auth_token:
+                auth_token = model_desc.config.use_auth_token
+            else:
+                auth_token = None
             write_checkpoints_json(
                 model_desc.model_id_or_path,
                 self.local_rank,
                 checkpoints_json,
-                token=model_desc.config.use_auth_token,
+                token=auth_token,
             )
         else:
             with deepspeed.OnDevice(dtype=model_dtype, device="cpu"):
diff --git a/llm_on_ray/inference/predictors/transformer_predictor.py b/llm_on_ray/inference/predictors/transformer_predictor.py
index 2e51ab6a8..a840ee0ff 100644
--- a/llm_on_ray/inference/predictors/transformer_predictor.py
+++ b/llm_on_ray/inference/predictors/transformer_predictor.py
@@ -37,11 +37,15 @@ def __init__(self, infer_conf: InferenceConfig):
         super().__init__(infer_conf)
         model_desc = infer_conf.model_description
         model_config = model_desc.config
+        if infer_conf.model_description.config.use_auth_token:
+            auth_token = infer_conf.model_description.config.use_auth_token
+        else:
+            auth_token = None
         hf_config = AutoConfig.from_pretrained(
             model_desc.model_id_or_path,
             torchscript=True,
             trust_remote_code=model_config.trust_remote_code,
-            use_auth_token=infer_conf.model_description.config.use_auth_token,
+            use_auth_token=auth_token,
         )

         # decide correct torch type for loading HF model
@@ -74,7 +78,7 @@ def __init__(self, infer_conf: InferenceConfig):
             model = PeftModel.from_pretrained(
                 model,
                 model_desc.peft_model_id_or_path,
-                use_auth_token=infer_conf.model_description.config.use_auth_token,
+                use_auth_token=auth_token,
             )
             model = model.merge_and_unload()
diff --git a/pyproject.toml b/pyproject.toml
index 6a3c44685..5a8e89306 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ classifiers = [
 dependencies = [
     "accelerate",
     "datasets>=2.14.6",
-    "numpy",
+    "numpy<2.0.0",
     "ray>=2.10",
     "ray[serve,tune]>=2.10",
     "typing>=3.7.4.3",
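Note on the predictor hunks above: they normalize an empty `use_auth_token` string (the `''` default in the YAML configs) to `None` before it reaches the Hugging Face loaders, so no bogus empty token is passed now that the HF login steps are gone. Below is a minimal standalone sketch of that pattern, assuming only `transformers`; the `ModelConfig` dataclass and `load_hf_config` helper are illustrative stand-ins, not code from this repository.

```python
# Illustrative sketch (not part of the patch): map an empty use_auth_token
# value to None before calling a Hugging Face loader.
from dataclasses import dataclass
from typing import Optional

from transformers import AutoConfig


@dataclass
class ModelConfig:
    # Hypothetical stand-in for the repo's model_description.config
    use_auth_token: str = ""
    trust_remote_code: bool = False


def resolve_auth_token(config: ModelConfig) -> Optional[str]:
    # An empty string is falsy, so it becomes None and the loader falls back
    # to anonymous access or any token cached by `huggingface-cli login`.
    return config.use_auth_token if config.use_auth_token else None


def load_hf_config(model_id: str, config: ModelConfig) -> AutoConfig:
    # Mirrors the call pattern used in the predictor changes above.
    return AutoConfig.from_pretrained(
        model_id,
        torchscript=True,
        trust_remote_code=config.trust_remote_code,
        use_auth_token=resolve_auth_token(config),
    )


if __name__ == "__main__":
    cfg = ModelConfig(use_auth_token="")  # '' as in the YAML configs above
    print(resolve_auth_token(cfg))  # -> None
```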