Fix quantization tests (#29914)
* revert back to torch 2.1.1

* run test

* switch to torch 2.2.1

* update dockerfile

* fix awq tests

* fix test

* run quanto tests

* update tests

* split quantization tests

* fix

* fix again

* final fix

* fix report artifact

* build docker again

* Revert "build docker again"

This reverts commit 399a5f9.

* debug

* revert

* style

* new notification system

* testing notification

* rebuild docker

* fix_prev_ci_results

* typo

* remove warning

* fix typo

* fix artifact name

* debug

* issue fixed

* debug again

* fix

* fix time

* test notification with failing test

* typo

* issues again

* final fix ?

* run all quantization tests again

* remove name to clear space

* revert modification done on workflow

* fix

* build docker

* build only quant docker

* fix quantization ci

* fix

* fix report

* better quantization_matrix

* add print

* revert to the basic one
SunMarc authored and Ita Zaporozhets committed May 14, 2024
1 parent b361ff9 commit 2340f0d
Showing 7 changed files with 324 additions and 30 deletions.
36 changes: 28 additions & 8 deletions .github/workflows/self-scheduled.yml
@@ -33,7 +33,6 @@ env:

jobs:
setup:
if: ${{ inputs.job == 'run_tests_gpu' }}
name: Setup
strategy:
matrix:
@@ -45,6 +44,7 @@ jobs:
outputs:
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
steps:
- name: Update clone
working-directory: /transformers
@@ -63,11 +63,19 @@ jobs:
run: pip freeze

- id: set-matrix
if: ${{ inputs.job == 'run_tests_gpu' }}
name: Identify models to test
working-directory: /transformers/tests
run: |
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
- id: set-matrix-quantization
if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }}
name: Identify quantization method to test
working-directory: /transformers/tests
run: |
echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT
- name: NVIDIA-SMI
run: |
@@ -303,16 +311,26 @@ jobs:

run_tests_quantization_torch_gpu:
if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }}
name: Quantization tests
name: " "
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
machine_type: [single-gpu, multi-gpu]
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
container:
image: huggingface/transformers-quantization-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}
@@ -337,19 +355,19 @@ jobs:
- name: Run quantization tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_quantization_torch_gpu tests/quantization
python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_quantization_torch_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu/failures_short.txt
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu_${{ matrix.folders }}/failures_short.txt

- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu"
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu_${{ env.matrix_folders }}"
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
name: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu
path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu
name: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu_${{ env.matrix_folders }}
path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu_${{ matrix.folders }}

run_extract_warnings:
# Let's only do this for the job `run_tests_gpu` to simplify the (already complex) logic.
@@ -413,4 +431,6 @@ jobs:
slack_report_channel: ${{ inputs.slack_report_channel }}
# This would be an empty string if `setup` is skipped.
folder_slices: ${{ needs.setup.outputs.folder_slices }}
secrets: inherit
quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}

secrets: inherit
25 changes: 24 additions & 1 deletion .github/workflows/slack-report.yml
@@ -15,6 +15,9 @@ on:
folder_slices:
required: true
type: string
quantization_matrix:
required: true
type: string


jobs:
@@ -32,6 +35,7 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/download-artifact@v3
- name: Send message to Slack
if: ${{ inputs.job != 'run_tests_quantization_torch_gpu' }}
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
@@ -53,7 +57,26 @@ jobs:
pip install slack_sdk
pip show slack_sdk
python utils/notification_service.py "${{ inputs.folder_slices }}"
- uses: actions/checkout@v3
- uses: actions/download-artifact@v3
- name: Send message to Slack for quantization workflow
if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }}
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
CI_EVENT: scheduled
CI_SHA: ${{ github.sha }}
SETUP_STATUS: ${{ inputs.setup_status }}
# We pass `needs.setup.outputs.quantization_matrix` as the argument. A processing in `notification_service_quantization.py` to change
# `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`.
run: |
sudo apt-get install -y curl
pip install slack_sdk
pip show slack_sdk
python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}"
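A sketch of how the script can consume that argument; the real `utils/notification_service_quantization.py` is not shown in this diff, so the parsing below is an assumption based on how the matrix is printed in the setup job:

```python
import ast
import sys

# The matrix arrives as one string, e.g. "['quantization/autoawq', 'quantization/bnb']",
# produced by `print(d)` in the setup job, so it parses as a Python literal.
quantization_matrix = ast.literal_eval(sys.argv[1])

# Artifact names use '_' instead of '/': convert before locating each report.
artifact_names = [f.replace("quantization/", "quantization_", 1) for f in quantization_matrix]
print(artifact_names)
```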
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
- name: Failure table artifacts
# Only the model testing job is concerned for this step
8 changes: 6 additions & 2 deletions docker/transformers-quantization-latest-gpu/Dockerfile
@@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).

ARG PYTORCH='2.2.0'
ARG PYTORCH='2.2.1'
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu118'

@@ -30,6 +30,9 @@ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]

RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

# needed in bnb and awq
RUN python3 -m pip install --no-cache-dir einops

# Add bitsandbytes for mixed int8 testing
RUN python3 -m pip install --no-cache-dir bitsandbytes

@@ -43,7 +46,8 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/opt
RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2

# Add autoawq for quantization testing
RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.0/autoawq-0.2.0+cu118-cp38-cp38-linux_x86_64.whl
# >=v0.2.3 needed for compatibility with torch 2.2.1
RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp38-cp38-linux_x86_64.whl

# Add quanto for quantization testing
RUN python3 -m pip install --no-cache-dir quanto
2 changes: 1 addition & 1 deletion src/transformers/utils/quantization_config.py
@@ -789,7 +789,7 @@ def post_init(self):

def get_loading_attributes(self):
attibutes_dict = copy.deepcopy(self.__dict__)
loading_attibutes = ["version", "do_fuse", "modules_to_fuse", "fuse_max_seq_len"]
loading_attibutes = ["version", "do_fuse", "modules_to_fuse", "fuse_max_seq_len", "exllama_config"]
loading_attibutes_dict = {i: j for i, j in attibutes_dict.items() if i in loading_attibutes}
return loading_attibutes_dict

30 changes: 14 additions & 16 deletions tests/quantization/autoawq/test_awq.py
@@ -101,7 +101,7 @@ class AwqTest(unittest.TestCase):

EXPECTED_OUTPUT = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish"
EXPECTED_OUTPUT_BF16 = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Exercise and Sport Science with a"

EXPECTED_OUTPUT_EXLLAMA = "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very out"
device_map = "cuda"

# called only once for all test in this class
@@ -200,11 +200,11 @@ def test_quantized_model_exllama(self):

quantization_config = AwqConfig(version="exllama")
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name, quantization_config=quantization_config
).to(torch_device)
self.model_name, quantization_config=quantization_config, device_map=torch_device
)

output = quantized_model.generate(**input_ids, max_new_tokens=40)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_EXLLAMA)
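The switch from `.to(torch_device)` to `device_map=torch_device` above is not cosmetic: passing the device at load time lets the exllama kernels be set up on the target GPU, rather than moving an already-initialized model afterwards (an inference from the change, not stated in the diff). A sketch of the loading pattern, with an illustrative checkpoint name:

```python
from transformers import AutoModelForCausalLM, AwqConfig

quantization_config = AwqConfig(version="exllama")

# Place the model during loading instead of calling .to("cuda") afterwards.
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-v0.1-AWQ",  # illustrative AWQ checkpoint
    quantization_config=quantization_config,
    device_map="cuda:0",
)
```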

def test_quantized_model_no_device_map(self):
"""
@@ -239,7 +239,7 @@ def test_quantized_model_multi_gpu(self):

quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")

self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1, 2, 3})
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

output = quantized_model.generate(**input_ids, max_new_tokens=40)

@@ -272,8 +272,8 @@ class AwqFusedTest(unittest.TestCase):
model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ"
model_revision = "7048b2af77d0dd1c81b000b19d73f9cc8950b510"

custom_mapping_model_id = "TheBloke/Yi-34B-AWQ"
custom_model_revision = "f1b2cd1b7459ceecfdc1fac5bb8725f13707c589"
custom_mapping_model_id = "TheBloke/Mistral-7B-v0.1-AWQ"
custom_model_revision = "f186bcfa9edbe2a4334262ec1e67f23e53ed1ae7"

mixtral_model_name = "casperhansen/mixtral-instruct-awq"
mixtral_model_revision = "87dd4ec502dde74fb3a624835c776b000d190c3b"
@@ -287,8 +287,8 @@ class AwqFusedTest(unittest.TestCase):
"You end up exactly where you started. Where are you?"
)

EXPECTED_GENERATION = prompt + "\n\nThis is a classic puzzle that has been around for"
EXPECTED_GENERATION_CUSTOM_MODEL = "HelloWorld.java:11)\r\n\tat org"
EXPECTED_GENERATION = prompt + "\n\nYou are at the starting point.\n\nIf"
EXPECTED_GENERATION_CUSTOM_MODEL = "Hello,\n\nI have a problem with my 20"
EXPECTED_GENERATION_MIXTRAL = prompt + " You're on the North Pole.\n\nThe"

def tearDown(self):
@@ -423,35 +423,33 @@ def test_generation_custom_model(self):
fuse_max_seq_len=512,
modules_to_fuse={
"attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
"layernorm": ["ln1", "ln2", "norm"],
"mlp": ["gate_proj", "up_proj", "down_proj"],
"layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
"use_alibi": False,
"num_attention_heads": 56,
"hidden_size": 4096,
"num_attention_heads": 32,
"num_key_value_heads": 8,
"hidden_size": 7168,
},
)

model = AutoModelForCausalLM.from_pretrained(
self.custom_mapping_model_id,
quantization_config=quantization_config,
trust_remote_code=True,
device_map="balanced",
revision=self.custom_model_revision,
)

self._check_fused_modules(model)

tokenizer = AutoTokenizer.from_pretrained(
self.custom_mapping_model_id, revision=self.custom_model_revision, trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(self.custom_mapping_model_id, revision=self.custom_model_revision)

prompt = "Hello"
inputs = tokenizer(prompt, return_tensors="pt").to(torch_device)

outputs = model.generate(**inputs, max_new_tokens=12)
self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL)

@unittest.skip("Not enough GPU memory on CI runners")
@require_torch_multi_gpu
def test_generation_mixtral_fused(self):
"""
2 changes: 0 additions & 2 deletions utils/notification_service.py
@@ -1056,7 +1056,6 @@ def prepare_reports(title, header, reports, to_truncate=True):
"TensorFlow pipelines": "run_tests_tf_pipeline_gpu",
"Examples directory": "run_examples_gpu",
"Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports",
"Quantization tests": "run_tests_quantization_torch_gpu",
}

if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"):
@@ -1077,7 +1076,6 @@ def prepare_reports(title, header, reports, to_truncate=True):
"run_pipelines_tf_gpu": "TensorFlow pipelines",
"run_examples_gpu": "Examples directory",
"run_all_tests_torch_cuda_extensions_gpu": "Torch CUDA extension tests",
"run_tests_quantization_torch_gpu": "Quantization tests",
}

# Remove some entries in `additional_files` if they are not concerned.
Expand Down
