GPTQ feature enhance (#1104)
* Fix bugs when calling the INC fit API.

* Export perm from the GPTQ object.

* Move quantizers and perms to the INC model.

* Support perm in WeightOnlyLinear (#1118).

* Support GPTQ model compression with saved scales (#1119).

* Support GPTQ scales.

* Remove the zero point when the scheme is symmetric (see the sketch after this list).

* Enhance WeightOnlyLinear for parallelism.

* Add logging for compression.

* Use model.to() to set the inference device for WeightOnlyLinear.

* Support RTN quantization of lm_head in GPTQ export.

* Regularize the layer-wise config when gptq_quantize is called directly.

* Align with the ipex-gpu requirement.

* Improve coverage.
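
On "remove the zero point when the scheme is symmetric": with symmetric quantization the zero point is implicitly zero once the scale is centered on the weight's absolute maximum, so it no longer needs to be packed into the compressed model. A minimal, self-contained sketch of the difference (illustrative only, not INC's actual implementation):

```
import torch

def fake_quantize(w: torch.Tensor, bits: int = 4, sym: bool = True):
    """Illustrative per-tensor weight quantization, not INC's implementation."""
    qmax = 2 ** (bits - 1) - 1   # e.g. 7 for 4-bit signed
    qmin = -(2 ** (bits - 1))    # e.g. -8
    if sym:
        # Symmetric: only the scale is kept; the zero point is implicitly 0
        # and can be dropped from the saved model.
        scale = w.abs().max() / qmax
        q = torch.clamp(torch.round(w / scale), qmin, qmax)
        return q, scale, None
    # Asymmetric: both the scale and the zero point must be stored alongside q.
    scale = (w.max() - w.min()) / (qmax - qmin)
    zero = torch.round(-w.min() / scale) + qmin
    q = torch.clamp(torch.round(w / scale) + zero, qmin, qmax)
    return q, scale, zero
```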

---------

Signed-off-by: YIYANGCAI <yiyang.cai@intel.com>
Signed-off-by: Xin He <xin3.he@intel.com>
Signed-off-by: wenhuach21 <wenhua.cheng@intel.com>
Signed-off-by: He, Xin3 <xin3.he@intel.com>
Co-authored-by: xinhe <xin3.he@intel.com>
Co-authored-by: wenhuach21 <wenhua.cheng@intel.com>
3 people committed Aug 1, 2023
1 parent 88adfc9 commit 6ba7837
Showing 19 changed files with 1,965 additions and 234 deletions.
@@ -103,3 +103,10 @@ quantized_model = load(tuned_checkpoint, model)
```
--------
For more details, please refer to the [sample code](./run_clm.py).

# (May Remove Later) Run the GPTQ algorithm
```
sh run-gptq-llm.sh
# You may need to move run-gptq-llm.sh to the root directory of Neural Compressor
# and adjust the Python file paths inside it.
# Please make sure the pile dataset has been downloaded.
```
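
GPTQ can also be driven from Python through INC's fit API (the fit API mentioned in the commit message). The snippet below is a hypothetical sketch assuming a 2.x-style weight-only config and an already-loaded `model` and calibration `dataloader`; the exact config keys may differ between releases, so treat [run_clm.py](./run_clm.py) as the authoritative usage:

```
# Hypothetical sketch -- config keys follow INC 2.x weight-only docs and may
# differ across versions; `model` and `dataloader` are assumed to exist.
from neural_compressor import PostTrainingQuantConfig, quantization

conf = PostTrainingQuantConfig(
    approach="weight_only",
    op_type_dict={
        ".*": {  # apply to every matching layer type
            "weight": {
                "bits": 4,          # 4-bit weight-only quantization
                "group_size": 128,  # per-group scales
                "scheme": "sym",    # symmetric: no zero point is stored
                "algorithm": "GPTQ",
            },
        },
    },
)
q_model = quantization.fit(model, conf, calib_dataloader=dataloader)
```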
@@ -0,0 +1,249 @@
import numpy as np
import torch
import datasets

# cache_dir = "~/.cache/"
cache_dir = None


def set_seed(seed):
    np.random.seed(seed)
    torch.random.manual_seed(seed)


def get_wikitext2(nsamples, seed, seqlen, model):
    from datasets import load_dataset
    traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train', cache_dir=cache_dir)
    testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test', cache_dir=cache_dir)

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False, cache_dir=cache_dir)
    trainenc = tokenizer("\n\n".join(traindata['text']), return_tensors='pt')
    testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt')

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        # Take a random window of seqlen tokens; the target masks every
        # position except the last with -100 (torch's cross-entropy ignore_index).
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc


def get_ptb(nsamples, seed, seqlen, model):
    from datasets import load_dataset
    traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train', cache_dir=cache_dir)
    valdata = load_dataset('ptb_text_only', 'penn_treebank', split='validation', cache_dir=cache_dir)

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False, cache_dir=cache_dir)
    trainenc = tokenizer("\n\n".join(traindata['sentence']), return_tensors='pt')
    testenc = tokenizer("\n\n".join(valdata['sentence']), return_tensors='pt')

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc


def get_c4(nsamples, seed, seqlen, model):
    from datasets import load_dataset

    traindata = load_dataset(
        'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train',
        cache_dir=cache_dir
    )
    valdata = load_dataset(
        'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'},
        split='validation',
        cache_dir=cache_dir
    )

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False, cache_dir=cache_dir)

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        # Resample documents until one is strictly longer than seqlen, so the
        # random window start below always has a non-empty range.
        while True:
            i = random.randint(0, len(traindata) - 1)
            trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
            if trainenc.input_ids.shape[1] > seqlen:
                break
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))

    # Re-seed with a fixed value so the validation sample is deterministic.
    random.seed(0)
    valenc = []
    for _ in range(256):
        while True:
            i = random.randint(0, len(valdata) - 1)
            tmp = tokenizer(valdata[i]['text'], return_tensors='pt')
            if tmp.input_ids.shape[1] > seqlen:
                break
        i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        valenc.append(tmp.input_ids[:, i:j])
    valenc = torch.hstack(valenc)

    class TokenizerWrapper:
        def __init__(self, input_ids):
            self.input_ids = input_ids

    valenc = TokenizerWrapper(valenc)

    return trainloader, valenc


def get_pile(nsamples, seed, seqlen, model):
    from datasets import load_dataset

    traindata = load_dataset(
        'NeelNanda/pile-10k', split='train',
        cache_dir=cache_dir
    )

    valdata = load_dataset(
        'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'},
        split='validation',
        cache_dir=cache_dir
    )

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False, cache_dir=cache_dir)

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        # Resample documents until one is strictly longer than seqlen, so the
        # random window start below always has a non-empty range.
        while True:
            i = random.randint(0, len(traindata) - 1)
            trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
            if trainenc.input_ids.shape[1] > seqlen:
                break
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))

    # Re-seed with a fixed value so the validation sample is deterministic.
    random.seed(0)
    valenc = []
    for _ in range(256):
        while True:
            i = random.randint(0, len(valdata) - 1)
            tmp = tokenizer(valdata[i]['text'], return_tensors='pt')
            if tmp.input_ids.shape[1] > seqlen:
                break
        i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        valenc.append(tmp.input_ids[:, i:j])
    valenc = torch.hstack(valenc)

    class TokenizerWrapper:
        def __init__(self, input_ids):
            self.input_ids = input_ids

    valenc = TokenizerWrapper(valenc)

    return trainloader, valenc


def get_ptb_new(nsamples, seed, seqlen, model):
    from datasets import load_dataset
    traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train', cache_dir=cache_dir)
    testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test', cache_dir=cache_dir)

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False, cache_dir=cache_dir)
    trainenc = tokenizer(" ".join(traindata['sentence']), return_tensors='pt')
    testenc = tokenizer(" ".join(testdata['sentence']), return_tensors='pt')

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc


def get_c4_new(nsamples, seed, seqlen, model):
    from datasets import load_dataset
    traindata = load_dataset(
        'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train',
        cache_dir=cache_dir
    )
    valdata = load_dataset(
        'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'},
        split='validation',
        cache_dir=cache_dir
    )

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False, cache_dir=cache_dir)

    import random
    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        # Resample documents until one is strictly longer than seqlen, so the
        # random window start below always has a non-empty range.
        while True:
            i = random.randint(0, len(traindata) - 1)
            trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
            if trainenc.input_ids.shape[1] > seqlen:
                break
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))

    valenc = tokenizer(' '.join(valdata[:1100]['text']), return_tensors='pt')
    valenc = valenc.input_ids[:, :(256 * seqlen)]

    class TokenizerWrapper:
        def __init__(self, input_ids):
            self.input_ids = input_ids

    valenc = TokenizerWrapper(valenc)

    return trainloader, valenc


def get_loaders(name, nsamples=128, seed=0, seqlen=2048, model=''):
    if 'wikitext2' in name:
        return get_wikitext2(nsamples, seed, seqlen, model)
    if 'ptb' in name:
        if 'new' in name:
            return get_ptb_new(nsamples, seed, seqlen, model)
        return get_ptb(nsamples, seed, seqlen, model)
    if 'c4' in name:
        if 'new' in name:
            return get_c4_new(nsamples, seed, seqlen, model)
        return get_c4(nsamples, seed, seqlen, model)
    if 'pile' in name:
        return get_pile(nsamples, seed, seqlen, model)
    raise ValueError(f"Unknown calibration dataset: {name}")
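
For reference, a typical invocation of the loaders above (the model name here is only an example):

```
# 128 calibration samples of 2048 tokens each from wikitext-2; each entry in
# trainloader is an (input_ids, target) pair whose target masks all but the
# last token with -100.
trainloader, testenc = get_loaders(
    'wikitext2', nsamples=128, seed=0, seqlen=2048, model='facebook/opt-125m'
)
```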
@@ -0,0 +1,17 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .lm_eval.evaluator import evaluate
@@ -0,0 +1,16 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
