add gptj-6b with lm_evaluation_harness example (#845)
changwangss committed Apr 20, 2023
1 parent 631ff18 commit b492f5c
Showing 7 changed files with 343 additions and 35 deletions.
examples/.config/pytorch_optimize.json (22 additions, 1 deletion)
@@ -1134,5 +1134,26 @@
"config": "saved_results"
}
}
},
"gpt_j_6b_clm_ipex": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "gpt_j_6b_clm_ipex",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "gpt_j_6b_ipex",
"mode": "accuracy",
"batch_size": "64",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
}
}
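This configuration entry wires the new GPT-J example into the repository's example runner: `tune` invokes `run_tuning.sh` and `benchmark` invokes `run_benchmark.sh` from the listed `working_dir`, passing the `params` shown above. As a rough illustration only, assuming each param is forwarded as a `--key=value` flag (an assumption based on the param names, not taken from the shell scripts themselves), a manual run might look like:

```bash
# Hypothetical manual invocation; the --key=value form is assumed from the "params" keys above.
cd examples/huggingface/pytorch/language-modeling/quantization
bash run_tuning.sh --topology=gpt_j_6b_clm_ipex --output_model=saved_results
bash run_benchmark.sh --topology=gpt_j_6b_ipex --mode=accuracy --batch_size=64 \
    --iters=100 --int8=false --config=saved_results
```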
@@ -1,21 +1,53 @@
Step-by-Step
============
This document provides step-by-step instructions for running large language models (LLMs) on 4th Gen Intel® Xeon® Scalable processors (codenamed Sapphire Rapids) with PyTorch and Intel® Extension for PyTorch.

The scripts `run_clm.py`, `run_mlm.py` and `run_plm.py` provide two quantization approaches (PostTrainingDynamic and PostTrainingStatic) based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor).

The script `evaluate_clm.py` currently supports quantization of `GPTJ`, `OPT`, `LLaMA` and `BLOOM` models and validates accuracy with [lm_evaluation_harness](https://github.com/EleutherAI/lm-evaluation-harness.git); more models are being added.

# Prerequisite
## 1. Create Environment
```bash
WORK_DIR=$PWD
# Create Environment (conda)
conda create -n llm python=3.9 -y
conda install mkl mkl-include -y
conda install gperftools jemalloc==5.2.1 -c conda-forge -y
# Installation
pip install git+https://github.com/intel/neural-compressor.git
pip install intel_extension_for_pytorch transformers intel_extension_for_transformers datasets accelerate
# Setup Environment Variables
export KMP_BLOCKTIME=1
export KMP_SETTINGS=1
export KMP_AFFINITY=granularity=fine,compact,1,0
# IOMP
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libiomp5.so
# Tcmalloc is a recommended malloc implementation that emphasizes fragmentation avoidance and scalable concurrency support.
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
```
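An optional sanity check (not part of the original instructions) to confirm the preloaded libraries actually exist in the active conda environment:

```bash
# Both libraries should be listed; otherwise the LD_PRELOAD settings above will not work as intended.
ls ${CONDA_PREFIX}/lib | grep -E 'libiomp5|libtcmalloc'
```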

# Run
## 1. Quantization

Here is how to run the scripts:

**Causal Language Modeling (CLM)**

`evaluate_clm.py` quantizes large language models using the [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) dataset for calibration and validates accuracy on `lambada_openai`, `piqa`, `winogrande`, `hellaswag` and other tasks provided by lm_evaluation_harness. An example command is shown below.
```bash
# "--sq" is used to enable smooth quant
# "--int8_bf16_mixed" is used to enable int8-bf16 mixed mode for platform that natively supports bf16
python evaluate_clm.py \
--model EleutherAI/gpt-j-6B \
--quantize \
--sq \
--int8_bf16_mixed \
--output_dir "saved_results"
```
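On a platform without native BF16 support, the same flow can be run without `--int8_bf16_mixed`, which keeps the default INT8-FP32 mixed output (a variant of the command above, not a separate documented flow):

```bash
python evaluate_clm.py \
    --model EleutherAI/gpt-j-6B \
    --quantize \
    --sq \
    --output_dir "saved_results"
```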
To run quantization on top of the Transformers language-modeling example [`run_clm.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py), use the following command.
```bash
python run_clm.py \
--model_name_or_path EleutherAI/gpt-neo-125M \
@@ -60,17 +92,39 @@ python run_mlm.py \
--overwrite_output_dir
```
## 2. Accuracy
```bash
# FP32 Accuracy
python evaluate_clm.py \
--model EleutherAI/gpt-j-6B \
--accuracy_only \
--batch_size 56

# BF16 Accuracy
python evaluate_clm.py \
--model EleutherAI/gpt-j-6B \
--accuracy_only \
--ipex_bf16 \
--batch_size 56

# INT8 Accuracy
python evaluate_clm.py \
--model EleutherAI/gpt-j-6B \
--accuracy_only \
--int8 \
--batch_size 56
```
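The argument parser in `evaluate_clm.py` also exposes `--int8_bf16_mixed` (measure the INT8+BF16 mixed model) and `--save_accuracy_path` (dump the full lm_evaluation_harness results as JSON); a sketch combining both, with a placeholder output file name:

```bash
# INT8 + BF16 mixed accuracy, saving full results to a JSON file (file name is an example)
python evaluate_clm.py \
    --model EleutherAI/gpt-j-6B \
    --accuracy_only \
    --int8_bf16_mixed \
    --save_accuracy_path accuracy.json \
    --batch_size 56
```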

## 3. Validated Model List

|Type|Pretrained model|PostTrainingDynamic | PostTrainingStatic | QuantizationAwareTraining
|---|------------------------------------|---|---|---
|CLM|EleutherAI/gpt-neo-125M| ✅| ✅| Stay tuned
|CLM|abeja/gpt-neox-japanese-2.7b| ✅| ✅| Stay tuned
|CLM|EleutherAI/gpt-j-6B| ✅| ✅| Stay tuned
|CLM|bigscience/bloom-560m| ✅| ✅| Stay tuned
|MLM|bert-base-uncased| ✅| ✅| Stay tuned
|PLM|xlnet-base-cased| ✅| ✅| Stay tuned

## 4. Bash Command

@@ -0,0 +1,219 @@
import argparse
import os
import time
import json
import fnmatch
import re
import numpy as np
import torch
from datasets import load_dataset
from torch.nn.functional import pad
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
import intel_extension_for_pytorch as ipex
parser = argparse.ArgumentParser()
parser.add_argument(
"--model", nargs="?", default="EleutherAI/gpt-j-6B"
)
parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k")
parser.add_argument("--output_dir", nargs="?", default="./saved_results")
parser.add_argument("--quantize", action="store_true")
parser.add_argument(
"--ipex_bf16",
action="store_true",
help="to enable ipex amp bf16 (work on platforms like SPR)",
)
parser.add_argument(
"--int8_bf16_mixed",
action="store_true",
help="by default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)",
)
parser.add_argument("--sq", action="store_true")
parser.add_argument("--alpha", default="auto",
help="Smooth quant parameter.")
parser.add_argument("--int8", action="store_true")
parser.add_argument("--accuracy_only", action="store_true")
parser.add_argument("--batch_size", default=1, type=int,
help="For accuracy measurement only.")
parser.add_argument("--save_accuracy_path", default=None,
help="Save accuracy results path.")
parser.add_argument("--pad_max_length", default=196, type=int,
help="Pad input ids to max length.")

args = parser.parse_args()
calib_size = 1


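# Helper that tokenizes and pads the calibration dataset; its evaluate() method measures last-token prediction accuracy.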
class Evaluator:
    def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.pad_val = pad_val
        self.pad_max = pad_max

        # tokenize the dataset
        self.dataset = self.dataset.map(self.tokenize_function, batched=True)
        self.dataset.set_format(type="torch", columns=["input_ids"])

    @torch.no_grad()
    def tokenize_function(self, examples):
        example = self.tokenizer(examples["text"])
        return example

    @torch.no_grad()
    def collate_batch(self, batch):

        input_ids_padded = []
        last_ind = []

        for text in batch:
            input_ids = text["input_ids"]
            pad_len = self.pad_max - input_ids.shape[0]
            last_ind.append(input_ids.shape[0] - 1)

            input_ids = pad(input_ids, (0, pad_len), value=self.pad_val)
            input_ids_padded.append(input_ids)

        return (torch.vstack(input_ids_padded), torch.tensor(last_ind))

    @torch.no_grad()
    def evaluate(self, model):

        model.eval()
        # The task is to predict the last word of the input.
        total, hit = 0, 0
        latency = 0
        test_dataloader = DataLoader(
            self.dataset,
            batch_size=self.batch_size,
            shuffle=False,
            collate_fn=self.collate_batch,
        )
        for i, (input_ids, last_ind) in enumerate(test_dataloader):
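            # Mask the last real token with pad_val so the model has to predict it from the preceding context.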
            label = input_ids[torch.arange(len(last_ind)), last_ind]
            input_ids[torch.arange(len(last_ind)), last_ind] = self.pad_val
            pad_len = self.pad_max - last_ind - 1

            start = time.time()
            outputs = model(input_ids)
            latency += time.time() - start

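            # Logits at index -2 - pad_len (i.e. position last_ind - 1) carry the prediction for the masked token.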
            last_token_logits = outputs[0][torch.arange(len(last_ind)), -2 - pad_len, :]
            pred = last_token_logits.argmax(dim=-1)
            total += label.size(0)
            hit += (pred == label).sum().item()
            if i % 50 == 0:
                print(hit / total)
                print("Processed minibatch:", i)

        acc = hit / total
        print("Accuracy: ", acc)
        latency = latency / len(self.dataset)
        print("Latency: ", latency)
        return acc

user_model = AutoModelForCausalLM.from_pretrained(
args.model,
torchscript=True, # torchscript will force `return_dict=False` to avoid jit errors
)
tokenizer = AutoTokenizer.from_pretrained(args.model)

# to channels last
user_model = user_model.to(memory_format=torch.channels_last)
user_model.eval()


if args.quantize:
    # dataset
    calib_dataset = load_dataset(args.dataset, split="train")
    calib_dataset = calib_dataset.shuffle(seed=42)
    calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length)
    calib_dataloader = DataLoader(
        calib_evaluator.dataset,
        batch_size=calib_size,
        shuffle=False,
        collate_fn=calib_evaluator.collate_batch,
    )

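    # Calibration function: feed the first ~100 single-sample batches through the prepared model.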
    def calib_func(prepared_model):
        for i, calib_input in enumerate(calib_dataloader):
            if i > 100:
                break
            prepared_model(calib_input[0])

    from neural_compressor import PostTrainingQuantConfig, quantization
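    # For GPT-style models, keep "add" ops in FP32 for both weights and activations.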
    if re.search("gpt", args.model):
        op_type_dict = {
            "add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}},
        }
    else:
        op_type_dict = {}
    excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"]
    if args.sq:
        recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha}}
        conf = PostTrainingQuantConfig(
            backend="ipex",
            excluded_precisions=excluded_precisions,
            op_type_dict=op_type_dict,
            recipes=recipes,
        )
    else:
        conf = PostTrainingQuantConfig(
            backend="ipex",
            excluded_precisions=excluded_precisions,
            op_type_dict=op_type_dict,
        )

    q_model = quantization.fit(
        user_model,
        conf,
        calib_dataloader=calib_dataloader,
        calib_func=calib_func,
    )

    q_model.save(args.output_dir)
    exit(0)

amp_enabled = False
amp_dtype = None
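# BF16 path: optimize the model with IPEX, then TorchScript-trace and freeze it under CPU autocast.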
if args.ipex_bf16:
    amp_enabled = True
    amp_dtype = torch.bfloat16
    user_model = ipex.optimize(user_model, dtype=torch.bfloat16)
    with torch.no_grad(), torch.cpu.amp.autocast():
        user_model = torch.jit.trace(user_model, user_model.dummy_inputs["input_ids"])
        user_model = torch.jit.freeze(user_model)
    user_model.eval()

if args.int8 or args.int8_bf16_mixed:
print("load int8 model")
from neural_compressor.utils.pytorch import load
user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
user_model.eval()

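# Accuracy path: evaluate the selected model with lm_evaluation_harness on lambada_openai, piqa, winogrande and hellaswag.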
if args.accuracy_only:
    user_model.eval()
    def eval_func(user_model):
        from intel_extension_for_transformers.evaluation.lm_evaluation_harness.evaluator import evaluate
        results = evaluate(
            model="hf-causal",
            model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
            user_model=user_model,
            batch_size=args.batch_size,
            tasks=["lambada_openai", "piqa", "winogrande", "hellaswag"],
        )
        dumped = json.dumps(results, indent=2)
        if args.save_accuracy_path:
            with open(args.save_accuracy_path, "w") as f:
                f.write(dumped)
        print('Accuracy for lambada_openai is ', results["results"]["lambada_openai"]["acc"])
        print('Accuracy for piqa is ', results["results"]["piqa"]["acc"])
        print('Accuracy for winogrande is ', results["results"]["winogrande"]["acc"])
        print('Accuracy for hellaswag is ', results["results"]["hellaswag"]["acc"])

    with torch.no_grad(), torch.cpu.amp.autocast(enabled=amp_enabled, dtype=amp_dtype):
        eval_func(user_model)


