In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Root directory holding the few-shot prompt sets; each sub-directory is read
# into `records` by the loading cell below.
prompt_root_dir = './few_shot_prompts/'

In [3]:
# Read in the preprocessed few-shot prompts.
#
# Resulting layout, one entry per prompt-set directory under `prompt_root_dir`:
#   records[<dir name>] = {
#       'labels':      lines of the directory's `labels` file,
#       'prompts':     contents of every other regular file in the directory,
#       'predictions': initialised to an empty list,
#   }
#
# Hidden entries (e.g. `.ipynb_checkpoints`, `.DS_Store`) are skipped, and names
# are sorted so the prompt order does not depend on the filesystem's listing
# order. Files are read as UTF-8 because the prompts contain non-ASCII
# punctuation.
# NOTE(review): if `labels[i]` is meant to pair with `prompts[i]`, confirm that
# the prompt file names sort in the same order as the lines of `labels`.
prompts_dir = sorted(
    name
    for name in os.listdir(prompt_root_dir)
    if not name.startswith('.')
    and os.path.isdir(os.path.join(prompt_root_dir, name))
)
records = {}
# Do not name the loop variable `pd`: that would rebind the pandas alias
# imported at the top of the notebook.
for prompt_set in prompts_dir:
    set_dir = os.path.join(prompt_root_dir, prompt_set)

    # store labels: one label per line
    with open(os.path.join(set_dir, 'labels'), encoding='utf-8') as lf:
        labels = lf.read().splitlines()

    # every remaining regular file in the directory is one prompt
    prompt_texts = []
    for file_name in sorted(os.listdir(set_dir)):
        file_path = os.path.join(set_dir, file_name)
        if (
            file_name == 'labels'
            or file_name.startswith('.')
            or not os.path.isfile(file_path)
        ):
            continue
        with open(file_path, encoding='utf-8') as f:
            prompt_texts.append(f.read())

    records[prompt_set] = {
        'labels': labels,
        'prompts': prompt_texts,
        'predictions': [],
    }
print(records)

{'text0': {'labels': ['Anticipated Effort', 'Situational Control', 'Objective Experience', 'Pleasantness', 'No annotations', 'Pleasantness', 'Pleasantness', 'Pleasantness', 'Objective Experience', 'Pleasantness', 'Anticipated Effort', 'Pleasantness'], 'prompts': ['Appraisal is defined as a description of a person’s experience in an event or situation. Most often, an appraisal is describing one aspect of the experience. A person may have multiple appraisals of the same event/situation.\nWe define the person who posts as "target", and the person who responds as "observer".\nAppraisal labels include:\nPleasantness - how pleasant the situation was;\nAnticipated effort - how much effort was needed to deal with the situation; \nSituational Control - how much the situation was out of anyone’s control;\nSelf-Other Agency - how much oneself or another person was responsible for the situation;\nAttentional Activity - how much their attention was drawn to the situation rather than diverted away f

By Jules Gagnon-Marchand, jules.gagnonm.alt@gmail.com. 

## Description:
Example of running very large models such as OPT-66B, OPT-175B, and BLOOM-176B on the cluster on a single node with 6-8 GPUs, in INT8, fp16, or fp32 mode.

## Instructions:
Install the following. Transformers needs to be the beta version, from the git repo.

```bash
pip install accelerate
pip install deepspeed
pip install bitsandbytes
pip install git+https://github.com/huggingface/transformers.git
```

- You can change the `MODEL_NAME`. It is the HuggingFace model name.
- Set `USE_8BIT` to `False` if you don't want to use 8-bit mode. 
- Set `DTYPE_NOT_8BIT` to `torch.float32` if you want to use fp32 instead of fp16 (this only applies when you're not in 8-bit mode).
- `MAX_MEMORY_GB` is very important. It's the max memory to use to *load* the model per GPU. If you don't set a limit, HuggingFace will fill up the first GPUs and not use the rest, and the first GPUs will blow up when you try to run the model.
- 32-bit mode requires an `OFFLOAD_FOLDER` to be set.

## What we tested:
We only tested on 6xA100 40GB GPUs. Here's what works. Only tested inference for all of these.
- BLOOM 176B works in INT8 mode only.
- OPT 66B works in INT8 and fp16 mode, but not in fp32.


## Reference:
Reference document by HuggingFace on the INT8 mode: 
https://docs.google.com/document/d/1JxSo4lQgMDBdnd19VBEoaG-mMfQupQ3XvOrgmRAVtpU/edit


In [4]:
import accelerate
import deepspeed
import rich
import torch
import transformers

# Report the installed version of each library this notebook depends on.
# Labels are left-aligned to a common width so the `=` signs line up.
for label, module in (
    ("accelerate.__version__", accelerate),
    ("deepspeed.__version__", deepspeed),
    ("torch.__version__", torch),
    ("transformers.__version__", transformers),
):
    print(f"{label:<26}= {module.__version__!r}")


accelerate.__version__    = '0.13.2'
deepspeed.__version__     = '0.7.4'
torch.__version__         = '1.13.0.dev20220622+cu113'
transformers.__version__  = '4.21.3'


In [5]:
# HuggingFace model name; used to load both the model and its tokenizer.
MODEL_NAME = "bigscience/bloom-7b1"

# When False, DTYPE_NOT_8BIT is passed as `torch_dtype` when loading the model.
USE_8BIT = True

# dtype used for the weights when USE_8BIT is False.
DTYPE_NOT_8BIT = torch.float16
# Per-GPU memory budget, used to build the `max_memory` map given to
# `from_pretrained`.
MAX_MEMORY_GB = 30.75
# Passed to `from_pretrained` as `offload_folder`.
OFFLOAD_FOLDER = ""

In [6]:
# Second element of `mem_get_info()` (total device memory in bytes),
# expressed in GiB for display.
total_bytes = torch.cuda.mem_get_info()[1]
qty_mem = total_bytes / (1024 ** 3)

# Give every visible GPU index the same loading budget.
n_gpus = torch.cuda.device_count()
max_memory = dict.fromkeys(range(n_gpus), MAX_MEMORY_GB * 1e9)

rich.print(f"{qty_mem     = :0.2f}")
rich.print(f"{n_gpus      = }")
rich.print(f"{max_memory  = }")

In [7]:
# Only request an explicit dtype when 8-bit mode is off.
# NOTE(review): `load_in_8bit` is not passed below, so with USE_8BIT = True no
# dtype and no quantisation option reaches `from_pretrained` — confirm this is
# intended for the installed transformers version.
optional_kwargs = {} if USE_8BIT else {"torch_dtype": DTYPE_NOT_8BIT}

rich.print(f"{optional_kwargs = }")

load_kwargs = dict(
    #load_in_8bit=USE_8BIT,
    device_map="auto",
    max_memory=max_memory,
    offload_folder=str(OFFLOAD_FOLDER),
    cache_dir='',
    **optional_kwargs,
)
model = transformers.AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)

Downloading config.json:   0%|          | 0.00/734 [00:00<?, ?B/s]

Downloading pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.29G [00:00<?, ?B/s]

Downloading pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.88G [00:00<?, ?B/s]

In [8]:
# Load the tokenizer published under the same name as the model, explicitly
# opting out of the fast implementation.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=False,
)

In [9]:
# Use the first prompt of the 'text0' prompt set as the model input.
input_str = records['text0']['prompts'][0]
# Encode to PyTorch tensors and keep only the token ids.
encoding = tokenizer(input_str, return_tensors="pt")
tokenized = encoding.input_ids

In [10]:
# Greedy decoding through the public `generate` entry point rather than the
# low-level `greedy_search` helper: `do_sample=False` with `num_beams=1`
# selects greedy search.
# `max_new_tokens` bounds only the continuation. The previous `max_length=128`
# also counted the prompt tokens, so a few-shot prompt of 128+ tokens left no
# room for a meaningful completion.
output = model.generate(
    tokenized,
    do_sample=False,
    num_beams=1,
    max_new_tokens=128,
)



In [23]:
# Turn the generated ids back into text, dropping special tokens.
decoded = tokenizer.decode(output[0], skip_special_tokens=True)
# The decoded text must begin with the prompt, otherwise slicing by the
# prompt's length below would not isolate the continuation.
assert decoded.startswith(input_str)
completion = decoded[len(input_str):]
rich.print(input_str + completion)