In [2]:
# Directory that holds config.json and the checkpoint files read by the loading cell.
weights_path = "/home/llajan/OPT-175B-HF"
# Selects the dtype used when loading the checkpoint: float16 when True, float32 otherwise.
fp16 = False

In [None]:
import json
import os  # required by os.path.join below; previously missing, which raised NameError on a fresh kernel

from accelerate import dispatch_model, infer_auto_device_map, init_empty_weights, load_checkpoint_in_model
from transformers import AutoModelForCausalLM, AutoTokenizer, OPTConfig

# Load config: read the raw JSON, point it at the local checkpoint directory,
# then build the OPTConfig object from it.
with open(os.path.join(weights_path, 'config.json'), 'r') as f:
    config_dict = json.load(f)
config_dict["_name_or_path"] = weights_path
config = OPTConfig(**config_dict)

# Initializes an empty shell with the model. This is instant and does not take any RAM.
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)
# Initializing the model under the previous context manager breaks the tied weights,
# so they are re-tied explicitly here.
model.tie_weights()

# Infer device map automatically
# There is an issue with device map inference using the fp32 version. It places some layers on cpu.
# To avoid this, we infer device map using the fp16 version.
# This evenly distributes the layers on 9 A100s.
# OPTDecoderLayer is listed as non-splittable so a decoder layer is never spread over two devices.
device_map = infer_auto_device_map(
    model.model,
    no_split_module_classes=["OPTDecoderLayer"],
    dtype='float16',
)

# Load weights into the inner model (model.model), following the inferred device map.
# The load dtype follows the `fp16` flag set in the configuration cell.
load_checkpoint_in_model(
    model.model,
    weights_path,
    device_map=device_map,
    offload_folder=None,
    dtype='float16' if fp16 else 'float32',
    offload_state_dict=True
)
# Re-tie once more after the real weights have been loaded.
model.tie_weights()

# Without this part, torch complains about tensors being in different devices.
# device_map was inferred for model.model, so its keys are re-prefixed with "model."
# to address the full model, and lm_head is pinned to device 0.
full_model_device_map = {f"model.{k}": v for k, v in device_map.items()}
full_model_device_map["lm_head"] = 0
dispatch_model(model, device_map=full_model_device_map)

In [16]:
# Inference
# NOTE(review): the tokenizer is fetched from the 'facebook/opt-30b' hub repo rather than
# from weights_path — presumably the OPT checkpoints share a tokenizer; confirm.
tokenizer = AutoTokenizer.from_pretrained('facebook/opt-30b')

prompt = "Even though Federer is a better player than Nadal"

inputs = tokenizer(prompt, return_tensors="pt")

# Pass the attention mask together with the input ids so generate() does not have to
# guess it (it warns when the mask is missing). Both tensors go to device 0, the device
# used for the input ids in this cell and for lm_head in the loading cell.
# Sampling is enabled and no seed is set, so the generated text differs between runs.
output = model.generate(
    inputs["input_ids"].to(0),
    attention_mask=inputs["attention_mask"].to(0),
    max_length=40,
    do_sample=True,
)

print(tokenizer.decode(output[0].tolist()))

<s>Even though Federer is a better player than Nadal, he is not a better player on clay.

Click to expand...

I think that's a bit of a stretch.
