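## Use a Hugging Face model locally with LangChain
- Load `mosaicml/mpt-7b-instruct` (and later `mosaicml/mpt-30b-instruct`) with `transformers`, build a text-generation pipeline, and wrap it in LangChain's `HuggingFacePipeline`.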

In [1]:
##  !pip install -qU transformers accelerate einops langchain wikipedia xformers
from torch import cuda, bfloat16
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Directory handed to from_pretrained as the Hugging Face download cache.
cache_dir = '/data/hf_cache'

# Use the currently selected CUDA device when one is available, otherwise the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# trust_remote_code=True lets transformers run the model code shipped in the
# hub repository (configuration_mpt.py / modeling_mpt.py are downloaded).
model = transformers.AutoModelForCausalLM.from_pretrained(
    'mosaicml/mpt-7b-instruct',
    cache_dir=cache_dir,
    max_seq_len=2048,
    torch_dtype=bfloat16,
    trust_remote_code=True,
)
model.eval()      # evaluation mode: this notebook only generates text
model.to(device)  # move the weights to the device selected above
print(f"Model loaded on {device}")

Downloading (…)lve/main/config.json: 100%|██████████| 1.23k/1.23k [00:00<00:00, 1.36MB/s]
Downloading (…)configuration_mpt.py: 100%|██████████| 9.20k/9.20k [00:00<00:00, 8.45MB/s]
A new version of the following files was downloaded from mosaicml/mpt-7b-instruct:
- configuration_mpt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Downloading (…)main/modeling_mpt.py: 100%|██████████| 18.4k/18.4k [00:00<00:00, 17.8MB/s]
A new version of the following files was downloaded from mosaicml/mpt-7b-instruct:
- modeling_mpt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Downloading (…)model.bin.index.json: 100%|██████████| 16.0k/16.0k [00:00<00:00, 15.6MB/s]
Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


Loading checkpoint shards: 100%|██████████| 2/2 [00:50<00:00, 25.30s/it]
Downloading (…)neration_config.json: 100%|██████████| 91.0/91.0 [00:00<00:00, 94.1kB/s]


Model loaded on cuda:0


- Specify a stopping criterion based on the stop token id

In [4]:
import torch
from transformers import StoppingCriteria, StoppingCriteriaList

# The tokenizer is loaded from the EleutherAI/gpt-neox-20b repository and uses
# the same cache directory as the model.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "EleutherAI/gpt-neox-20b",
    cache_dir=cache_dir,
)

# mpt-7b is trained to add "<|endoftext|>" at the end of generations, so the id
# of that token is what generation should stop on.
stop_token_ids = tokenizer.convert_tokens_to_ids(["<|endoftext|>"])

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the last token is a stop token.

    Only the first sequence of the batch (``input_ids[0]``) is inspected, and
    the stop ids are read from the module-level ``stop_token_ids`` list, so
    that list must be defined before generation runs.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the final token of ``input_ids[0]`` is a stop id."""
        last_token = input_ids[0][-1]
        # ``scores`` and ``kwargs`` are accepted for the interface but unused.
        return any(last_token == stop_id for stop_id in stop_token_ids)

# Collect the custom criterion in a StoppingCriteriaList, the object that is
# later handed to the generation pipeline.
stopping_criteria = StoppingCriteriaList()
stopping_criteria.append(StopOnTokens())

Downloading (…)okenizer_config.json: 100%|██████████| 156/156 [00:00<00:00, 61.3kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.08M/1.08M [00:00<00:00, 92.4MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 457k/457k [00:00<00:00, 78.9MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 78.1MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 106kB/s]


In [5]:
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_full_text=True,  # langchain expects the full text, including the question
    # generation parameters are passed here too
    stopping_criteria=stopping_criteria,  # without this the model will ramble
    max_new_tokens=64,  # max number of tokens to generate in the output
    repetition_penalty=1.1,  # without this the output begins repeating
    # sampling knobs
    # NOTE(review): do_sample is not set in this call; these three values may
    # have no effect if generation defaults to greedy decoding -- confirm.
    temperature=0.1,  # 'randomness' of outputs (lower is more deterministic)
    top_p=0.15,  # select from top tokens whose probabilities add up to 15%
    top_k=0,  # top_k of zero, so token selection relies on top_p
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
The model 'MPTForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusF

In [6]:
# Smoke-test the pipeline with a single prompt.
res = generate_text(
    "Explain to me the difference between nuclear fission and fusion."
)
# The result is indexed as a list of dicts; show the text of the first entry.
print(res[0]["generated_text"])

Explain to me the difference between nuclear fission and fusion.
Nuclear Fission is a process that splits heavy atoms into smaller, lighter ones by splitting their nuclei apart using high-energy particles or neutrons (a type of subatomic particle). Nuclear Fusion occurs when two light atomic nuclei are combined together in such a way as to form one heavier nucleus with the release of energy


- See [here](https://github.com/pinecone-io/examples/blob/master/generation/llm-field-guide/mpt-7b/mpt-7b-huggingface-langchain.ipynb) for a faster, Triton-optimized implementation.

### Implement it in LangChain

In [7]:
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline

# Pass-through template: the prompt is the instruction itself, with no
# additional input section.
prompt = PromptTemplate(
    template="{instruction}",
    input_variables=["instruction"],
)

# Wrap the transformers pipeline as a LangChain LLM, then chain it to the prompt.
llm = HuggingFacePipeline(pipeline=generate_text)
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [8]:

# Run the chain on one instruction; lstrip() removes leading whitespace from
# the returned text before printing.
print(
    llm_chain.predict(
        instruction="Explain to me the difference between nuclear fission and fusion."
    ).lstrip()
)

Nuclear Fission is a process that splits heavy atoms into smaller, lighter ones by splitting their nuclei apart using high-energy particles or neutrons (a type of subatomic particle). Nuclear Fusion occurs when two light atomic nuclei are combined together in such a way as to form one heavier nucleus with the release of energy


### Try the 30B model

In [2]:
from torch import cuda, bfloat16
import transformers

In [3]:
name = 'mosaicml/mpt-30b-instruct'
cache_dir = '/data/hf_cache'

# Optional alternative, not enabled here: load transformers.AutoConfig for
# `name` (with trust_remote_code=True), set
# config.attn_config['attn_impl'] = 'triton' to use triton-based FlashAttention
# and config.init_device to the GPU device string for fast initialization
# directly on GPU, then pass config=config to from_pretrained below.

model = transformers.AutoModelForCausalLM.from_pretrained(
    name,
    cache_dir=cache_dir,
    device_map='auto',
    torch_dtype=bfloat16,  # load model weights in bfloat16
    trust_remote_code=True,
)
# device_map='auto' is used here, so there is no explicit model.to(device).
# model.eval() is the cell's last expression, so the notebook displays the
# module repr as output.
model.eval()

Downloading (…)lve/main/config.json: 100%|██████████| 1.23k/1.23k [00:00<00:00, 395kB/s]
Downloading (…)configuration_mpt.py: 100%|██████████| 9.20k/9.20k [00:00<00:00, 3.40MB/s]
A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-30b-instruct:
- configuration_mpt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|██████████| 7/7 [05:26<00:00, 46.61s/it]


MPTForCausalLM(
  (transformer): MPTModel(
    (wte): SharedEmbedding(50432, 7168)
    (emb_drop): Dropout(p=0, inplace=False)
    (blocks): ModuleList(
      (0): MPTBlock(
        (norm_1): LPLayerNorm((7168,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (Wqkv): Linear(in_features=7168, out_features=21504, bias=False)
          (out_proj): Linear(in_features=7168, out_features=7168, bias=False)
        )
        (norm_2): LPLayerNorm((7168,), eps=1e-05, elementwise_affine=True)
        (ffn): MPTMLP(
          (up_proj): Linear(in_features=7168, out_features=28672, bias=False)
          (act): GELU(approximate='none')
          (down_proj): Linear(in_features=28672, out_features=7168, bias=False)
        )
        (resid_attn_dropout): Dropout(p=0, inplace=False)
        (resid_ffn_dropout): Dropout(p=0, inplace=False)
      )
      (1): MPTBlock(
        (norm_1): LPLayerNorm((7168,), eps=1e-05, elementwise_affine=True)
        (attn): Multihead

In [4]:
from transformers import AutoTokenizer
# The tokenizer is loaded from the base 'mosaicml/mpt-30b' repository.
# Pass the notebook's cache directory, as every other from_pretrained call in
# this notebook does, so the files are stored alongside the model weights
# rather than in the default Hugging Face cache location.
tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-30b', cache_dir=cache_dir)

In [5]:
from transformers import pipeline

# Text-generation pipeline around the 30B model. The generation settings given
# here act as defaults for later calls to `pipe`.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    # length / repetition controls
    max_length=512,
    repetition_penalty=1.15,
    # sampling settings
    temperature=0.7,
    top_p=0.95,
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
The model 'MPTForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusFo

In [7]:
import torch 

- This model was trained on data formatted as follows:

In [8]:
def format_prompt(instruction):
    """Wrap ``instruction`` in the instruction/response prompt layout.

    The result is a fixed preamble, an ``###Instruction`` section containing
    ``instruction``, and a trailing ``### Response`` header for the model to
    complete.

    Args:
        instruction: The task text to embed in the prompt.

    Returns:
        The fully formatted prompt string.
    """
    preamble = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
    )
    # NOTE(review): "###Instruction" has no space after the hashes while
    # "### Response" does; both are reproduced verbatim -- confirm against the
    # model's documented prompt format before normalising.
    return f"{preamble}\n\n###Instruction\n{instruction}\n\n### Response\n"

# Build a prompt from a two-line instruction.
example = "Tell me a funny joke.\nDon't make it too funny though."
fmt_ex = format_prompt(example)

# Generate under CUDA autocast with bfloat16. max_new_tokens is set here while
# the pipeline was created with max_length=512; the warning emitted for this
# cell states that max_new_tokens takes precedence.
with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
    res = pipe(
        fmt_ex,
        do_sample=True,
        max_new_tokens=100,
        use_cache=True,
    )

Both `max_new_tokens` (=100) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In [None]:
# Print the raw object returned by the `pipe(...)` call.
print(res)