# Model Cards

### This notebook showcases 5 LLMs with varying licenses. To run all models, follow these [GPU Configurations](https://github.com/jackfrost1411/HUST23-SC23-LLMs/tree/master/gpu-config).

#### The layout is as follows:

* **Initial set up (Important Environment Variable Discussion)**
* **Falcon 7b/ 40b Instruct (Apache 2.0 License)**
* **LLaMa2 7b/ 13b/ 70b (Meta License)**
* **StableBeluga 70b (Non Commercial License)**
* **MPT 30b (CC-By-SA-3.0)**
* **CodeLLaMa 7b/ 13b/ 34b (Meta License)**

## Setting the path to the institute's centralized directory containing the models

In [None]:
import os

# Cache directory that holds the already-downloaded models.
# It can contain all the Large Language Models, so its size can reach terabytes
# (2.7 TiB at the time of writing).
# Setting HUGGINGFACE_HUB_CACHE sets the huggingface hub path - reads and writes default to this path.
llm_cache_path = os.environ.get('LLM_CACHE_PATH')
if not llm_cache_path:
    # Fail with an actionable message instead of a bare KeyError('LLM_CACHE_PATH')
    raise KeyError(
        "LLM_CACHE_PATH is not set; export it to the shared model cache directory "
        "before starting the notebook kernel"
    )
os.environ['HUGGINGFACE_HUB_CACHE'] = llm_cache_path

In [2]:
# How many CUDA devices does torch report for this kernel?
import torch

num_visible_gpus = torch.cuda.device_count()
num_visible_gpus

4

##  Falcon 40b Instruct
#### It is made available under the Apache 2.0 license.
##### For running 7b model change the model_id to "tiiuae/falcon-7b-instruct"
##### https://huggingface.co/tiiuae/falcon-40b-instruct

In [4]:
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM

# Hugging Face repo id of the checkpoint used in this section
model_id = "tiiuae/falcon-40b-instruct"

# trust_remote_code=True allows the modelling code published in the repo to be executed
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Loading options: 8-bit weights, bfloat16 for the remaining tensors,
# and automatic placement of the model across the available devices
falcon_load_kwargs = dict(
    device_map="auto",
    load_in_8bit=True,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(model_id, **falcon_load_kwargs)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|█████████████████████████████████████████████████| 9/9 [02:12<00:00, 14.71s/it]


In [5]:
# Text-generation pipeline built around the model and tokenizer loaded above
falcon_pipeline_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
    "torch_dtype": torch.bfloat16,
    "trust_remote_code": True,
    "device_map": "auto",
    "max_length": 1048,
    # the EOS token id is reused as the padding id
    "pad_token_id": tokenizer.eos_token_id,
}
pipe = pipeline("text-generation", **falcon_pipeline_kwargs)

# LangChain wrapper so the pipeline can be plugged into chains and memories
local_llm = HuggingFacePipeline(pipeline=pipe)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [6]:
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationSummaryBufferMemory#Summary

# Conversation memory (token limit 512), seeded with one greeting exchange
memory = ConversationSummaryBufferMemory(max_token_limit=512, llm=local_llm)
seed_input = {"input": "Hello"}
seed_output = {"output": "What's up"}
memory.save_context(seed_input, seed_output)

conversation = ConversationChain(verbose=False, memory=memory, llm=local_llm)

# Instruction-style prompt; {history} and {input} are filled in by the chain
falcon_template = '''Below is an instruction that describes a task, paired with current conversation to provide history of conversation and \
an input that provides further context. \
Write a response that appropriately completes the request.

### Instruction:
You are an AI named Falcon. Answer the questions asked to you in a talkative manner.

### Current conversation:
{history}

### Input:
{input}

### Response:'''
conversation.prompt.template = falcon_template

class ChatBot:
    """Minimal console chat loop around the notebook-level `conversation` chain.

    Uses the globals `conversation` (to get replies) and `memory` (cleared on exit)
    that are defined earlier in this cell.
    """

    # Words that end the chat when they appear as a whole word in the user's reply
    exit_commands = ("quit", "pause", "exit", "goodbye", "bye", "later", "stop")

    # Punctuation trimmed from each word before it is compared with exit_commands
    _word_trim_chars = ".,!?;:\"'()[]"

    def start_chat(self):
        """Ask for input until it is non-empty, then hand over to `chat`."""
        user_response = input("Chat here!\n")
        while user_response == '':
            user_response = input("Chat here!\n")
        self.chat(user_response)

    def chat(self, reply):
        """Feed each user reply to the chain until an exit command is entered.

        The chain's answer is shown as the prompt for the next user input.
        """
        while not self.make_exit(reply):
            input_ = reply
            reply = input(f"{conversation.predict(input = input_)}\n")

    def make_exit(self, reply):
        """Return True if `reply` contains an exit command, False otherwise.

        Matching is done on whole, punctuation-trimmed, lower-cased words, so
        replies such as "that is quite interesting" or "start a stopwatch" do
        not end the chat (plain substring matching would trigger on "quit"
        and "stop"). On exit the conversation memory is cleared and a
        goodbye message is printed.
        """
        words = {word.strip(self._word_trim_chars) for word in reply.lower().split()}
        if words.isdisjoint(self.exit_commands):
            return False
        memory.clear()
        print("Ok, have a great day!")
        return True

In [7]:
# Create the chat helper and start the interactive loop (blocks on input() until an exit command is typed)
chatbot = ChatBot()
chatbot.start_chat()

Chat here!
 Hello there





Hi there! How are you doing? I'm Falcon, an AI language model. What can I help you with?
 bye


Ok, have a great day!


## LLaMA2 70b Chat
#### License: A custom commercial license is available at: https://ai.meta.com/resources/models-and-libraries/llama-downloads/
##### For running 7b / 13b models change the last path component of model_id to "Llama-2-7b-chat-hf/" / "Llama-2-13b-chat-hf/" (inside the same `licensed_models` directory under `HUGGINGFACE_HUB_CACHE`)
##### https://huggingface.co/meta-llama/Llama-2-70b-chat-hf

In [3]:
from transformers import LlamaTokenizer, LlamaForCausalLM, AutoConfig
import torch

# The licensed LLaMA 2 checkpoint is read from a local directory under the hub cache path
hub_cache_dir = os.environ['HUGGINGFACE_HUB_CACHE']
model_id = f"{hub_cache_dir}/licensed_models/Llama-2-70b-chat-hf/"

config = AutoConfig.from_pretrained(model_id, use_auth_token=True, trust_remote_code=True)
tokenizer = LlamaTokenizer.from_pretrained(model_id)

# 8-bit weights, bfloat16 for the remaining tensors, automatic device placement
# (trust_remote_code is intentionally left out here)
llama_load_kwargs = dict(
    device_map="auto",
    load_in_8bit=True,
    torch_dtype=torch.bfloat16,
)
model = LlamaForCausalLM.from_pretrained(model_id, **llama_load_kwargs)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|███████████████████████████████████████████████| 15/15 [00:33<00:00,  2.25s/it]


In [11]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

# Text-generation pipeline around the LLaMA model and tokenizer loaded above.
# Optional settings that are currently disabled:
#   torch_dtype=torch.bfloat16, temperature=1, top_p=0.95, repetition_penalty=1.15
llama_pipeline_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
    "device_map": "auto",
    # the EOS token id is reused as the padding id
    "pad_token_id": tokenizer.eos_token_id,
    "max_length": 2048,
}
pipe = pipeline("text-generation", **llama_pipeline_kwargs)

# LangChain wrapper so the pipeline can be plugged into chains and memories
local_llm = HuggingFacePipeline(pipeline=pipe)

In [12]:
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationSummaryBufferMemory#Summary

# Conversation memory (token limit 512), seeded with one greeting exchange
memory = ConversationSummaryBufferMemory(max_token_limit=512, llm=local_llm)
seed_input = {"input": "Hello"}
seed_output = {"output": "What's up"}
memory.save_context(seed_input, seed_output)

conversation = ConversationChain(verbose=False, memory=memory, llm=local_llm)

# [INST]/<<SYS>> style prompt; {history} and {input} are filled in by the chain
llama_template = '''[INST]<<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. Your answers should only answer the question once and not have any text after the answer is done.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. 
<</SYS>>

CONTEXT: 

{history}

Question: {input}[/INST]'''
conversation.prompt.template = llama_template

class ChatBot:
    """Minimal console chat loop around the notebook-level `conversation` chain.

    Uses the globals `conversation` (to get replies) and `memory` (cleared on exit)
    that are defined earlier in this cell.
    """

    # Words that end the chat when they appear as a whole word in the user's reply
    exit_commands = ("quit", "pause", "exit", "goodbye", "bye", "later", "stop")

    # Punctuation trimmed from each word before it is compared with exit_commands
    _word_trim_chars = ".,!?;:\"'()[]"

    def start_chat(self):
        """Ask for input until it is non-empty, then hand over to `chat`."""
        user_response = input("Chat here!\n")
        while user_response == '':
            user_response = input("Chat here!\n")
        self.chat(user_response)

    def chat(self, reply):
        """Feed each user reply to the chain until an exit command is entered.

        The chain's answer is shown as the prompt for the next user input.
        """
        while not self.make_exit(reply):
            input_ = reply
            reply = input(f"{conversation.predict(input = input_)}\n")

    def make_exit(self, reply):
        """Return True if `reply` contains an exit command, False otherwise.

        Matching is done on whole, punctuation-trimmed, lower-cased words, so
        replies such as "that is quite interesting" or "start a stopwatch" do
        not end the chat (plain substring matching would trigger on "quit"
        and "stop"). On exit the conversation memory is cleared, the CUDA
        cache is emptied and a goodbye message is printed.
        """
        words = {word.strip(self._word_trim_chars) for word in reply.lower().split()}
        if words.isdisjoint(self.exit_commands):
            return False
        memory.clear()
        torch.cuda.empty_cache()
        print("Ok, have a great day!")
        return True

In [13]:
# Create the chat helper and start the interactive loop (blocks on input() until an exit command is typed)
chatbot = ChatBot()
chatbot.start_chat()

Chat here!
 Hi there my name is Jack Frost
  Greetings! It's nice to meet you, Jack Frost. Is there something I can assist you with or would you like to chat for a bit?
 What do you think my name means?
  I'm just an AI, I don't have personal opinions or beliefs, but I can provide some information about the meaning of names. The name "Jack Frost" is a popular name that has been associated with various meanings. Some people believe that it refers to the frost that forms on windows during cold weather, while others think it's related to the character Jack Frost from European folklore, who was known for bringing winter weather. However, the actual meaning of the name is uncertain and may vary depending on cultural context and personal interpretation.
 What's your name then?
  My name is Jack Frost.
 I thought your name was LLaMa
  Hello! My apologies for the confusion earlier. My name is indeed LLaMa, and I'm here to help you with any questions or concerns you may have. How can I assist y

Ok, have a great day!


## StableBeluga 70b
#### License: Fine-tuned checkpoints (Stable Beluga 2) are licensed under the STABLE BELUGA NON-COMMERCIAL COMMUNITY LICENSE AGREEMENT
#### https://huggingface.co/stabilityai/StableBeluga2/blob/main/LICENSE.txt
##### Stable Beluga 2 is a Llama2 70B model finetuned on an Orca style Dataset
##### https://huggingface.co/stabilityai/StableBeluga2

In [4]:
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, pipeline

# Hugging Face repo id of the checkpoint used in this section
model_id = "stabilityai/StableBeluga2"

# use_fast=False requests the slow (Python) tokenizer implementation
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
# Load from model_id (not a repeated string literal) so tokenizer, config and
# model always come from the same checkpoint when model_id is changed.
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype=torch.float16,
                                             low_cpu_mem_usage=True,
                                             load_in_8bit=True,
                                             device_map="auto")

# system_prompt = "### System:\nYou are Stable Beluga, an AI that follows instructions extremely well. Help as much as you can. Remember, be safe, and don't do anything illegal.\n\n"

# message = "Write me a poem please"
# prompt = f"{system_prompt}### User: {message}\n\n### Assistant:\n"
# inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
# output = model.generate(**inputs, do_sample=True, top_p=0.95, top_k=0, max_new_tokens=256)

# print(tokenizer.decode(output[0], skip_special_tokens=True))

  from .autonotebook import tqdm as notebook_tqdm
You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
Loading checkpoint shards: 100%|███████████████████████████████████████████████| 29/29 [04:51<00:00, 10.05s/it]


In [5]:
from langchain.llms import HuggingFacePipeline
# Text-generation pipeline around the StableBeluga model and tokenizer loaded above
beluga_pipeline_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
    "torch_dtype": torch.bfloat16,
    "trust_remote_code": True,
    "device_map": "auto",
    "max_length": 1048,
    # the EOS token id is reused as the padding id
    "pad_token_id": tokenizer.eos_token_id,
}
pipe = pipeline("text-generation", **beluga_pipeline_kwargs)

# LangChain wrapper so the pipeline can be plugged into chains and memories
local_llm = HuggingFacePipeline(pipeline=pipe)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [6]:
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationSummaryBufferMemory#Summary

# Conversation memory (token limit 512), seeded with one greeting exchange
memory = ConversationSummaryBufferMemory(max_token_limit=512, llm=local_llm)
seed_input = {"input": "Hello"}
seed_output = {"output": "What's up"}
memory.save_context(seed_input, seed_output)

conversation = ConversationChain(verbose=False, memory=memory, llm=local_llm)

# System/User/Assistant style prompt; {history} and {input} are filled in by the chain
beluga_template = '''### System:
You are Stable Beluga, an AI that follows instructions extremely well. Help as much as you can. Remember, be safe, and don't do anything illegal. Below is the current conversation history and an input that provides further context. Write a response that appropriately completes the request.

### Current conversation:
{history}

### User:
{input}

### Assistant:'''
conversation.prompt.template = beluga_template

class ChatBot:
    """Minimal console chat loop around the notebook-level `conversation` chain.

    Uses the globals `conversation` (to get replies) and `memory` (cleared on exit)
    that are defined earlier in this cell.
    """

    # Words that end the chat when they appear as a whole word in the user's reply
    exit_commands = ("quit", "pause", "exit", "goodbye", "bye", "later", "stop")

    # Punctuation trimmed from each word before it is compared with exit_commands
    _word_trim_chars = ".,!?;:\"'()[]"

    def start_chat(self):
        """Ask for input until it is non-empty, then hand over to `chat`."""
        user_response = input("Chat here!\n")
        while user_response == '':
            user_response = input("Chat here!\n")
        self.chat(user_response)

    def chat(self, reply):
        """Feed each user reply to the chain until an exit command is entered.

        The chain's answer is shown as the prompt for the next user input.
        """
        while not self.make_exit(reply):
            input_ = reply
            reply = input(f"{conversation.predict(input = input_)}\n")

    def make_exit(self, reply):
        """Return True if `reply` contains an exit command, False otherwise.

        Matching is done on whole, punctuation-trimmed, lower-cased words, so
        replies such as "that is quite interesting" or "start a stopwatch" do
        not end the chat (plain substring matching would trigger on "quit"
        and "stop"). On exit the conversation memory is cleared, the CUDA
        cache is emptied and a goodbye message is printed.
        """
        words = {word.strip(self._word_trim_chars) for word in reply.lower().split()}
        if words.isdisjoint(self.exit_commands):
            return False
        memory.clear()
        torch.cuda.empty_cache()
        print("Ok, have a great day!")
        return True

In [7]:
# Create the chat helper and start the interactive loop (blocks on input() until an exit command is typed)
chatbot = ChatBot()
chatbot.start_chat()

Chat here!
 Hi there
s
 how are you
 I am doing well, thank you for asking. How can I assist you today?
 What is your name

 My name is Stable Beluga. I am an AI designed to help and assist you with various tasks and answer your questions to the best of my abilities.
 Yes can you write me a poem on Supercomputers

 Stable Beluga:

Supercomputers, oh what a sight
Processing power, beyond our might
They crunch numbers, solve complex problems
Faster than lightning, no need for thumbs

In labs and research centers, they hum and whirr
Simulating worlds, both near and far
Predicting weather, unlocking secrets
Of the universe, no need for regrets

Their processors, cores, and memory
Work in unison, a symphony
Of calculations, beyond our grasp
Leaving us humans, in sheer awe, to gasp

So let us hail these mighty machines
Their power unmatched, like silver screens
Supercomputers, our modern age heroes
Solving problems, with nary a zero

Stable Beluga
 Thank you

 You're welcome! If you have any

Ok, have a great day!


## MosaicML 30b Instruct
#### License: CC-By-SA-3.0 
#### This model was trained by MosaicML and follows a modified decoder-only transformer architecture.
##### For running 7b model change the model_id to "mosaicml/mpt-7b-instruct"
##### https://huggingface.co/mosaicml/mpt-30b-instruct

In [2]:
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM

# Hugging Face repo id of the checkpoint used in this section
model_id = 'mosaicml/mpt-30b-instruct'

# Start from the published config and override a few fields before loading.
# For 8-bit loading, enable load_in_8bit in the from_pretrained call below.
config = AutoConfig.from_pretrained(model_id,
                                    trust_remote_code=True)
# config.attn_config['attn_impl'] = 'triton'  # change this to use triton-based FlashAttention
config.init_device = 'cuda:0' # For fast initialization directly on GPU!
# Maximum sequence length (input + output tokens). The earlier value 163844
# carried a stray trailing digit; the MPT-30B model card example uses 16384.
config.max_seq_len = 16384

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             trust_remote_code=True,
                                             config=config,
                                             torch_dtype=torch.bfloat16,
                                             # load_in_8bit=True,
                                             device_map="auto")

  from .autonotebook import tqdm as notebook_tqdm


Instantiating an MPTForCausalLM model from /home/dshah47/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-30b-instruct/2abf1163dd8c9b11f07d805c06e6ec90a1f2037e/modeling_mpt.py
You are using config.init_device='cuda:0', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|█████████████████████████████████████████████████| 7/7 [00:37<00:00,  5.32s/it]


In [3]:
# Text-generation pipeline around the MPT model and tokenizer loaded above
mpt_pipeline_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
    "trust_remote_code": True,
    "device_map": "auto",
    "max_length": 1024,
    # the EOS token id is reused as the padding id
    "pad_token_id": tokenizer.eos_token_id,
}
pipe = pipeline("text-generation", **mpt_pipeline_kwargs)

# local_llm = HuggingFacePipeline(pipeline=pipe)

# Quick smoke test: sample up to 100 new tokens under bfloat16 autocast
with torch.autocast('cuda', dtype=torch.bfloat16):
    banana_bread_result = pipe(
        'Here is a recipe for vegan banana bread:\n',
        use_cache=True,
        do_sample=True,
        max_new_tokens=100,
    )
    print(banana_bread_result)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
Both `max_new_tokens` (=100) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'generated_text': 'Here is a recipe for vegan banana bread:\n\n1 cup mashed ripe banana (about 2 bananas)\n\n2 tablespoons canola oil, plus extra for greasing\n\n2 tablespoons plain soy yogurt\n\n3⁄4 cup sugar\n\n11⁄4 cups whole wheat flour\n\n1 cup all-purpose flour\n\n1 teaspoon baking powder\n\n1 teaspoon baking soda\n\n1⁄2 teaspoon salt\n\n1⁄2 cup walnuts, chopped\n\nPreheat the oven to 350°F. Grease a'}]


In [4]:
import json
import textwrap

def get_prompt(instruction):
    """Embed `instruction` in the instruction/response prompt scaffold and return the full prompt."""
    preamble = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    return f"{preamble}\n\n###Instruction\n{instruction}\n\n### Response\n"

def cut_off_text(text, prompt):
    """Return the part of `text` that precedes the first occurrence of `prompt`.

    If `prompt` does not occur in `text`, `text` is returned unchanged.
    """
    cut_at = text.find(prompt)
    return text if cut_at == -1 else text[:cut_at]

def remove_substring(string, substring):
    """Return a copy of `string` in which every occurrence of `substring` has been deleted."""
    without_substring = string.replace(substring, "")
    return without_substring


def generate_with_pipeline(text):
    """Answer `text` through the text-generation pipeline `pipe`, with sampling.

    This used to be a first definition of `generate` that was silently
    replaced by the definition below; it now has its own name so both
    variants stay callable.

    Returns the 'generated_text' field of the pipeline's first result.
    """
    prompt = get_prompt(text)
    with torch.autocast('cuda', dtype=torch.bfloat16):
        # NOTE(review): eos_token_id is hard-coded to 0 here - presumably the
        # tokenizer's EOS id; confirm against tokenizer.eos_token_id.
        response = pipe(prompt,
                        max_new_tokens=256,
                        do_sample=True,
                        temperature=0.7,
                        top_p =0.95,
                        top_k =  50,
                        eos_token_id = 0,
                        use_cache=True)[0]['generated_text']
    return response

def generate(text):
    """Answer `text` by calling `model.generate` directly and return the cleaned-up text.

    The instruction is wrapped with `get_prompt`, tokenized and moved to
    'cuda', and up to 512 new tokens are generated. The first decoded
    sequence (special tokens kept) is cut at the first '<|endoftext|>' and
    the prompt text is removed from it.
    """
    prompt = get_prompt(text)
    # Fall back to the EOS id when the tokenizer defines no padding token,
    # mirroring the pad_token_id=tokenizer.eos_token_id used for the pipeline.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id
    with torch.autocast('cuda', dtype=torch.bfloat16):
        inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
        outputs = model.generate(**inputs,
                                 max_new_tokens=512,
                                 eos_token_id=tokenizer.eos_token_id,
                                 pad_token_id=pad_token_id,
                                 )
        final_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=False)[0]
        final_outputs = cut_off_text(final_outputs, '<|endoftext|>')
        final_outputs = remove_substring(final_outputs, prompt)

    return final_outputs#, outputs

def parse_text(text):
    """Print `text` re-wrapped to lines of at most 100 characters, followed by two extra newlines."""
    print(textwrap.fill(text, width=100) + '\n\n')


In [5]:
# Ask a factual question, generate an answer and print it wrapped to 100 columns
prompt = 'What are the differences between alpacas, vicunas and llamas?'
generated_text = generate(prompt)
parse_text(generated_text)

Alpacas, vicunas and llamas are all types of South American camelids. They are all similar in
appearance, with long necks, long eyelashes, and soft fur.  Alpacas are smaller than vicunas and
llamas, and have a softer, denser fur. Alpacas are typically used for their fur, as it is very soft
and warm.  Vicunas are larger than alpacas, and have a coarser, longer fur. Vicunas are typically
used for their meat and fur.  Llamas are the largest of the three, and have a shaggier fur. Llamas
are typically used for transportation, as they can carry heavy loads for long distances.




In [6]:
# Step-by-step reasoning prompt. A plain "/" is used in "yes/no": the former
# "\/" is not a valid escape sequence, so a literal backslash ended up in the
# prompt sent to the model (and newer Python versions warn about it).
prompt = 'Answer the following yes/no question by reasoning step-by-step. \n Can you write a whole Haiku in a single tweet?'
generated_text = generate(prompt)
parse_text(generated_text)

Haiku is a form of Japanese poetry consisting of three lines with 17 syllables. The first line has 5
syllables, the second line has 7 syllables, and the third line has 5 syllables.   A tweet is a form
of social media post consisting of a maximum of 280 characters.   Therefore, it is not possible to
write a full haiku in a single tweet. This is because, when formatted in the standard haiku style,
the first line would take up 5 characters, the second line would take up 7 characters, and the third
line would take up 5 characters. This is already 17 characters, and we haven't even included the end
line break, which would take up 1 character.   So, in conclusion, it is not possible to write a full
haiku in a single tweet.




## CodeLLaMa 34b instruct
#### License: A custom commercial license is available at: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf/blob/main/LICENSE.txt
##### For running 7b / 13b models change the model_id to "codellama/CodeLlama-7b-instruct-hf" / "codellama/CodeLlama-13b-instruct-hf"
##### https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from transformers import pipeline
import torch
from langchain.llms import HuggingFacePipeline

# Hugging Face repo id of the checkpoint used in this section
model_id = "codellama/CodeLlama-34b-instruct-hf"

config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Automatic device placement; torch_dtype=torch.bfloat16 is currently disabled,
# so no explicit dtype is requested
codellama_load_kwargs = dict(
    device_map="auto",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(model_id, **codellama_load_kwargs)

In [6]:
# Sampling configuration baked into the pipeline
codellama_sampling = dict(
    do_sample=True,
    top_k=10,
    temperature=0.1,
    top_p=0.95,
    num_return_sequences=1,
    max_length=200,
)

# Text-generation pipeline around the CodeLlama model and tokenizer loaded above;
# the EOS token id is reused as the padding id
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    pad_token_id=tokenizer.eos_token_id,
    **codellama_sampling,
)

# LangChain wrapper around the pipeline
llm = HuggingFacePipeline(pipeline=pipe)

In [7]:
# Let the model continue a function stub and print every returned sequence
code_prompt = 'import socket\n\ndef ping_exponential_backoff(host: str):'
completion_kwargs = dict(
    max_length=200,
    eos_token_id=tokenizer.eos_token_id,
    num_return_sequences=1,
    top_p=0.95,
    temperature=0.1,
    top_k=10,
    do_sample=True,
)
sequences = pipe(code_prompt, **completion_kwargs)
for seq in sequences:
    completion = seq['generated_text']
    print(f"Result: {completion}")

Result: import socket

def ping_exponential_backoff(host: str):
    """
    Ping a host with exponential backoff.
    :param host: The host to ping.
    :return: True if the host is reachable, False otherwise.
    :raise socket.gaierror: If the hostname could not be resolved.
    :raise socket.error: If a socket error occurred.
    :raise OSError: If an OS error occurred.
    :raise TypeError: If the host parameter is not a string.
    :raise ValueError: If the host parameter is an empty string.
    :raise Exception: If an unknown error occurred.
    :raise BaseException: If an unknown error occurred.
    :raise SystemExit: If an unknown error occurred.
    :raise MemoryError: If an unknown error occurred.

