Installing Dependencies

In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
!pip install accelerate transformers

Looking in indexes: https://download.pytorch.org/whl/cu117
Collecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from accelerate)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting s

Defining the Sampling Functions (top-k, top-p, and combined)

In [2]:
import torch


def top_k_sampling(logits, k):
    """
    Sample one token id from the ``k`` highest-scoring entries of ``logits``.

    The ``k`` largest logits are re-normalised with a softmax and a single
    candidate is drawn from that distribution.

    :param logits: unnormalised scores, one per vocabulary entry
        (expected to be a 1-D tensor so the result is a single index).
    :param k: number of candidates to keep. Values larger than the number of
        entries are clamped, because ``torch.topk`` raises when ``k`` exceeds
        the size of the dimension.
    :return: the sampled position in ``logits`` as a Python ``int``.
    :raises ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")
    k = min(int(k), logits.size(-1))

    top_k = torch.topk(logits, k)
    probabilities = torch.softmax(top_k.values, dim=-1)
    # `choice` indexes into the top-k candidates, not into the vocabulary.
    choice = torch.multinomial(probabilities, num_samples=1)
    return int(top_k.indices[choice])


def top_p_sampling(logits, p):
    """
    Nucleus (top-p) sampling: draw a token from the smallest set of
    highest-probability tokens whose cumulative probability exceeds ``p``.

    The previous implementation wrote the 0 / -1e10 removal mask *over* the
    logits, which made every kept token equally likely and modified the
    caller's tensor in place. This version keeps the real probabilities of
    the kept tokens and leaves ``logits`` untouched.

    :param logits: unnormalised scores, one per vocabulary entry
        (expected to be a 1-D tensor so the result is a single index).
    :param p: cumulative probability threshold of the nucleus.
    :return: the sampled position in ``logits`` as a Python ``int``.
    """
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    # Work in float32 so half-precision inputs do not lose the tail of the cumsum.
    sorted_probs = torch.softmax(sorted_logits.float(), dim=-1)
    cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

    outside_nucleus = cumulative_probs > p
    # Shift the mask right by one so the token that crosses the threshold is kept,
    # and always keep the most likely token.
    outside_nucleus[..., 1:] = outside_nucleus[..., :-1].clone()
    outside_nucleus[..., 0] = False

    # torch.multinomial accepts unnormalised weights, so zeroing is enough.
    nucleus_probs = sorted_probs.masked_fill(outside_nucleus, 0.0)
    choice = torch.multinomial(nucleus_probs, num_samples=1)
    # `choice` is a position in the sorted order; map it back to a vocabulary index.
    return int(sorted_indices[choice])


def top_k_p_sampling(logits, k, p):
    """
    Combined top-k / top-p sampling: restrict to the ``k`` best tokens, then
    apply nucleus filtering with threshold ``p`` inside that set and sample.

    The previous implementation replaced the candidate logits with the
    0 / -1e10 removal mask, so every surviving candidate was equally likely.
    This version samples from the candidates' real probabilities.

    :param logits: unnormalised scores, one per vocabulary entry
        (expected to be a 1-D tensor so the result is a single index).
    :param k: number of candidates to keep; clamped to the number of entries.
    :param p: cumulative probability threshold applied within the top-k set.
    :return: the sampled position in ``logits`` as a Python ``int``.
    :raises ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")
    k = min(int(k), logits.size(-1))

    # Top-k step. sorted=True returns the candidates in descending order,
    # so no additional sort is needed for the cumulative sum below.
    top_k = torch.topk(logits, k, sorted=True)
    candidate_probs = torch.softmax(top_k.values.float(), dim=-1)

    # Top-p step on the top-k candidates.
    cumulative_probs = torch.cumsum(candidate_probs, dim=-1)
    outside_nucleus = cumulative_probs > p
    # Shift right by one so the candidate that crosses the threshold is kept,
    # and always keep the best candidate.
    outside_nucleus[..., 1:] = outside_nucleus[..., :-1].clone()
    outside_nucleus[..., 0] = False

    # torch.multinomial accepts unnormalised weights, so zeroing is enough.
    nucleus_probs = candidate_probs.masked_fill(outside_nucleus, 0.0)
    choice = torch.multinomial(nucleus_probs, num_samples=1)
    # `choice` indexes the candidate list; map it back to a vocabulary index.
    return int(top_k.indices[choice])

Response Generation and Memory Handling

In [3]:
import torch
# Module-level store: process_chat appends the final key/value cache of every
# generation that runs to completion.
# NOTE(review): nothing in this notebook reads it back, so the list grows for
# the lifetime of the kernel — clear it if memory becomes a problem.
memory_id = []



def process_chat(model, tokenizer, device, params):
    """
    Generate a continuation of ``params["prompt"]`` token by token and stream it.

    This is a generator. After each generated token it yields the decoded text
    of the prompt plus everything generated so far. Generation ends when a stop
    token id is produced, a stop string appears in the generated part, or
    ``max_new_tokens`` tokens have been generated.

    Keys read from ``params`` (default in parentheses):
      prompt (required), temperature (1.0), max_new_tokens (256),
      context_len (1024), stop_strs (None), stop_ids ([eos token id]),
      force_set_bos_token_id (None), force_set_eos_token_id (None),
      use_top_k_sampling (False), use_top_p_sampling (False),
      top_k_value (10), top_p_value (0.9), use_bos_for_input (False).

    :param model: callable accepting ``input_ids``, ``use_cache`` and
        ``past_key_values``; its result must expose ``logits`` and ``past_key_values``.
    :param tokenizer: callable returning an object with ``input_ids``; must also
        provide ``decode``, ``bos_token_id`` and ``eos_token_id``.
    :param device: device name used for the input tensors ("mps" gets a CPU fallback for sampling).
    :param params: generation settings, see above.
    :return: generator of decoded text snapshots.
    """
    stream_interval = 1  # decode and yield after every generated token

    prompt = params["prompt"]

    temperature = float(params.get("temperature", 1.0))
    max_new_tokens = int(params.get("max_new_tokens", 256))
    context_len = int(params.get("context_len", 1024))
    stop_strs = params.get("stop_strs", None)
    force_set_bos_token_id = params.get("force_set_bos_token_id", None)
    force_set_eos_token_id = params.get("force_set_eos_token_id", None)
    use_top_k_sampling = params.get("use_top_k_sampling", False)
    use_top_p_sampling = params.get("use_top_p_sampling", False)
    top_k_value = params.get("top_k_value", 10)
    top_p_value = params.get("top_p_value", 0.9)

    use_bos_for_input = params.get("use_bos_for_input", False)

    if force_set_bos_token_id:
        # patch for open_llama_7b_preview_300bt
        tokenizer.bos_token_id = force_set_bos_token_id

    if force_set_eos_token_id:
        # patch for open_llama_7b_preview_300bt
        stop_token_ids = params.get("stop_ids", [force_set_eos_token_id])
    else:
        stop_token_ids = params.get("stop_ids", [tokenizer.eos_token_id])

    # Character offset from which stop strings are searched, so that stop
    # strings inside the prompt itself are ignored.
    l_prompt = len(prompt)
    if use_bos_for_input:
        input_ids = [tokenizer.bos_token_id] + tokenizer(prompt).input_ids
        l_prompt -= len(tokenizer.decode([tokenizer.bos_token_id]))
    else:
        input_ids = tokenizer(prompt).input_ids

    # The decoded output always contains the full prompt, even if the model
    # only sees its truncated tail.
    output_token_ids = list(input_ids)

    # Keep only the most recent prompt tokens that fit the context window.
    # Clamp to at least 1: a negative budget would slice from the front of the
    # prompt instead of keeping its tail.
    max_src_len = max(context_len - max_new_tokens - 8, 1)
    input_ids = input_ids[-max_src_len:]

    # Pre-bind so the bookkeeping after the loop is safe when max_new_tokens == 0.
    past_key_values = None
    token_id = None

    with torch.no_grad():
        for i in range(max_new_tokens):
            if i == 0:
                # First step: feed the whole (truncated) prompt.
                out = model(input_ids=torch.as_tensor([input_ids], device=device), use_cache=True)
            else:
                # Later steps: feed only the newest token and reuse the cache.
                out = model(
                    input_ids=torch.as_tensor([[token_id]], device=device),
                    use_cache=True,
                    past_key_values=past_key_values,
                )
            logits = out.logits
            past_key_values = out.past_key_values

            last_token_logits = logits[0][-1]

            if device == "mps":
                last_token_logits = last_token_logits.float().to("cpu")

            if temperature < 1e-4:
                # Temperature ~0 means greedy decoding.
                token_id = int(torch.argmax(last_token_logits))
            else:
                # Adjust with Softmax with temperature
                # very nice article below
                # https://shivammehta25.github.io/posts/temperature-in-language-models-open-ai-whisper-probabilistic-machine-learning/
                # The scaled copy is what every sampling strategy sees, so the
                # temperature setting also applies to top-k / top-p sampling.
                scaled_logits = last_token_logits / temperature

                if use_top_k_sampling and use_top_p_sampling:
                    token_id = top_k_p_sampling(scaled_logits, top_k_value, top_p_value)
                elif use_top_k_sampling:
                    token_id = top_k_sampling(scaled_logits, top_k_value)
                elif use_top_p_sampling:
                    token_id = top_p_sampling(scaled_logits, top_p_value)
                else:
                    probabilities = torch.softmax(scaled_logits, dim=-1)
                    token_id = int(torch.multinomial(probabilities, num_samples=1))

            output_token_ids.append(token_id)

            stopped = token_id in stop_token_ids

            if i % stream_interval == 0 or i == max_new_tokens - 1 or stopped:
                output = tokenizer.decode(output_token_ids, skip_special_tokens=True)

                if stop_strs:
                    for stop_str in stop_strs:
                        if stop_str:
                            # Search only the generated part (from l_prompt onwards).
                            pos = output.rfind(stop_str, l_prompt)
                            if pos != -1:
                                output = output[:pos]
                                stopped = True

                yield output

            if stopped:
                break

    # Reached only when the generator is run to completion.
    if past_key_values is not None:
        memory_id.append(past_key_values)

Loading the Model onto the GPU

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


def load_hf_model(model_path: str, device: str = "cuda", num_gpus: int = None, max_gpu_memory: str = None):
    """
    Load a causal language model and its tokenizer for the requested device.

    Loading options by device:
      * "cpu":  library defaults.
      * "cuda": float16 weights.
          - num_gpus is None: treated as one GPU, placed with device_map="auto".
          - num_gpus == 1:    loaded normally, then moved to the device.
          - num_gpus > 1:     sharded; the per-GPU budget is ``max_gpu_memory``
            if given, otherwise 85% of each GPU's currently available memory.
      * "mps":  float16 weights, then moved to the device.

    :param model_path: model name or path passed to ``from_pretrained``.
    :param device: "cpu", "cuda" or "mps".
    :param num_gpus: number of GPUs to use (only read for "cuda").
    :param max_gpu_memory: per-GPU memory budget such as "13GiB" (only read when num_gpus > 1).
    :return: tuple ``(model, tokenizer, device)``.
    :raises ValueError: if ``device`` is not one of the supported names.
    """
    if device == "cpu":
        # When using Redpajama-Incite for CPU-based inference,
        # bfloat16 was recommended, but I thought it was faster to specify no bfloat16.
        kwargs = {}  # "torch_dtype": torch.bfloat16}
    elif device == "cuda":
        kwargs = {"torch_dtype": torch.float16}
        if num_gpus is None:
            num_gpus = 1
            kwargs["device_map"] = "auto"
        elif num_gpus > 1:
            if max_gpu_memory is None:
                kwargs["device_map"] = "sequential"
                # The helper caps its result at the number of visible GPUs, so
                # iterate over what it returned rather than over range(num_gpus).
                available_gpu_memory_list = get_available_gpu_memory_list(num_gpus)
                # for example
                # max_memory = { 0: "8GiB", 1: "10GiB", 2: "6GiB", 3: "13GiB" }
                kwargs["max_memory"] = {
                    gpu_id: str(int(available * 0.85)) + "GiB"
                    for gpu_id, available in enumerate(available_gpu_memory_list)
                }
            else:
                kwargs["device_map"] = "auto"
                kwargs["max_memory"] = {gpu_id: max_gpu_memory for gpu_id in range(num_gpus)}
    elif device == "mps":
        kwargs = {"torch_dtype": torch.float16}
    else:
        raise ValueError(f"Invalid device: {device}")

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path, offload_folder="offload", offload_state_dict=True,
                                                 **kwargs)

    # A model loaded with a device_map has already been placed (and possibly
    # offloaded) by the loader; only move it manually when no device_map was used.
    if "device_map" not in kwargs and device in ("cuda", "mps"):
        model.to(device)
    return model, tokenizer, device


def get_available_gpu_memory_list(max_gpus=None):
    """
    Report, per GPU, total device memory minus the memory currently allocated
    through torch by this process, in GiB.

    :param max_gpus: upper bound on the number of GPUs to inspect;
        None inspects every GPU torch can see.
    :return: list of floats, one entry per inspected GPU, ordered by GPU index.
    """
    bytes_per_gib = 1024 ** 3
    visible_gpus = torch.cuda.device_count()
    gpu_limit = visible_gpus if max_gpus is None else min(max_gpus, visible_gpus)

    free_gib_per_gpu = []
    for gpu_index in range(gpu_limit):
        # Make this GPU the current device so the queries below refer to it.
        with torch.cuda.device(gpu_index):
            properties = torch.cuda.get_device_properties(torch.cuda.current_device())
            total_gib = properties.total_memory / bytes_per_gib
            allocated_gib = torch.cuda.memory_allocated() / bytes_per_gib
            free_gib_per_gpu.append(total_gib - allocated_gib)
    return free_gib_per_gpu

Chat Prompt Builder

In [5]:
class ChatContent:
    """
    One turn of a conversation: the speaker's role name and the message text.
    """

    def __init__(self, role: str, msg: str = ""):
        self.role, self.message = role, msg

    def get_role(self):
        """
        :return: the role name of the speaker of this turn
        """
        return self.role

    def get_message(self):
        """
        :return: the message text of this turn
        """
        return self.message


class ChatPrompt:
    """
    Builder that assembles a chat prompt string from a system text and a
    sequence of requester / responder turns.
    """

    def __init__(self):
        self.system = ""
        self.requester = ""
        self.responder = ""
        self.chat_contents = []        # every turn, in conversation order
        self.requester_messages = []   # turns whose role is the requester
        self.responder_messages = []   # turns whose role is the responder

    def set_system(self, system):
        """
        Set the initial "system" text placed at the start of the prompt.
        :param system:
        :return:
        """
        self.system = system

    def set_requester(self, requester):
        """
        Set the role name used for the requester (=user).
        :param requester:
        :return:
        """
        self.requester = requester

    def set_responder(self, responder):
        """
        Set the role name used for the responder (=AI).
        :param responder:
        :return:
        """
        self.responder = responder

    def add_requester_msg(self, message):
        """
        Append a turn spoken by the requester.
        :param message:
        :return:
        """
        self._add_msg(ChatContent(role=self.requester, msg=message))

    def add_responder_msg(self, message):
        """
        Append a turn spoken by the responder.
        :param message:
        :return:
        """
        self._add_msg(ChatContent(role=self.responder, msg=message))

    def set_responder_last_msg(self, message):
        """
        Replace the text of the most recent responder turn.
        :param message:
        :return:
        """
        latest_responder_turn = self.responder_messages[-1]
        latest_responder_turn.message = message

    def get_requester_last_msg(self):
        """
        Retrieve the text of the most recent requester turn.
        :return:
        """
        latest_requester_turn = self.requester_messages[-1]
        return latest_requester_turn.message

    def _add_msg(self, msg):
        """
        Record a turn in the overall history and in the per-role list.
        The responder role is checked first; a turn with any other role is
        only kept in the overall history.
        :param msg:
        :return:
        """
        self.chat_contents.append(msg)
        if msg.role == self.responder:
            self.responder_messages.append(msg)
            return
        if msg.role == self.requester:
            self.requester_messages.append(msg)

    def is_requester_role(self, role):
        """
        Tell whether the given role name is the requester's role name.
        :param role:
        :return:
        """
        return bool(self.requester == role)

    def get_skip_len(self):
        """
        Get the length to skip (the text already entered as a prompt).
        :return:
        """
        return len(self.create_prompt())

    def get_stop_strs(self):
        """
        Strings that end generation when they show up in the output.
        :return:
        """
        end_of_text = '<|endoftext|>'
        # Safety stop valve when the model generates not only AI conversations but also human parts of the conversation.
        next_role_marker = '\n<'
        return [end_of_text, next_role_marker]

    def create_prompt(self):
        """
        Build the prompt text: the system text followed by every turn.
        A turn with text becomes "<role>: <text>" plus a newline; a turn without
        text becomes "<role>:" so the model continues from there.
        Turns with an empty role are left out.
        :return:
        """
        prompt_text = self.system
        for turn in self.chat_contents:
            role_name = turn.get_role()
            if not role_name:
                continue
            turn_text = turn.get_message()
            prompt_text += (role_name + ": " + turn_text + "\n") if turn_text else (role_name + ":")

        return prompt_text


# portable UT
if False:
    # Disabled self-check of ChatPrompt; change the condition to run it.
    chatPrompt = ChatPrompt()
    chatPrompt.set_requester("<human>")
    chatPrompt.set_responder("<bot>")

    chatPrompt.add_requester_msg("Who is Alan Turing")
    # An empty responder turn leaves the prompt open at "<bot>:".
    chatPrompt.add_responder_msg(None)

    assert "<human>: Who is Alan Turing\n<bot>:" == chatPrompt.create_prompt()

Loading the Trained Model

In [6]:
!pip install -Uqq  git+https://github.com/huggingface/peft.git
from peft import PeftModel, PeftConfig

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone


In [9]:
  # Location of the saved LoRA adapter weights. The "/content/drive" prefix looks like a
  # Google Drive folder mounted in Colab — adjust the path for your own environment.
  peft_model_id = '/content/drive/MyDrive/All models/llama(hiv)'

  # The adapter is not attached here: the base model is loaded in the main cell,
  # which then makes this call itself.
  # model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0})

Main function

In [10]:
from transformers import set_seed


# Fix seed value for verification.
seed_value = 42
set_seed(seed_value)

model_path = "togethercomputer/RedPajama-INCITE-Chat-3B-v1"

# Load the base model, then attach the fine-tuned LoRA adapter on top of it.
model, tokenizer, device = load_hf_model(model_path=model_path, device="cuda")  # Works if "cpu", but it's slow.
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0})
chatPrompt = ChatPrompt()

chatPrompt.set_requester("<human>")
chatPrompt.set_responder("<bot>")

chat_mode = True  # You can do multi-round chats while keeping context.

# Interactive loop: read a line, stream the reply, repeat until the user types "exit".
while True:
    user_input = input("YOU: ")
    if user_input.lower() == "exit":
        break

    if chat_mode:
        chatPrompt.add_requester_msg(user_input)
        # Empty responder turn: the prompt ends with "<bot>:" for the model to continue.
        chatPrompt.add_responder_msg(None)
        prompt = chatPrompt.create_prompt()
        stop_strs = chatPrompt.get_stop_strs()
        # Each streamed snapshot starts with the prompt; this is how much to cut off.
        # The prompt does not change while the reply is generated, so compute it once.
        skip_len = len(prompt)
    else:
        prompt = user_input
        stop_strs = None
        skip_len = 0

    params = {
        "prompt": prompt,
        "temperature": 0.7,
        "max_new_tokens": 100,
        "context_len": 1024,
        "use_top_k_sampling": True,
        "top_k_value": 50,
        "use_top_p_sampling": True,
        "top_p_value": 0.7,
        "stop_strs": stop_strs,
    }

    generator = process_chat(model, tokenizer, device, params)

    prev = ""
    # Pre-bind so the bookkeeping after the loop works even if nothing is generated.
    response_text = ""

    for index, response_text in enumerate(generator):

        if index == 0:
            print("AI : ", end="", flush=True)

        if chat_mode:
            response_text = response_text[skip_len:].strip()

        # Print only what was added since the previous snapshot.
        updated_text = response_text[len(prev):]

        print(updated_text, end="", flush=True)

        prev = response_text

    print()

    if chat_mode:
        # Store the finished reply so the next round's prompt includes it.
        chatPrompt.set_responder_last_msg(response_text.strip())


Downloading (…)okenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/5.69G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

YOU: what is HIV?
AI : HIV is a virus that infects the immune system. HIV destroys immune system cells and makes copies of itself in their place. This damages the immune system and leaves people with HIV vulnerable to a wide range of infections and illnesses. The virus usually takes many years to cause the most serious effects of HIV. The immune system can usually fight off some infections but HIV weakens the immune system so infections can take hold and lead to severe illnesses.
YOU: what are the main symptoms of this disease?
AI : The main symptoms of HIV are flu-like symptoms such as aching muscles, joint pain, chills, and fever. If not treated, HIV can cause a range of symptoms, such as swollen lymph nodes, weight loss, liver problems, eye infections, and pneumonia. The best way to prevent HIV is to avoid sexual contact and avoid sharing needles for drug use. The best way to treat HIV is to take anti-HIV medications called combination antiretroviral therapy (cART) daily for the
YOU