In [1]:
class color:
    """ANSI escape sequences for colouring and styling terminal text.

    Prefix a string with one of these constants and append ``END`` to reset
    the terminal back to its default rendering.
    """

    # Foreground colours.
    PURPLE = "\x1b[95m"
    CYAN = "\x1b[96m"
    DARKCYAN = "\x1b[36m"
    BLUE = "\x1b[94m"
    GREEN = "\x1b[92m"
    YELLOW = "\x1b[93m"
    RED = "\x1b[91m"

    # Text attributes.
    BOLD = "\x1b[1m"
    UNDERLINE = "\x1b[4m"

    # Reset every colour/attribute set above.
    END = "\x1b[0m"

In [25]:
import json
import os

import torch
import transformers
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
from transformers import (
    LlamaForCausalLM,
    pipeline,
    AutoTokenizer,
    LlamaModel,
    LlamaConfig, Qwen2VLForConditionalGeneration
)

In [None]:
# from_pretrained() needs the checkpoint path (or hub id) as its first
# argument; calling it with no arguments raises TypeError and stops a
# "Restart & Run All" at this cell. No Qwen2-VL checkpoint location is
# configured in this notebook, so only reference the loader here instead of
# calling it. To actually load the model, pass the checkpoint path:
#   Qwen2VLForConditionalGeneration.from_pretrained(<checkpoint path>)
Qwen2VLForConditionalGeneration.from_pretrained

In [3]:
# Checkpoint directory used by the tokenizer/model/pipeline cells below.
# Set the LLAMA3_MODEL_DIR environment variable to point at a different
# location without editing the notebook; when it is unset or empty the
# original hard-coded path is used, so existing setups keep working.
model_dir = os.environ.get("LLAMA3_MODEL_DIR") or "/data/Models/llama3_8B_Base"

In [11]:
# Load the tokenizer stored alongside the checkpoint in `model_dir`.
# NOTE(review): `truncation` is normally a per-call tokenizer argument; passing
# it to from_pretrained() may have no effect — confirm before relying on it.
tokenizer = AutoTokenizer.from_pretrained(model_dir, truncation=True)

# Pad on the left and reuse the EOS token as the padding token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Earlier sample prompt, kept for reference:
# tokenizer("Microservice Resource Provision is Improtant to Quality of Service and Energy Conservation")

# Tokenize one sample prompt; the encoding is displayed as the cell output.
sample_prompt = "Please clarify the importantce of diverse resource provision policies on microservice quality of service and run-time energy conservation."
tokenizer(sample_prompt)

{'input_ids': [128000, 5618, 38263, 279, 3062, 346, 315, 17226, 5211, 17575, 10396, 389, 8162, 8095, 4367, 315, 2532, 323, 1629, 7394, 4907, 29711, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [5]:
# Load the causal-LM weights from `model_dir` in float16 without 8-bit
# quantization; device_map="auto" leaves device placement to the library.
# `base_model` is a second name bound to the very same model object.
base_model = LLM_model = LlamaForCausalLM.from_pretrained(
    model_dir,
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_8bit=False,
)

Loading checkpoint shards: 100%|█████████████████████████████████████████| 4/4 [00:02<00:00,  1.35it/s]


In [None]:
# Prompt for the orchestrator chain; `{query}` is substituted at run time.
orchestrator_agent_prompt_template = """
You are an expert in wireless communication developed by Nokia Bell Labs China. Below is a query that describes a task of communication. Please give your response.
### Query：
{query}
### Response:
"""
orchestrator_prompt_template = PromptTemplate(
    input_variables=["query"],
    template=orchestrator_agent_prompt_template,
)

# Text-generation pipeline built on the already-loaded model and tokenizer.
# NOTE(review): temperature/top_p are normally only honoured when sampling is
# enabled (do_sample=True); as written decoding is presumably greedy — confirm.
pipe = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=LLM_model,
    repetition_penalty=1,
    top_p=0.9,
    temperature=0.05,
    max_length=256,
)

# Expose the pipeline to LangChain and chain it behind the prompt template.
task_agent = HuggingFacePipeline(pipeline=pipe)
OrchestratorAgent = LLMChain(llm=task_agent, prompt=orchestrator_prompt_template)

# NOTE(review): `q` is not defined in any cell visible here; it must exist in
# the kernel before this line runs, otherwise it raises NameError.
orchestrator_out = OrchestratorAgent.run(query=q)

In [16]:
# Display the model's `lm_head` module (shown as the cell output).
LLM_model.lm_head
# LLM_model.get_input_embeddings

Linear(in_features=4096, out_features=128256, bias=False)

In [7]:
# Display the module structure of the first decoder layer (index 0).
LLM_model.model.layers[0]

LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
    (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
    (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
  )
  (mlp): LlamaMLP(
    (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
    (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
    (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
    (act_fn): SiLU()
  )
  (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
  (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
)

In [8]:
# Display the module structure of the second decoder layer (index 1).
LLM_model.model.layers[1]

LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
    (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
    (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
  )
  (mlp): LlamaMLP(
    (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
    (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
    (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
    (act_fn): SiLU()
  )
  (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
  (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
)

In [31]:
# Display the self-attention sub-module of the last decoder layer.
LLM_model.model.layers[-1].self_attn

LlamaAttention(
  (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
  (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
  (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
)

In [27]:
# Build a text-generation pipeline directly from the checkpoint directory,
# loading the weights as bfloat16 with automatic device placement and a
# max_length of 128.
# NOTE(review): this rebinds the name `pipeline`, which the import cell binds
# to transformers' pipeline factory; after this cell runs, `pipeline(...)`
# calls this pipeline object rather than the factory.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_dir,
    max_length=128,
    device_map="auto",
    model_kwargs=dict(torch_dtype=torch.bfloat16),
)

Loading checkpoint shards: 100%|█████████████████████████████████████████| 4/4 [00:03<00:00,  1.29it/s]
Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': 'Hey how are you doing today? I’m doing fine. I’m here with a little video to show you how to make the easiest and fastest slime you will ever make. It’s a fluffy slime and it’s super easy to make. You need one cup of glue, one cup of water, one cup of shaving cream, one teaspoon of baking soda and two tablespoons of contact solution. So I’m going to start by adding my glue and water. Then I’m going to add my shaving cream. Then I’m going to add my baking soda. Then I’m going to add my contact solution. So I’m going to start'}]

In [29]:
# Run the text-generation pipeline on a short prompt; the generated text is
# displayed as the cell output.
pipeline("Show me a story about thief")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': 'Show me a story about thief who stole a purse and I will show you a story about a thief who stole a car.\nShow me a story about a thief who stole a car and I will show you a story about a thief who stole a boat.\nShow me a story about a thief who stole a boat and I will show you a story about a thief who stole a plane.\nShow me a story about a thief who stole a plane and I will show you a story about a thief who stole a train.\nShow me a story about a thief who stole a train and I will show you a story about a thief who stole a'}]