In [1]:
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from typing import List
import torch
from torch import cuda, bfloat16
from datasets import load_dataset
import pandas as pd
 
import matplotlib.pyplot as plt
import matplotlib as mpl
# import seaborn as sns
from pylab import rcParams
import os
 
# Use CUDA when torch reports it as available, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
# A bare name as the last expression makes the notebook echo the chosen value.
DEVICE

'cuda'

In [2]:
# Hub repositories: `model_id` is the checkpoint loaded in this cell;
# `llama2_13` is the repo the tokenizer cell loads from.
model_id = 'hyonbokan/BGP-llama2'
llama2_13 = 'meta-llama/Llama-2-13b-chat-hf'

# Current CUDA device (or CPU). Placement itself is decided by
# device_map='auto' below, so this is only a fallback label for the report
# printed at the end of the cell.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Quantize the weights to 8-bit at load time.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_8bit=True,
)

# 4-bit alternative, kept for reference:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

# Hub access token. The notebook's own `hf_token` environment variable takes
# precedence; the conventional `HF_TOKEN` variable is accepted as a fallback.
# If neither is set the value is None, exactly as before.
hf_token = os.environ.get('hf_token') or os.environ.get('HF_TOKEN')
hf_auth = hf_token

# NOTE(review): `use_auth_token` is kept to match the transformers version this
# notebook was run with; newer releases appear to prefer `token=` - confirm
# against the installed version before switching.
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

# NOTE(review): trust_remote_code=True permits Python code shipped in the hub
# repo to run locally - confirm it is actually required for this checkpoint.
# NOTE(review): the recorded load log warns that the checkpoint already carries
# its own quantization_config (overridden here) and that its lora_A/lora_B
# weights were not used - verify this is the intended way to load it.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

# Switch to inference mode.
model.eval()

# Report where the modules actually ended up instead of assuming the current
# CUDA device: read the device map recorded on the model and fall back to
# `device` only when the model exposes none.
_placement = getattr(model, 'hf_device_map', None) or {'': device}
_placed_on = sorted(
    {f'cuda:{dev}' if isinstance(dev, int) else str(dev) for dev in _placement.values()}
)
print(f"Model loaded on {', '.join(_placed_on)}")

Downloading (…)lve/main/config.json:   0%|          | 0.00/966 [00:00<?, ?B/s]

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute. The `quantization_config` attribute will be overwritten with the one you passed to `from_pretrained`.


Downloading (…)model.bin.index.json:   0%|          | 0.00/71.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/4.28G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at hyonbokan/BGP-llama2 were not used when initializing LlamaForCausalLM: ['model.layers.30.self_attn.v_proj.lora_B.default.weight', 'model.layers.7.self_attn.q_proj.lora_B.default.weight', 'model.layers.26.self_attn.v_proj.lora_A.default.weight', 'model.layers.2.self_attn.v_proj.lora_B.default.weight', 'model.layers.5.self_attn.v_proj.lora_B.default.weight', 'model.layers.30.self_attn.v_proj.lora_A.default.weight', 'model.layers.0.self_attn.q_proj.lora_A.default.weight', 'model.layers.17.self_attn.q_proj.lora_A.default.weight', 'model.layers.11.self_attn.q_proj.lora_A.default.weight', 'model.layers.32.self_attn.v_proj.lora_B.default.weight', 'model.layers.2.self_attn.v_proj.lora_A.default.weight', 'model.layers.31.self_attn.v_proj.lora_B.default.weight', 'model.layers.30.self_attn.q_proj.lora_B.default.weight', 'model.layers.14.self_attn.q_proj.lora_B.default.weight', 'model.layers.25.self_attn.v_proj.lora_A.default.weight', 'model.layers.6.self_at

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Model loaded on cuda:0


In [3]:
# Load the tokenizer from the `llama2_13` repo using the same hub token.
# NOTE(review): the tokenizer comes from a different repo than the model
# weights (`model_id`) - presumably the vocabularies match; confirm.
tokenizer = AutoTokenizer.from_pretrained(llama2_13, use_auth_token=hf_auth)

In [4]:
# Raise the transformers log threshold to CRITICAL before generating.
logging.set_verbosity(logging.CRITICAL)

# Instruction text sent to the model.
prompt = "For Packet Tracer BGP Configuration, firstly we need to configure the IP addresses of interfaces as other examples. To do this, as a better network engineering rule, firstly make your IP plan or, use the existing one. Acording to my basic IP plan, I used the below IPs for my interfaces. Explain how this is done"

# Text-generation pipeline built around the already-loaded model and tokenizer.
# NOTE(review): max_length presumably bounds prompt + generated tokens
# together - confirm against the generation docs if longer answers are needed.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
)

# Wrap the instruction in <s>[INST] ... [/INST] markers (presumably the
# Llama-2 chat format - confirm), generate, and print the first result's text.
result = pipe("<s>[INST] " + prompt + " [/INST]")
print(result[0]["generated_text"])

<s>[INST] For Packet Tracer BGP Configuration, firstly we need to configure the IP addresses of interfaces as other examples. To do this, as a better network engineering rule, firstly make your IP plan or, use the existing one. Acording to my basic IP plan, I used the below IPs for my interfaces. Explain how this is done [/INST]essoessoessoessoessoachenachenachenachenachenachenachenachenachenachenachenachenachenachenessoachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenessoachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenHCachenachenHCachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenachenHCachenachenachenachenachenachenachenHCachenHCachenHCachenHCachenachenHCHCHCachenHCachenHCHCachenHCachenHCHCachenachenachenachenachenachenachenachenHCachenachenachenHCachenachenachenachenHCHCachenachenHCachenHCachenachenachenHCachenachenHCHCac