In [1]:
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")

PyTorch version: 2.2.0
CUDA version: 12.1


In [2]:
from unsloth import FastLanguageModel
import torch
from torch import cuda, bfloat16
from datasets import load_dataset
import os

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


Unsloth: OpenAI failed to import - ignoring for now.
🦥 Unsloth Zoo will now patch everything to make training faster!


'cuda'

In [3]:
model_id = "unsloth/Meta-Llama-3.1-8B",

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(device)

max_seq_length = 2048
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Optional with limited VRAM

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    # load_in_4bit = load_in_4bit,
    token = "meta-llama/Meta-Llama-3.1-8B-Instruct",
)

model.eval()
print(f"Model loaded on {device}")

==((====))==  Unsloth 2024.12.12: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.529 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.2.0. CUDA: 8.6. CUDA Toolkit: 12.1. Triton: 2.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.24. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded on cuda:0


In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.12.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [6]:
data = load_dataset("json", data_files="/path_to_train_dataset")
data["train"]

Dataset({
    features: ['instruction', 'input', 'output', 'most_similar_instructions', 'avg_similarity_score'],
    num_rows: 10110
})

In [7]:
CUTOFF_LEN = 2048

def generate_prompt(data_point):
    """
    Create the text prompt from your instruction, input, and output fields.
    """
    return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Input:
{data_point["input"]}

### Response:
{data_point["output"]}"""

def tokenize(prompt, add_eos_token=True):
    """
    Tokenizes the prompt. Optionally pads to max_length=2048 and appends an EOS token.
    Copies input_ids to labels for causal LM.
    """
    # Here, we use padding="max_length" to get uniform-length sequences of 2048.
    # Alternatively, you can use padding=False and rely on a data collator.
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding="max_length",   # or padding=False + data_collator
        return_tensors=None,    # return raw Python lists
    )

    input_ids = result["input_ids"]
    attention_mask = result["attention_mask"]

    # Optionally place an EOS token at the very end if there's room
    if (
        add_eos_token
        and len(input_ids) == CUTOFF_LEN
        and input_ids[-1] != tokenizer.eos_token_id
    ):
        # Replace last token with EOS if you'd like
        input_ids[-1] = tokenizer.eos_token_id
        attention_mask[-1] = 1

    labels = input_ids.copy()
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

def generate_and_tokenize_prompt(data_point):
    """
    Combines prompt generation with tokenization.
    """
    full_prompt = generate_prompt(data_point)
    return tokenize(full_prompt)

# Example: split the "train" set into train/val
train_val = data["train"].train_test_split(test_size=1000, shuffle=True, seed=42)
train_data = train_val["train"].map(generate_and_tokenize_prompt)
val_data   = train_val["test"].map(generate_and_tokenize_prompt)


Map: 100%|██████████| 9110/9110 [01:02<00:00, 145.78 examples/s]
Map: 100%|██████████| 1000/1000 [00:08<00:00, 117.67 examples/s]


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_data,
    eval_dataset = val_data,
    dataset_text_field = "output",
    logging_steps = 200,
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 1,
        # warmup_steps = 5,
        warmup_ratio = 0.05,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 5000,
        learning_rate = 1e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "paged_adamw_32bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "/outputs",
        report_to = "none",
    ),
)


In [None]:
trainer_stats = trainer.train()

## Saving the model

In [2]:
new_model = "/finetuned_model"
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

## Quick evaluation

In [3]:
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
    TextStreamer
)
import torch
from torch import cuda, bfloat16

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map='auto',
)
model = PeftModel.from_pretrained(base_model, new_model) # make sure to check if the models are correct!

model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenizer.padding_side = "right"

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.79s/it]
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [5]:
logging.set_verbosity(logging.CRITICAL)

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=1012)
result = pipe(f"Summarize the AS paths for each prefix associated with ASN AS4766 over the period oct 28 13:00 to oct 28 13:15, 2024. Provide minimum, maximum, and median AS path lengths and highlight any significant path changes observed in BGP updates.")
print(result[0]['generated_text'])

Summarize the AS paths for each prefix associated with ASN AS4766 over the period oct 28 13:00 to oct 28 13:15, 2024. Provide minimum, maximum, and median AS path lengths and highlight any significant path changes observed in BGP updates. 
# Import necessary libraries
import pybgpstream
import statistics

# Define the time window
start_time = "2024-10-28 13:00:00"
end_time = "2024-10-28 13:15:00"

# Initialize the BGPStream
stream = pybgpstream.BGPStream(
    from_time=start_time,
    until_time=end_time,
    record_type="updates",
    filter="peer AS4766"
)

# Dictionary to store AS paths for each prefix
prefix_as_paths = {}

# Process BGP records
for rec in stream.records():
    for elem in rec:
        if 'as-path' in elem.fields:
            prefix = elem.fields['prefix']
            as_path = elem.fields['as-path'].split(' ')
            as_path_length = len(as_path)
            if prefix not in prefix_as_paths:
                prefix_as_paths[prefix] = []
            prefix_as_pa

## Uploading to hf

In [None]:
model.push_to_hub('your_hf_acc/repo_name')
tokenizer.push_to_hub('your_hf_acc/repo_name')