In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from trl import SFTConfig, SFTTrainer
from datasets import Dataset
import pandas as pd
import torch

In [None]:
# Environment sanity check: show the installed PyTorch version and whether
# a CUDA device is visible, one value per line.
print(torch.__version__, torch.cuda.is_available(), sep="\n")

In [None]:
# Identifier of the base checkpoint that will be fine-tuned.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# Tokenizer and weights are both resolved from the same identifier.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Use the generation settings stored with the checkpoint, and reuse the
# end-of-sequence id as the padding id.
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

In [None]:
# Baseline check: ask the untuned model a simple identity question.
messages = [
    {"role": "user", "content": "Who are you?"}
]

# return_dict=True makes the chat template return the attention mask alongside
# the token ids. Passing the mask to generate() matters here because the
# padding id was set equal to the EOS id, so it cannot be inferred reliably
# from the ids alone. It also pins the return type instead of relying on the
# library default.
model_inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)
input_tensor = model_inputs["input_ids"]
outputs = model.generate(**model_inputs, max_new_tokens=100)

# generate() returns prompt + completion; slice off the prompt so only the
# newly generated tokens are decoded.
result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
print(result)

In [None]:
# Baseline check: a longer, open-ended request with a larger token budget.
messages = [
    {"role": "user", "content": "Write a haiku about programming"}
]

# Ask for a dict so the attention mask comes back with the token ids and can
# be forwarded to generate(); the padding id equals the EOS id, so the mask
# should be passed explicitly rather than guessed from the ids.
model_inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)
input_tensor = model_inputs["input_ids"]
outputs = model.generate(**model_inputs, max_new_tokens=1200)

# Decode only the tokens produced after the prompt.
result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
print(result)

In [None]:
# Load the dataset
df = pd.read_csv("datasets/attacks_types.csv")

# Build the training text from the two source columns. Missing cells are
# replaced with empty strings first: concatenating a string with NaN yields
# NaN, and a NaN "text" value cannot be tokenized.
df["text"] = df["Input"].fillna("") + "\n\n" + df["Additional Context"].fillna("")

# Convert to a HF dataset
dataset = Dataset.from_pandas(df[["text"]])


def tokenize_function(examples):
    """Tokenize a batch of rows from the dataset.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(batched=True)``;
            only the ``"text"`` column is read.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length.

    No padding is applied here. ``padding="max_length"`` without an explicit
    ``max_length`` would pad every row to the tokenizer's full model maximum,
    regardless of how short the text is. Padding is left to the trainer's
    data collator, which pads per batch.
    """
    return tokenizer(examples["text"], truncation=True)


# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# Name under which the fine-tuned model is saved and uploaded to HuggingFace,
# plus the tags that describe it.
finetune_name = "DeepSeek-R1-Distill-Llama-8B-SC-vulnerabilities-list"
finetune_tags = [
    "DeepSeek-R1-Distill",
    "smart-contracts",
]

In [None]:
# Training hyper-parameters for supervised fine-tuning.
sft_config = SFTConfig(
    # Where checkpoints are written and how the model is named on the Hub.
    output_dir="./sft_output",
    hub_model_id=finetune_name,
    # Data: column that holds the raw training text.
    dataset_text_field="text",
    # Optimisation schedule; tune max_steps to the dataset size and the batch
    # size to the available GPU memory.
    max_steps=1000,
    per_device_train_batch_size=1,
    learning_rate=5e-5,
    # Bookkeeping: log every 10 steps, checkpoint every 100, no evaluation.
    logging_steps=10,
    save_steps=100,
    eval_strategy="no",
)

# Build the trainer from the base model, the tokenized dataset and the
# tokenizer used to produce it.
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
)

In [None]:
# Run supervised fine-tuning with the trainer built earlier in the notebook.
trainer.train()

# Persist the final model to a local directory named after the fine-tune.
trainer.save_model("./" + finetune_name)

In [None]:
# Test the fine-tuned model on the same prompt as the base model.
#
# Load the directory written by trainer.save_model(). With save_steps=100 the
# trainer only writes checkpoint-100, checkpoint-200, ..., so a hard-coded
# "./sft_output/checkpoint-60/" is never produced by this notebook.
fine_tuned_model_name = f"./{finetune_name}"
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_name)
fine_tuned_model = AutoModelForCausalLM.from_pretrained(
    fine_tuned_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
fine_tuned_model.generation_config = GenerationConfig.from_pretrained(fine_tuned_model_name)
fine_tuned_model.generation_config.pad_token_id = fine_tuned_model.generation_config.eos_token_id

messages = [
    {"role": "user", "content": "Who are you?"}
]

# return_dict=True also returns the attention mask, which is forwarded to
# generate() because the padding id equals the EOS id.
fine_tuned_inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(fine_tuned_model.device)
fine_tuned_input_tensor = fine_tuned_inputs["input_ids"]

# Generate with the fine-tuned model. Calling the base `model` here would only
# reproduce the baseline answer and never exercise the loaded weights.
fine_tuned_outputs = fine_tuned_model.generate(**fine_tuned_inputs, max_new_tokens=100)

# Decode only the tokens produced after the prompt.
fine_tuned_result = tokenizer.decode(
    fine_tuned_outputs[0][fine_tuned_input_tensor.shape[1]:],
    skip_special_tokens=True,
)
print(fine_tuned_result)