In [None]:
!pip install -r requirements.txt

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import gc

# Pin the notebook to the first GPU and keep a handle to it for later cells.
device = torch.device("cuda:0")
torch.cuda.set_device(0)
print(f"Using CUDA device: {torch.cuda.get_device_name(0)}")

In [None]:
# Checkpoint to fine-tune: CodeGen-350M (multi-language variant).
model_dir = "Salesforce/codegen-350M-multi"

# 4-bit NF4 weight quantization, no double quantization, bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Fast tokenizer; when the checkpoint defines no padding token, reuse EOS for it.
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the quantized model with every module mapped to the current CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    device_map={"": torch.cuda.current_device()},
    trust_remote_code=True,
)

# Config tweaks applied before training: KV cache off, pretraining_tp forced to 1.
model.config.pretraining_tp = 1
model.config.use_cache = False

print(f"Model loaded: {model_dir}")
print(f"Tokenizer vocab size: {tokenizer.vocab_size}")

In [None]:
# Training prompt templates for cybersecurity CWE classification.
# Every template has exactly two positional `{}` slots; formatting_prompts_func
# fills them with (code, response), in that order.
# NOTE(review): the instruction template contains the misspelling "apropriate"
# and "enumerations(CWE)" without a space. Both are part of the runtime prompt
# text (as are the trailing spaces on some lines) and are left untouched here.

# Option 1: Instruction-style (your current approach)
train_prompt_style_instruction = """Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a cybersecurity expert with advanced knowledge in software security and common weakness enumerations(CWE). 
Look at the following code and classify it with the apropriate CWE's if it has any. 

### Code:
{}

### Response:
{}"""

# Option 2: Code-comment style (better for CodeGen)
# The response slot sits inside a block comment, so the closing "*/" follows it.
train_prompt_style_comment = """/*
Security Analysis Task:
Analyze the following code for Common Weakness Enumerations (CWE).
Identify any security vulnerabilities and classify them with appropriate CWE numbers.

Code to analyze:
*/
{}

/*
Security Analysis Result:
{}
*/"""

# Option 3: Simple completion style
train_prompt_style_simple = """// Security vulnerability analysis
// Code:
{}

// CWE Classification:
{}"""

# Option 4: Q&A style (simpler than instruction)
train_prompt_style_qa = """Question: What CWE vulnerabilities are present in this code?

Code:
{}

Answer: {}"""

# Choose which style to use (default to comment style for CodeGen).
# This is the template formatting_prompts_func reads at call time.
train_prompt_style = train_prompt_style_comment

EOS_TOKEN = tokenizer.eos_token  # EOS string; formatting_prompts_func reads tokenizer.eos_token directly

def formatting_prompts_func(examples, prompt_style=None, eos_token=None):
    """Render a batch of (code, response) pairs into training texts.

    Args:
        examples: Batch mapping with parallel sequences under ``"input"``
            (source code) and ``"output"`` (response strings).
        prompt_style: Template with two positional ``{}`` slots, filled with
            (code, response). Defaults to the notebook-level
            ``train_prompt_style``, looked up at call time.
        eos_token: End-of-sequence string appended to every response that does
            not already end with it. Defaults to ``tokenizer.eos_token`` of the
            notebook-level tokenizer, looked up at call time.

    Returns:
        ``{"text": [...]}`` with one formatted prompt per example, in order.
    """
    # Resolve the defaults lazily so the function only depends on notebook
    # globals when the caller does not supply explicit values.
    template = train_prompt_style if prompt_style is None else prompt_style
    eos = tokenizer.eos_token if eos_token is None else eos_token

    texts = []
    for code, response in zip(examples["input"], examples["output"]):
        # Terminate the response with EOS exactly once.
        if not response.endswith(eos):
            response += eos
        texts.append(template.format(code, response))
    return {"text": texts}

In [None]:
from datasets import load_dataset

# Load the JSON training split from the dataset directory (adjust path as needed).
dataset = load_dataset(
    "json",
    data_dir="/local/s3905020/code/dataset-creation"
)["train"]

# Normalize "output" to a plain string: list values are space-joined,
# anything else is passed through str().
dataset = dataset.map(
    lambda x: {"output": [" ".join(map(str, out)) if isinstance(out, list) else str(out) for out in x["output"]]},
    batched=True
)

# Add the "text" column holding the fully formatted training prompt.
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
)


def _preview(text, limit=500):
    """Return `text`, cut to `limit` characters plus "..." when it is longer."""
    return text[:limit] + "..." if len(text) > limit else text


# Inspect some examples. Rows are fetched individually (no need to read the
# whole "text" column) and rows the dataset does not have are skipped instead
# of raising IndexError.
print("Dataset size:", len(dataset))
for label, row in (("Example 1", 10), ("Example 2", 100)):
    if row < len(dataset):
        print(f"\n{label}:")
        print(_preview(dataset[row]["text"]))
    else:
        print(f"\n{label}: skipped (dataset has no row {row})")


In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator configured for causal language modeling (mlm=False).
# NOTE(review): the tokenizer's pad token was aliased to EOS at load time;
# confirm this collator does not exclude EOS positions from the loss as padding.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [None]:
# Inference prompts are the training templates cut off right before the
# response slot, so the text the model sees at generation time is, by
# construction, the exact prefix it was fine-tuned on. Each result keeps a
# single `{}` placeholder for the code to analyze.
def _inference_prompt_from(train_template):
    """Return `train_template` up to (not including) its last ``{}`` placeholder.

    Raises:
        ValueError: if the template contains no ``{}`` placeholder at all.
    """
    prefix, placeholder, _ = train_template.rpartition("{}")
    if not placeholder:
        raise ValueError("training template has no '{}' response placeholder")
    return prefix


inference_prompt_style_instruction = _inference_prompt_from(train_prompt_style_instruction)
inference_prompt_style_comment = _inference_prompt_from(train_prompt_style_comment)
inference_prompt_style_simple = _inference_prompt_from(train_prompt_style_simple)
inference_prompt_style_qa = _inference_prompt_from(train_prompt_style_qa)

# Follow whichever style the training cell selected, so the two can never
# drift apart when the training style is switched.
inference_prompt_style = _inference_prompt_from(train_prompt_style)

In [None]:
# Clean up memory
gc.collect()
torch.cuda.empty_cache()

# Use the device that the model was loaded on (later cells reuse `device`).
device = model.device
print(f"Model is on device: {device}")

# Test base model before fine-tuning on one training example.
question = dataset[10]['input']
inputs = tokenizer(
    [inference_prompt_style.format(question)],
    return_tensors="pt",
    truncation=True,
    max_length=2048
).to(device)

print("Testing base model...")
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=1200,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        use_cache=True,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )

# The generated sequence starts with the prompt tokens. Slice them off by
# length instead of searching for a text marker: the marker used before
# ("### Response:") only exists in the instruction-style prompt, so with any
# other style the whole prompt was echoed back.
prompt_length = inputs.input_ids.shape[1]
response = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)
print("Base model response:")
print(response[0])


In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings for CodeGen-350M.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",   # causal language modeling
    r=64,                    # rank of the LoRA update matrices
    lora_alpha=16,           # LoRA scaling factor
    lora_dropout=0.05,       # slight dropout for regularization
    bias="none",             # no bias reparameterization
    # CodeGen module names: fused qkv projection, attention output projection,
    # and the two feed-forward layers.
    target_modules=["qkv_proj", "out_proj", "fc_in", "fc_out"],
)

# Wrap the base model with the adapters, then report the trainable share.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Optimisation and bookkeeping settings for the CodeGen-350M fine-tuning run.
training_arguments = TrainingArguments(
    output_dir="codegen_output",
    # --- batching ---
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # --- optimiser / schedule ---
    optim="paged_adamw_32bit",
    learning_rate=3e-4,
    warmup_steps=10,
    max_grad_norm=1.0,
    num_train_epochs=3,
    # --- precision: bfloat16 on, fp16 off ---
    bf16=True,
    fp16=False,
    # --- logging / checkpoints ---
    report_to="wandb",
    logging_strategy="steps",
    logging_steps=0.1,       # NOTE(review): fractional value, presumably a ratio of total steps; confirm
    save_strategy="steps",
    save_steps=500,
    evaluation_strategy="no",
    # --- data handling ---
    remove_unused_columns=False,
)

# Supervised fine-tuning trainer reading the pre-formatted "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    data_collator=data_collator,
    peft_config=peft_config,
)

In [None]:
# Clean up memory before training
gc.collect()
torch.cuda.empty_cache()

# Report where the model lives; later cells reuse `device`.
device = model.device
print(f"Training on device: {device}")

# Check if the model is properly loaded on the expected device
if hasattr(model, 'base_model'):
    print(f"Base model device: {model.base_model.device}")
elif hasattr(model, 'model'):
    print(f"Model's model device: {model.model.device}")

# No `model.to(device)` here: `device` was just read from the model itself, so
# the move would be a no-op, and the quantized weights were already placed by
# `device_map` when the model was loaded.
# NOTE(review): moving bitsandbytes-quantized models with `.to()` is discouraged
# by the Transformers docs; confirm against the installed version.
model.config.use_cache = False

In [None]:
# Launch fine-tuning with the `trainer` built in the previous cells.
# `trainer.train()` stays the cell's last bare expression so the notebook
# displays its return value as the cell output.
print("Starting training...")
trainer.train()

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("codegen_final_model")
print("Model saved to codegen_final_model/")

In [None]:
print("Testing fine-tuned model...")

# Switch to inference mode before generating: the LoRA layers were configured
# with dropout (lora_dropout=0.05), which must be inactive at generation time.
# NOTE(review): this assumes the trainer leaves the model in training mode.
model.eval()

# Test on a different sample
question = dataset[100]['input']
inputs = tokenizer(
    [inference_prompt_style.format(question)],
    return_tensors="pt",
    truncation=True,
    max_length=2048
).to(model.device)  # read the device from the model, not a global set in an earlier cell

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=1200,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        use_cache=True,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )

# Decoded text contains the prompt followed by the model's continuation.
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print("Fine-tuned model response:")
print(response[0])

In [None]:
def load_model_and_tokenizer_codegen(model_dir="Salesforce/codegen-350M-multi", 
                                     checkpoint_path="./codegen_output/checkpoint-XXX",
                                     device_str="cuda:0"):
    """Load the 4-bit base CodeGen model plus a fine-tuned PEFT adapter for evaluation.

    Args:
        model_dir: Hub id or local path of the base model and tokenizer.
        checkpoint_path: Directory holding the PEFT adapter. The default
            ``checkpoint-XXX`` is a placeholder and must be replaced.
        device_str: Target device such as ``"cuda:0"``, ``"cuda"`` or ``"cpu"``.
            A CUDA device without an explicit index resolves to the currently
            selected CUDA device.

    Returns:
        ``(model, tokenizer, device)`` where ``model`` is the adapter-wrapped
        model in eval mode with the KV cache enabled, and ``device`` is the
        resolved ``torch.device``.
    """
    from peft import PeftModel

    print(f"Loading model from {model_dir} and checkpoint from {checkpoint_path}")

    # Same quantization settings as used for training.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Parse the device through torch.device rather than splitting the string,
    # so that "cuda" (no index) works instead of failing on int("cuda").
    device = torch.device(device_str)
    if device.type == "cuda":
        device_id = device.index if device.index is not None else torch.cuda.current_device()
        torch.cuda.set_device(device_id)
        device = torch.device("cuda", device_id)
        print(f"Using CUDA device: {torch.cuda.get_device_name(device_id)}")
        placement = device_id
    else:
        # Any non-CUDA device type is mapped to CPU, as before.
        placement = "cpu"

    # Tokenizer, with EOS reused as padding token when none is defined.
    tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Base model with quantization, placed entirely on the resolved device.
    base_model = AutoModelForCausalLM.from_pretrained(
        model_dir,
        quantization_config=bnb_config,
        device_map={"": placement},
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    )

    # Attach the fine-tuned adapter and prepare for generation.
    model = PeftModel.from_pretrained(base_model, checkpoint_path)
    model.config.use_cache = True
    model.eval()

    return model, tokenizer, device

In [None]:
def filter_dataset_by_tokens_codegen(file_path, max_tokens=100000, output_file=None, tokenizer=None):
    """Keep only the JSONL entries whose ``"input"`` has fewer than `max_tokens` tokens.

    Args:
        file_path: Path of the JSONL file to read (one JSON object per line).
            Blank lines are ignored.
        max_tokens: Exclusive upper bound on the token count of ``entry["input"]``.
        output_file: Optional path; when given, the kept entries are written
            there as JSONL.
        tokenizer: Optional callable tokenizer returning a mapping with
            ``"input_ids"``. When omitted, the CodeGen-350M tokenizer is loaded,
            as before.

    Returns:
        List of the kept entries (parsed JSON objects), in file order.
    """
    import json
    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is cosmetic; fall back to plain iteration.
        def tqdm(iterable, **_kwargs):
            return iterable

    # Reuse the caller's tokenizer when provided, to avoid loading it again.
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-multi", use_fast=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

    filtered_data = []
    excluded_count = 0
    total_count = 0

    print(f"Filtering dataset: {file_path}")
    print(f"Maximum tokens allowed: {max_tokens}")

    # Process each line in the JSONL file
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            # A trailing or stray blank line is not an entry; json.loads would fail on it.
            if not line.strip():
                continue
            total_count += 1
            entry = json.loads(line)

            # Count tokens of the untruncated input text.
            token_count = len(tokenizer(entry["input"], truncation=False)["input_ids"])

            # Include entry if it's below the token limit
            if token_count < max_tokens:
                filtered_data.append(entry)
            else:
                excluded_count += 1

    # Save filtered dataset if output file is specified
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            for entry in filtered_data:
                f.write(json.dumps(entry) + '\n')
        print(f"Filtered dataset saved to: {output_file}")

    def _percent(count):
        # An empty input file must not raise ZeroDivisionError.
        return count / total_count * 100 if total_count else 0.0

    # Print statistics
    print(f"Total entries: {total_count}")
    print(f"Entries kept: {len(filtered_data)} ({_percent(len(filtered_data)):.2f}%)")
    print(f"Entries excluded: {excluded_count} ({_percent(excluded_count):.2f}%)")

    return filtered_data

In [None]:
def evaluate_codegen_model(model, tokenizer, test_data, device,
                           output_file="codegen_evaluation_results.jsonl",
                           prompt_style=None):
    """Generate a response for every test item and stream the results to JSONL.

    Args:
        model: Object exposing ``generate(input_ids=..., attention_mask=..., ...)``.
        tokenizer: Tokenizer used to encode prompts and decode generations.
        test_data: Sized iterable of mappings with ``"input"`` and ``"output"``.
        device: Device the encoded prompt is moved to.
        output_file: JSONL file receiving one result object per test item.
        prompt_style: Template with a single ``{}`` slot for the code. Defaults
            to the notebook-level ``inference_prompt_style``, looked up at call
            time.

    Returns:
        List of result dicts with keys ``index``, ``input``,
        ``expected_output``, ``model_output`` (generated continuation only) and
        ``full_response`` (prompt plus continuation).
    """
    import json
    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is cosmetic; fall back to plain iteration.
        def tqdm(iterable, **_kwargs):
            return iterable

    template = inference_prompt_style if prompt_style is None else prompt_style

    print(f"Evaluating CodeGen model on {len(test_data)} test samples")
    results = []

    # Each result is written and flushed as soon as it exists, so partial
    # results survive an interruption without rewriting the file periodically.
    with open(output_file, 'w') as f:
        for idx, item in enumerate(tqdm(test_data)):
            code = item["input"]
            expected_output = item["output"]

            inputs = tokenizer(
                [template.format(code)],
                return_tensors="pt",
                truncation=True,
                max_length=2048
            ).to(device)

            with torch.no_grad():
                outputs = model.generate(
                    input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    max_new_tokens=1200,
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                    use_cache=True,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9
                )

            generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

            # The output sequence begins with the prompt tokens. Slice them off
            # by length rather than splitting on "### Response:", a marker that
            # only exists in the instruction-style prompt and otherwise left
            # the whole prompt inside `model_output`.
            prompt_length = inputs.input_ids.shape[1]
            response = tokenizer.batch_decode(
                outputs[:, prompt_length:], skip_special_tokens=True
            )[0].strip()

            result = {
                "index": idx,
                "input": code,
                "expected_output": expected_output,
                "model_output": response,
                "full_response": generated_text
            }

            results.append(result)
            f.write(json.dumps(result) + '\n')
            f.flush()

    print(f"Evaluation complete! Results saved to {output_file}")
    return results

In [None]:
def cleanup_memory():
    """Run the garbage collector, empty the CUDA cache, and sync CUDA when present."""
    gc.collect()
    torch.cuda.empty_cache()
    # Synchronize only on machines that actually have CUDA.
    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()

In [None]:
# Evaluation workflow (example; fill in the paths and run it as code):
#
# # 1. Filter test dataset
# test_file = "/path/to/test.jsonl"
# filtered_file = "/path/to/test_filtered.jsonl"
# filtered_data = filter_dataset_by_tokens_codegen(test_file, max_tokens=80000, output_file=filtered_file)
#
# # 2. Load fine-tuned model
# model, tokenizer, device = load_model_and_tokenizer_codegen(
#     checkpoint_path="./codegen_output/checkpoint-1000"
# )
#
# # 3. Load test data
# import json
# test_data = []
# with open(filtered_file, 'r') as f:
#     for line in f:
#         test_data.append(json.loads(line))
#
# # 4. Run evaluation
# results = evaluate_codegen_model(model, tokenizer, test_data, device)
#
# # 5. Calculate metrics (use the same metrics functions from the original notebook)
# # You can copy the calculate_metrics and visualization functions from the original notebook

# Closing status messages, printed one per line.
for message in (
    "CodeGen-350M fine-tuning notebook ready!",
    "Adjust the dataset paths and run the cells sequentially.",
    "The model architecture differences from Qwen have been handled in the LoRA configuration.",
):
    print(message)