Importing Required Libraries and Packages

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import subprocess
import json
import os
import tempfile
import sys
import re
import sacrebleu

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR

from torchvision import datasets
from torchvision.transforms import ToTensor

from tqdm import tqdm, auto

from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    AutoModelForSequenceClassification, 
    pipeline, 
    TrainingArguments, 
    Trainer, 
    EvalPrediction
)

from trl import (
    SFTTrainer, 
    RewardTrainer, 
    RewardConfig, 
    PPOTrainer, 
    PPOConfig, 
    AutoModelForCausalLMWithValueHead
)

from datasets import Dataset, DatasetDict, load_dataset
from codebleu import calc_codebleu
import bitsandbytes as bnb
from torch.distributions import Bernoulli
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

Initializing Tokenizer and Setting Device

In [5]:
model_name = "codellama/CodeLlama-7b-hf"

# Tokenizer for the checkpoint named above
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Pad on the left, reusing the end-of-sequence token as the padding token
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

# Everything in this notebook runs on the first CUDA device
device = torch.device('cuda', 0)
torch.cuda.set_device(device)

Loading and Preprocessing Data

In [6]:
# Read the buggy / fixed corpora: one snippet per line, line i of each file forms a pair
buggy_data = Path('./FixJS/input/50/before_tokenized.txt').read_text(encoding='utf-8').splitlines()
fixed_data = Path('./FixJS/input/50/after_tokenized.txt').read_text(encoding='utf-8').splitlines()

# One index per buggy snippet
data_len = len(buggy_data)
indices = np.arange(data_len)

# Seeded shuffle so the ordering is reproducible
np.random.seed(13)
np.random.shuffle(indices)

# Reorder both corpora with the same permutation to keep the pairs aligned
buggy_data = [buggy_data[i] for i in indices]
fixed_data = [fixed_data[i] for i in indices]

# Function to add a function name in place within the data
def add_function_name_in_place(data, function_name):
    """Insert ``function_name`` plus a space before the first '(' of every entry.

    Args:
        data: Mutable list of strings; modified in place.
        function_name: Text inserted immediately before the first '('.

    Entries that contain no '(' are left untouched. Returns None.
    """
    for position, snippet in enumerate(data):
        head, paren, tail = snippet.partition('(')
        if paren:
            data[position] = head + function_name + ' ' + paren + tail

# Name inserted before the first '(' of every snippet
function_name = 'Function'

# Patch the buggy and the fixed corpus in place
for _corpus in (buggy_data, fixed_data):
    add_function_name_in_place(_corpus, function_name)

Splitting Data into Training, Validation, and Test Sets

In [7]:
# Boundaries of the split: first 80% train, next 10% validation, remainder test
valid_start = int(data_len * 0.8)
test_start = valid_start + int(data_len * 0.1)

# Slice the (already shuffled) parallel corpora at the same boundaries
train_input = buggy_data[:valid_start]
train_target = fixed_data[:valid_start]

valid_input = buggy_data[valid_start:test_start]
valid_target = fixed_data[valid_start:test_start]

test_input = buggy_data[test_start:]
test_target = fixed_data[test_start:]

Processing Data for Model Inputs

In [8]:
# Function to process data and convert it into model inputs
def process_data_to_model_inputs(buggy_data, fixed_data, device):
    """Tokenize paired buggy / fixed snippets into per-example tensor dicts.

    Uses the module-level ``tokenizer``. Every snippet is truncated / padded to
    exactly 50 tokens and moved to ``device``.

    Args:
        buggy_data: Iterable of buggy snippets (model inputs).
        fixed_data: Iterable of fixed snippets (targets), paired by position.
        device: Device the resulting tensors are moved to.

    Returns:
        A list of dicts with keys 'input_ids', 'attention_mask' (both from the
        buggy snippet) and 'labels' (token ids of the fixed snippet).
    """
    def encode(text):
        return tokenizer(
            text,
            max_length=50,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        ).to(device)

    examples = []
    for buggy, fixed in zip(buggy_data, fixed_data):
        source = encode(buggy)
        target = encode(fixed)
        # NOTE(review): the padding positions inside 'labels' keep the pad token id,
        # so a loss computed from these labels also covers padding; confirm intended.
        examples.append({
            'input_ids': source.input_ids.squeeze(0),
            'attention_mask': source.attention_mask.squeeze(0),
            'labels': target.input_ids.squeeze(0),
        })
    return examples

# Tokenize each split with the same routine
train_dataset, val_dataset, test_dataset = (
    process_data_to_model_inputs(split_inputs, split_targets, device)
    for split_inputs, split_targets in (
        (train_input, train_target),
        (valid_input, valid_target),
        (test_input, test_target),
    )
)

Calculating Code Quality and AST Reward

In [None]:
# Function to calculate code quality score using Semgrep
def calculate_code_quality_score(code, penalize_parsing_errors=False, default_deduction=5):
    """Score a JavaScript snippet between 0 and 100 from static-analysis findings.

    The snippet is written to a temporary ``.js`` file, the analyzer is run on it
    through ``npx``, and points are subtracted from 100 for every reported message.

    Args:
        code: Source text to analyze.
        penalize_parsing_errors: When True, every message whose text contains
            'error' costs 50 points; when False such messages are ignored.
        default_deduction: Points deducted for a finding whose rule id is not in
            the deduction table. The table previously had no fallback entry, so
            any unlisted rule raised ``KeyError``.

    Returns:
        The score, clamped at 0 whenever the analyzer produced output.
        100 when the analyzer wrote nothing to stdout.

    Raises:
        json.JSONDecodeError: If the analyzer's stdout is not valid JSON.
    """
    # Define the scoring system
    max_score = 100
    parsing_error_penalty = 50
    points_deduct_per_issue = {
        "find_eval": 10,
        "user-input-in-code": 10,
        "plaintext-sensitive-info": 20,
    }

    # Setup environment for Node.js and Semgrep
    node_directory = '/home/user/.nvm/versions/node/v21.6.2/bin'
    env = os.environ.copy()
    env['PATH'] = node_directory + os.pathsep + env['PATH']
    rule_file_js = '/home/user/vul.yaml'

    # Create a temporary file with the input code. delete=False keeps the file
    # alive after the handle closes so the external process can read it.
    with tempfile.NamedTemporaryFile(suffix=".js", delete=False) as temp_file:
        temp_file_path = temp_file.name
        temp_file.write(code.encode('utf-8'))

    try:
        # Execute the analyzer on the temporary file
        command = ['npx', 'semgrep', rule_file_js, temp_file_path, '--format=json']
        result = subprocess.run(command, capture_output=True, text=True, env=env)

        # Start with the maximum score
        score = max_score

        if result.stdout:
            # NOTE(review): the parsing below expects a list of per-file objects,
            # each with a 'messages' list of {'ruleId', 'message'} entries.
            # Confirm that matches what the invoked tool actually prints.
            semgrep_output = json.loads(result.stdout)
            for file_result in semgrep_output:
                for message in file_result['messages']:
                    # Messages mentioning 'error' are treated as parsing errors
                    if 'message' in message and 'error' in message['message']:
                        if penalize_parsing_errors:
                            score -= parsing_error_penalty
                    else:
                        rule_id = message.get('ruleId', 'default')
                        # Unlisted rules fall back to default_deduction instead of
                        # indexing a non-existent 'default' table entry.
                        score -= points_deduct_per_issue.get(rule_id, default_deduction)

            # Ensure score does not go below 0
            score = max(0, score)

    finally:
        # Clean up by deleting the temporary file
        os.remove(temp_file_path)

    return score

# Smoke test: score a trivial snippet
sample_snippet = "var x = 1;"
code_quality_score = calculate_code_quality_score(sample_snippet)
print(f"Code Quality Score: {code_quality_score}")

# Function to calculate AST reward based on Semgrep quality scores
def ast_reward(generated_code, target_code):
    """Combine the generated code's quality score with its closeness to the target's.

    Both snippets are scored with ``calculate_code_quality_score``; parsing errors
    are penalized for the generated code only. The result is the mean of the
    generated score and ``100 - |generated - target|``, divided by 100.

    Args:
        generated_code: Code produced by the model.
        target_code: Reference code.

    Returns:
        The combined score divided by 100.
    """
    generated_quality = calculate_code_quality_score(generated_code, penalize_parsing_errors=True)
    target_quality = calculate_code_quality_score(target_code, penalize_parsing_errors=False)

    # Closeness term: 100 when the two quality scores are identical
    alignment = 100 - abs(generated_quality - target_quality)

    # Equal weighting of quality and alignment, rescaled by 100
    return ((generated_quality + alignment) / 2) / 100

Code Quality Score: 98


Generating and Comparing Control Flow Graphs (CFGs)

In [None]:
def get_cfg_from_code(code):
    """Build a control-flow graph for ``code`` via an external Node.js script.

    Args:
        code: JavaScript source text.

    Returns:
        The decoded CFG object, or None when the script fails, its output cannot
        be decoded, or it reports an 'error' entry.
    """
    node_path = '/home/user/.nvm/versions/node/v21.6.2/bin/node'
    script_path = '/home/user/PPOFixer/generate_cfg.js'

    # NOTE(review): the argument is passed as an argv element (no shell), yet double
    # quotes are still backslash-escaped; confirm generate_cfg.js expects that form.
    escaped_code = code.replace('"', '\\"')

    try:
        completed = subprocess.run(
            [node_path, script_path, escaped_code],
            capture_output=True,
            text=True,
            check=True,
        )
        # stdout is decoded twice: the first pass must yield a JSON string,
        # which the second pass turns into the actual object.
        parsed = json.loads(json.loads(completed.stdout))
    except Exception:
        # Non-zero exit status, undecodable output and any other failure all
        # mean "no CFG available".
        return None

    # The script signals its own failures through a truthy 'error' entry
    if isinstance(parsed, dict) and parsed.get('error'):
        return None
    return parsed

# Example: two snippets that differ only in the receiver of the call
code1 = 'function Function ( ) { this. socket. close ( ) ; }'
code2 = 'function Function ( ) { node. socket. close ( ) ; }'
cfg1, cfg2 = (get_cfg_from_code(snippet) for snippet in (code1, code2))
print(cfg1)

def operation_similarity(edge1, edge2):
    """Tell whether two CFG edges carry the same operation type.

    Args:
        edge1, edge2: Edge mappings; each may hold a 'data' dict with a 'type' entry.

    Returns:
        True only when both edges have a dict under 'data', both dicts contain
        'type', and the two 'type' values are equal; False otherwise.
    """
    pair = (edge1, edge2)

    # Both edges need a dict payload under 'data' (missing or None counts as no payload)
    for edge in pair:
        if 'data' not in edge or not isinstance(edge['data'], dict):
            return False

    # Both payloads must declare an operation type
    if not all('type' in edge['data'] for edge in pair):
        return False

    return edge1['data']['type'] == edge2['data']['type']


def compute_node_similarity(cfg1, cfg2):
    """Jaccard similarity of the (id, type) node sets of two program flow graphs.

    Args:
        cfg1, cfg2: CFG mappings exposing ``['program']['flowGraph']['nodes']``,
            a list of mappings with 'id' and 'type'.

    Returns:
        ``|common| / |union|`` of the (id, type) pairs, or 1.0 when neither graph
        has any node. Two empty graphs are treated as identical, mirroring the
        empty-graph handling in ``compute_edge_similarity``, instead of dividing
        by zero.
    """
    # Keyed by id, so a repeated id keeps only its last type
    nodes1 = {node['id']: node['type'] for node in cfg1['program']['flowGraph']['nodes']}
    nodes2 = {node['id']: node['type'] for node in cfg2['program']['flowGraph']['nodes']}

    pairs1 = set(nodes1.items())
    pairs2 = set(nodes2.items())
    total_nodes = pairs1 | pairs2
    if not total_nodes:
        return 1.0
    return len(pairs1 & pairs2) / len(total_nodes)

def _operation_types(edges):
    """Return ``data['type']`` for every edge that carries a typed dict payload."""
    return [
        edge['data']['type']
        for edge in edges
        if isinstance(edge.get('data'), dict) and 'type' in edge['data']
    ]


def compute_edge_similarity(cfg1, cfg2):
    """Similarity of two program flow graphs based on their edges' operation types.

    Edges are matched one-to-one by operation type (``edge['data']['type']``);
    edges without a typed payload never match. Each edge of ``cfg2`` can be
    consumed by at most one edge of ``cfg1``, which keeps the result within
    [0, 1]. Matching many edges against the same counterpart would allow values
    above 1 (three identical edges against one gave 3.0).

    Args:
        cfg1, cfg2: CFG mappings exposing ``['program']['flowGraph']['edges']``.

    Returns:
        ``matched / (len(edges1) + len(edges2) - matched)``, or 1 when both graphs
        have no edges.
    """
    edges1 = cfg1['program']['flowGraph']['edges']
    edges2 = cfg2['program']['flowGraph']['edges']

    # Operation types of cfg2 that have not been paired with an edge of cfg1 yet
    unmatched = _operation_types(edges2)

    similar_count = 0
    for op_type in _operation_types(edges1):
        if op_type in unmatched:
            unmatched.remove(op_type)  # consume the counterpart: one-to-one matching
            similar_count += 1

    # Matched pairs count once towards the total number of distinct edges
    total_unique_edges = len(edges1) + len(edges2) - similar_count
    return similar_count / total_unique_edges if total_unique_edges else 1




def compute_path_similarity(cfg1, cfg2):
    """Compare two program flow graphs by edge count, used as a stand-in for path count.

    Args:
        cfg1, cfg2: CFG mappings exposing ``['program']['flowGraph']['edges']``.

    Returns:
        ``1 - |n1 - n2| / max(n1, n2)`` where ``n`` is the number of edges, or 1.0
        when both graphs have no edges. Two edgeless graphs are treated as
        identical instead of dividing by zero.
    """
    path_count_cfg1 = len(cfg1['program']['flowGraph']['edges'])  # Simplified placeholder
    path_count_cfg2 = len(cfg2['program']['flowGraph']['edges'])  # Simplified placeholder

    largest = max(path_count_cfg1, path_count_cfg2)
    if largest == 0:
        return 1.0

    # Similarity as the inverse of the relative difference in edge counts
    return 1 - abs(path_count_cfg1 - path_count_cfg2) / largest

def compute_structural_differences(cfg_buggy, cfg_fixed):
    """Count nodes and edges that appear in exactly one of two program flow graphs.

    Nodes are identified by (id, type); edges by ((from, to), type).

    Args:
        cfg_buggy, cfg_fixed: CFG mappings exposing
            ``['program']['flowGraph']['nodes']`` and ``[...]['edges']``.

    Returns:
        Number of added plus removed nodes and edges (0 for identical graphs).
    """
    def components(cfg):
        graph = cfg['program']['flowGraph']
        node_keys = {(node['id'], node['type']) for node in graph['nodes']}
        edge_keys = {((edge['from'], edge['to']), edge['type']) for edge in graph['edges']}
        return node_keys, edge_keys

    nodes_buggy, edges_buggy = components(cfg_buggy)
    nodes_fixed, edges_fixed = components(cfg_fixed)

    # The symmetric difference holds both the new and the removed components
    return len(nodes_buggy ^ nodes_fixed) + len(edges_buggy ^ edges_fixed)

def cfg_reward(generated_code, target_code):
    """Score how closely the generated code's control-flow graph matches the target's.

    Args:
        generated_code: Code produced by the model.
        target_code: Reference code.

    Returns:
        0 when either CFG cannot be built; otherwise
        ``(node + edge + path similarity + 1 - structural difference) / 4``.
    """
    graphs = (get_cfg_from_code(generated_code), get_cfg_from_code(target_code))
    if any(graph is None for graph in graphs):
        return 0

    # NOTE(review): every metric below reads only graph['program']['flowGraph'];
    # the per-function graphs under 'functions' are never compared.
    similarities = [
        metric(*graphs)
        for metric in (compute_node_similarity, compute_edge_similarity, compute_path_similarity)
    ]

    # NOTE(review): the structural difference is a raw component count, not a ratio,
    # so it can outweigh the three similarity terms and push the score below 0.
    structural_difference = compute_structural_differences(*graphs)

    return (sum(similarities) + 1 - structural_difference) / 4


# CFG reward for the two example snippets defined earlier
example_cfg_reward = cfg_reward(code1, code2)
print(example_cfg_reward)


{'program': {'flowGraph': {'nodes': [{'id': 1, 'type': 'Entry'}, {'id': 2, 'type': 'SuccessExit'}], 'edges': [{'from': 1, 'to': 2, 'type': 'Epsilon', 'label': '', 'data': None}]}}, 'functions': [{'id': 1, 'name': 'Function', 'flowGraph': {'nodes': [{'id': 4, 'type': 'Entry'}, {'id': 5, 'type': 'SuccessExit'}, {'id': 7, 'type': 'Normal'}], 'edges': [{'from': 4, 'to': 7, 'type': 'Normal', 'label': 'this.socket.close()', 'data': {'type': 'CallExpression', 'callee': {'type': 'MemberExpression', 'computed': False, 'object': {'type': 'MemberExpression', 'computed': False, 'object': {'type': 'ThisExpression'}, 'property': {'type': 'Identifier', 'name': 'socket'}}, 'property': {'type': 'Identifier', 'name': 'close'}}, 'arguments': []}}, {'from': 7, 'to': 5, 'type': 'AbruptCompletion', 'label': 'return undefined', 'data': {'type': 'ReturnStatement', 'argument': {'type': 'Identifier', 'name': 'undefined'}}}]}}]}
0.75


Defining and Training the Critic Model

In [None]:
# Definition of the Critic Model
class CriticModel(nn.Module):
    """Three-layer fully connected network that outputs one value per input row."""

    def __init__(self, input_dim, hidden_dim):
        """Create the layers.

        Args:
            input_dim: Width of the input feature vector.
            hidden_dim: Width of both hidden layers.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)  # single scalar: the expected return

    def forward(self, x):
        """Return the unactivated scalar estimate for each row of ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Definition of the Environment Class
class CodeEnvironment:
    """Reward source that scores generated code against a target snippet."""

    def calculate_reward(self, generated_code, target_code):
        """Average the code-quality score and the CFG reward of ``generated_code``.

        NOTE(review): the quality score ranges over 0-100 while the CFG reward is
        built from ratios, so the quality term dominates; confirm this is intended.
        """
        component_scores = (
            calculate_code_quality_score(generated_code),
            cfg_reward(generated_code, target_code),
        )
        return sum(component_scores) / 2

# Load the actor model that generates candidate code for the critic's training signal.
# NOTE(review): the shared `tokenizer` was loaded from `model_name`, while this actor is
# 'Salesforce/codegen-350m-nl'; confirm the two use compatible vocabularies.
actor_model = AutoModelForCausalLM.from_pretrained('Salesforce/codegen-350m-nl', trust_remote_code=True)

# Set the device
actor_model = actor_model.to(device)
critic = CriticModel(input_dim=50, hidden_dim=256).to(device)  # 50 = padded sequence length
optimizer = optim.Adam(critic.parameters(), lr=0.01)
environment = CodeEnvironment()

# Training data loader
train_dataloader = DataLoader(train_dataset, batch_size=400, shuffle=True)

# Train the critic to predict, from the input token ids, the reward earned by the
# actor's generation for that input.
num_epochs = 3  # Adjust the number of epochs as needed
for epoch in range(num_epochs):
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Use the actor model to generate code
        generated_ids = actor_model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=50  # Allowable number of additional tokens
        )

        # Score every sample of the batch on its own. Previously only sample 0 was
        # decoded and its reward was broadcast to the whole batch, so the critic was
        # regressed onto a target unrelated to most of its inputs.
        generated_codes = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]
        target_codes = [tokenizer.decode(ids, skip_special_tokens=True) for ids in labels]
        rewards = [
            environment.calculate_reward(generated_code, target_code)
            for generated_code, target_code in zip(generated_codes, target_codes)
        ]
        reward_tensor = torch.tensor(rewards, dtype=torch.float32, device=device)

        # Get the expected return from the critic (token ids are fed as float features)
        predicted_value = critic(input_ids.float())

        # squeeze(-1) drops only the output dimension, so a final batch holding a
        # single sample keeps shape (1,) and still lines up with reward_tensor.
        loss = F.mse_loss(predicted_value.squeeze(-1), reward_tensor)

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Loss of the last batch of the epoch
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

# Save the critic model
torch.save(critic.state_dict(), 'critic_model.pth')
print("Critic model saved successfully!")

Defining the Router Model

In [None]:
class TrainingMethodSelector(nn.Module):
    """Small fully connected network that maps a feature vector to a probability."""

    def __init__(self, input_size=5, hidden_size=64):
        """Create the layers.

        Args:
            input_size: Number of input features.
            hidden_size: Width of both hidden layers.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return a sigmoid-squashed value in (0, 1) for each row of ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        logit = self.fc3(hidden)
        return torch.sigmoid(logit)


Training with PPO and SFT

In [None]:
# Define Critic Model
# NOTE(review): this re-declares the CriticModel already defined in the critic-training
# cell; the later definition shadows the earlier one. Keep a single definition if possible.

class CriticModel(nn.Module):
    """Three-layer fully connected network that outputs one value per input row."""

    def __init__(self, input_dim, hidden_dim):
        """Create the layers.

        Args:
            input_dim: Width of the input feature vector.
            hidden_dim: Width of both hidden layers.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)  # single scalar: the expected return

    def forward(self, x):
        """Return the unactivated scalar estimate for each row of ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Define reward calculation function
def calculate_rewards(policy_output, labels, critic_model, tokenizer):
    """Compute one scalar reward per (generated, target) pair.

    Each generated sequence is decoded, scored with ``calculate_code_quality_score``
    and ``cfg_reward``, re-encoded and fed (zero-padded or truncated to the critic's
    input width) to ``critic_model``.

    Args:
        policy_output: Iterable of generated token-id sequences.
        labels: Iterable of target token-id sequences, paired by position.
        critic_model: Model exposing ``fc1.in_features`` and returning one value.
        tokenizer: Tokenizer used for decoding and re-encoding.

    Returns:
        Float tensor on the module-level ``device`` with one reward per pair.
    """
    critic_width = critic_model.fc1.in_features
    rewards = []

    for generated_ids, target_ids in zip(policy_output, labels):
        gen_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
        target_text = tokenizer.decode(target_ids, skip_special_tokens=True)

        quality_score = calculate_code_quality_score(gen_text)
        cfg_score = cfg_reward(gen_text, target_text)

        # Re-encode the decoded text and fit it to the critic's input width
        critic_input = tokenizer.encode(gen_text, return_tensors='pt').to(device)
        current_width = critic_input.shape[1]
        if current_width < critic_width:
            filler = torch.zeros(1, critic_width - current_width).to(device)
            critic_input = torch.cat((critic_input, filler), dim=1)
        elif current_width > critic_width:
            critic_input = critic_input[:, :critic_width]

        critic_score = critic_model(critic_input.float())

        # NOTE(review): quality and CFG scores enter with a negative sign, so better
        # code yields a lower reward; confirm the sign convention against ppo_step.
        rewards.append((-quality_score - cfg_score + critic_score.item()) / 3)

    return torch.tensor(rewards).float().to(device)

# Define PPO step function
def ppo_step(model, inputs, old_log_probs, values, rewards, advantages, optimizer, clip_param=0.2):
    """Run one clipped-surrogate optimization step and return the loss value.

    Args:
        model: Callable as ``model(**inputs)`` returning an object with ``.logits``.
        inputs: Keyword arguments for the model's forward pass.
        old_log_probs: Log-probabilities of the policy before the update; must be
            viewable to the shape of the new log-probabilities.
        values: Value estimates compared with ``rewards.unsqueeze(-1)``.
        rewards: One reward per sample.
        advantages: Advantage estimates; after an extra trailing dimension is added
            they must be expandable to the shape of the log-probabilities.
        optimizer: Optimizer that is zeroed and stepped.
        clip_param: Clipping range for the probability ratio.

    Returns:
        The total loss (policy loss + 0.5 * value loss) as a Python float.
    """
    # The old policy and the advantages are constants of the surrogate objective.
    # If old_log_probs still carries a graph through the same parameters, the
    # gradient of (new - old) cancels and the policy term contributes nothing.
    old_log_probs = old_log_probs.detach()
    advantages = advantages.detach()

    # Forward pass to get new log_probs
    outputs = model(**inputs)
    new_log_probs = torch.log_softmax(outputs.logits, dim=-1)

    # Match shapes of new_log_probs and old_log_probs
    new_log_probs = new_log_probs.view_as(old_log_probs)

    # Probability ratios between the updated and the old policy.
    # NOTE(review): the ratio is taken over every vocabulary entry rather than only
    # over the sampled tokens; confirm this matches the intended objective.
    ratios = torch.exp(new_log_probs - old_log_probs)

    # Ensure advantages shape matches ratios shape
    advantages = advantages.unsqueeze(-1).expand_as(ratios)

    # Clipped surrogate objective (negated because the optimizer minimizes)
    surr1 = ratios * advantages
    surr2 = torch.clamp(ratios, 1.0 - clip_param, 1.0 + clip_param) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()

    # Value loss: mean squared error between value estimates and rewards
    value_loss = F.mse_loss(values, rewards.unsqueeze(-1))

    # Total loss
    loss = policy_loss + 0.5 * value_loss

    # Optimization step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

# Load the policy model: 4-bit weights, placed automatically across available devices
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto", load_in_4bit=True, quantization_config=None)

# Freeze everything except the last four transformer layers and the LM head
trainable_name_fragments = (
    "model.layers.28",
    "model.layers.29",
    "model.layers.30",
    "model.layers.31",
    "lm_head",
)
for name, parameter in model.named_parameters():
    if not any(fragment in name for fragment in trainable_name_fragments):
        parameter.requires_grad = False

# Load the critic model. When the checkpoint written by the critic-training cell is
# available, restore it and take the layer widths from the checkpoint itself so the
# architecture always matches the saved weights. Otherwise fall back to a freshly
# initialized critic with the previous default dimensions.
critic_checkpoint = Path('critic_model.pth')
if critic_checkpoint.exists():
    critic_state = torch.load(critic_checkpoint, map_location=device)
    # nn.Linear stores its weight as (out_features, in_features)
    critic_hidden_dim, critic_input_dim = critic_state['fc1.weight'].shape
    critic_model = CriticModel(input_dim=critic_input_dim, hidden_dim=critic_hidden_dim).to(device)
    critic_model.load_state_dict(critic_state)
else:
    critic_model = CriticModel(input_dim=512, hidden_dim=256).to(device)  # Adjust input_dim as per actual data

# Define data loaders
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)  # Reduce batch size
val_dataloader = DataLoader(val_dataset, batch_size=16)

# Optimizer and scheduler (only parameters left trainable above are optimized)
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5)
scheduler = StepLR(optimizer, step_size=1, gamma=0.95)

# Initialize the controller network and its optimizer
training_method_selector = TrainingMethodSelector(input_size=5, hidden_size=64).to(device)
controller_optimizer = torch.optim.Adam(training_method_selector.parameters(), lr=1e-3)

num_epochs = 10
ppo_max_steps = 3  # Maximum number of PPO training steps allowed
ppo_steps = 0

train_losses = []
val_losses = []
best_val_loss = float('inf')

# Baseline loss for reward calculation and moving average smoothing coefficient
baseline_loss = 1.0
alpha = 0.9

# Main training loop. For every batch a small controller network picks between a PPO
# step and a supervised (SFT) step; the controller itself is trained with a policy
# gradient whose reward is the drop in training loss relative to a running baseline.
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    model.train()
    running_loss = 0.0
    num_batches = len(train_dataloader)
    
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
        # Compute feature vector for the current batch
        # NOTE(review): compute_batch_features is not defined anywhere in the visible
        # notebook; it must be provided before this cell can run.
        features = compute_batch_features(batch)  # shape: [1, 5]
        
        # Controller network outputs decision probability
        prob = training_method_selector(features)  # Output shape: [1,1]
        m = Bernoulli(prob)
        action = m.sample()  # 0 for SFT training, 1 for PPO training
        log_prob = m.log_prob(action)
        
        # Select training method based on controller decision.
        # ppo_steps is never reset, so at most ppo_max_steps PPO updates happen over
        # the whole run; once exhausted, a sampled action of 1 falls through to SFT.
        if action.item() == 1 and ppo_steps < ppo_max_steps:
            print("PPO training for this batch")
            # PPO training branch
            # Move all batch data (except 'code') to the device
            batch = {k: v.to(device) for k, v in batch.items() if k != 'code'}
            outputs = model(**batch)
            # Greedy (argmax) tokens stand in for sampled actions
            predictions = torch.argmax(outputs.logits, dim=-1)
            old_log_probs = torch.log_softmax(outputs.logits, dim=-1)
            rewards = calculate_rewards(predictions, batch['labels'], critic_model, tokenizer)
            
            # Adjust input_ids shape to match the CriticModel's input requirements
            # (zero-pad on the right or truncate to fc1.in_features columns)
            input_ids = batch['input_ids'].float()
            if input_ids.shape[1] < critic_model.fc1.in_features:
                padding = torch.zeros(input_ids.shape[0],
                                      critic_model.fc1.in_features - input_ids.shape[1]).to(device)
                input_ids = torch.cat((input_ids, padding), dim=1)
            elif input_ids.shape[1] > critic_model.fc1.in_features:
                input_ids = input_ids[:, :critic_model.fc1.in_features]
                
            values = critic_model(input_ids)
            # Advantage = reward minus the critic's (detached) value estimate
            advantages = (rewards - values.squeeze().detach()).unsqueeze(-1)
            
            # Perform a PPO training step (ppo_step returns the training loss for this batch)
            training_loss_value = ppo_step(model, batch, old_log_probs, values, rewards, advantages, optimizer)
            ppo_steps += 1
        else:
            # SFT training branch: standard forward pass with labels, loss from the model
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            sft_loss = outputs.loss
            sft_loss.backward()
            optimizer.step()
            training_loss_value = sft_loss.item()
        
        running_loss += training_loss_value
        
        # Calculate reward: reduction in loss relative to the baseline (positive if loss decreased)
        reward = baseline_loss - training_loss_value
        # Update baseline using a moving average (alpha is the smoothing factor)
        baseline_loss = alpha * baseline_loss + (1 - alpha) * training_loss_value
        
        # Update the controller network using a policy gradient objective (maximize reward)
        controller_loss = -log_prob * reward  # reward is a Python float; log_prob holds one element
        controller_optimizer.zero_grad()
        controller_loss.backward()
        controller_optimizer.step()
        
    avg_loss = running_loss / num_batches
    print(f"Epoch {epoch+1}/{num_epochs} | Average Training Loss: {avg_loss}")
    train_losses.append(avg_loss)
    
    # Validation phase (using SFT for validation)
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f"Validation Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()
    val_loss /= len(val_dataloader)
    val_losses.append(val_loss)
    
    # Checkpoint whenever the validation loss improves
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"Model saved at epoch {epoch+1} with validation loss {val_loss}")
    
    # Decay the learning rate once per epoch
    scheduler.step()

# Per-epoch training vs. validation loss
plt.figure(figsize=(10, 5))
for loss_history, curve_label in (
    (train_losses, 'Training Loss'),
    (val_losses, 'Validation Loss'),
):
    plt.plot(loss_history, label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Over Epochs')
plt.legend()
plt.show()

Generating Predictions and Evaluating Code Quality with CodeBLEU

In [None]:
# Batches of the held-out split, served in their stored order
test_dataloader = DataLoader(dataset=test_dataset, batch_size=200, shuffle=False)

# Function to generate predictions from the model
def generate_predictions(data_loader, max_length=512):
    """Generate text for every batch and decode it together with the labels.

    Uses the module-level ``model``, ``tokenizer`` and ``device``.

    Args:
        data_loader: Iterable of batches with 'input_ids', 'attention_mask', 'labels'.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        ``(predictions, references)``: two lists of decoded strings, aligned by index.
    """
    def decode_all(sequences):
        return [
            tokenizer.decode(sequence, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for sequence in sequences
        ]

    model.eval()
    predictions, references = [], []
    with torch.no_grad():
        for batch in data_loader:
            generated = model.generate(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                max_length=max_length,
            )
            predictions += decode_all(generated)
            references += decode_all(batch['labels'])
    return predictions, references

# Generate predictions for the test split with a shorter generation budget
predictions, references = generate_predictions(test_dataloader, max_length=64)

# Filter predictions and references
# NOTE(review): this drops samples 300..400 (inclusive) without a stated reason;
# document why these samples are excluded or remove the filter.
predictions = predictions[:300] + predictions[401:]
references = references[:300] + references[401:]

# Calculate CodeBLEU score. calc_codebleu takes the references first and the
# predictions second; the metric is not symmetric, so the order matters.
res = calc_codebleu(references, predictions, "javascript")

# Calculate corpus-level BLEU with NLTK. corpus_bleu expects, per sample, a list of
# tokenized references and one tokenized hypothesis. Passing raw strings makes it
# iterate over characters, so both sides are split into whitespace-separated tokens.
# NOTE(review): the n-gram weights sum to 0.4 rather than 1; confirm this is intended.
tokenized_references = [[reference.split()] for reference in references]
tokenized_predictions = [prediction.split() for prediction in predictions]
res2 = corpus_bleu(
    tokenized_references,
    tokenized_predictions,
    weights=[0.1, 0.1, 0.1, 0.1],
    smoothing_function=SmoothingFunction().method4,
)

print(res, res2)