In [1]:
# uv pip install torch transformers numpy

In [1]:
# PyTorch imports
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# Model imports
from model import SmartContractVulnerabilityGAN

# Training imports
from train import VulnerabilityDetectionTrainer

# Data processing imports
from data_processing import SmartContractDataset, preprocess_contract

# Optional but useful imports
import numpy as np
from tqdm import tqdm  # for progress bars
import logging 


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.11/runpy.py", line 198, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.11/runpy.py", line 88, in _run_code
    exec(code, run_globals)
  File "/home/m20180848/pytorch_env/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/m20180848/pytorch_env/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/m20180848/pytorch_env

In [2]:
print(f"CUDA is available: {torch.cuda.is_available()}")
print(f"Number of GPUs: {torch.cuda.device_count()}")

CUDA is available: True
Number of GPUs: 2


In [3]:
from datasets import load_dataset

ds = load_dataset("jainabh/smart_contracts_malicious")

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['contract_source', 'malicious'],
        num_rows: 2000
    })
})

In [5]:
2000 * 0.7

1400.0

In [6]:
# Initialize model with correct dimensions
model = SmartContractVulnerabilityGAN(d_model=768)  # Use 768 to match CodeBERT's output

# Move model to CUDA
model = model.cuda()

# Load and preprocess data
train_contracts = ds['train'][0:1400]['contract_source']  # Changed from 'contract_source' to 'source_code'
train_labels = ds['train'][0:1400]['malicious']
val_contracts = ds['train'][1400:-1]['contract_source']  # Changed from 'contract_source' to 'source_code'
val_labels = ds['train'][1400:-1]['malicious']

In [7]:
import re
from typing import List, Dict, Any
import json

def parse_solidity_to_ast(code: str) -> Dict[str, Any]:
    """
    Parse Solidity code into a simplified AST structure
    """
    def extract_contract_info(code: str) -> Dict[str, Any]:
        # Extract contract name
        contract_match = re.search(r'contract\s+(\w+)', code)
        contract_name = contract_match.group(1) if contract_match else "Unknown"
        
        # Extract functions
        functions = []
        function_pattern = r'function\s+(\w+)\s*\(([^)]*)\)\s*(?:public|private|internal|external)?\s*(?:view|pure|payable)?\s*(?:returns\s*\(([^)]*)\))?\s*{'
        for match in re.finditer(function_pattern, code):
            func_name = match.group(1)
            params = match.group(2).split(',') if match.group(2) else []
            returns = match.group(3).split(',') if match.group(3) else []
            
            functions.append({
                'name': func_name,
                'parameters': [p.strip() for p in params],
                'returns': [r.strip() for r in returns]
            })
        
        # Extract state variables
        variables = []
        var_pattern = r'(?:uint|address|string|bool|mapping)\s+(?:\w+)\s+(\w+)'
        for match in re.finditer(var_pattern, code):
            variables.append(match.group(1))
        
        return {
            'type': 'Contract',
            'name': contract_name,
            'functions': functions,
            'variables': variables
        }
    
    try:
        # Clean the code
        code = re.sub(r'//.*?\n|/\*.*?\*/', '', code)  # Remove comments
        code = re.sub(r'\s+', ' ', code)  # Normalize whitespace
        
        # Parse the code
        ast = extract_contract_info(code)
        return ast
    except Exception as e:
        print(f"Error parsing code: {str(e)}")
        return None

def prepare_code2vec_input(ast: Dict[str, Any]) -> List[str]:
    """
    Convert AST to code2vec input format
    """
    paths = []
    
    def extract_paths(node: Dict[str, Any], current_path: List[str] = None):
        if current_path is None:
            current_path = []
            
        # Add current node to path
        if 'name' in node:
            current_path.append(node['name'])
            
        # Process functions
        if 'functions' in node:
            for func in node['functions']:
                func_path = current_path + [func['name']]
                paths.append(' '.join(func_path))
                
                # Add parameter paths
                for param in func['parameters']:
                    param_path = func_path + [param]
                    paths.append(' '.join(param_path))
                
                # Add return paths
                for ret in func['returns']:
                    ret_path = func_path + [ret]
                    paths.append(' '.join(ret_path))
        
        # Process variables
        if 'variables' in node:
            for var in node['variables']:
                var_path = current_path + [var]
                paths.append(' '.join(var_path))
    
    extract_paths(ast)
    return paths

# Example usage:
def process_contract_for_code2vec(code: str) -> List[str]:
    """
    Process a Solidity contract for code2vec
    """
    # Parse code to AST
    ast = parse_solidity_to_ast(code)
    if ast is None:
        return []
    
    # Convert AST to code2vec input format
    paths = prepare_code2vec_input(ast)
    return paths

In [8]:
# Create custom dataset that includes path embeddings
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, AutoModel  # Add these imports

class SmartContractDatasetWithPaths(Dataset):
    def __init__(self, contracts, labels, tokenizer, code2vec_model):
        self.contracts = contracts
        self.labels = labels
        self.tokenizer = tokenizer
        self.code2vec_model = code2vec_model
        
    def __len__(self):
        return len(self.contracts)
    
    def __getitem__(self, idx):
        contract = self.contracts[idx]
        label = self.labels[idx]
        
        # Parse contract to AST and generate paths
        ast = parse_solidity_to_ast(contract)
        paths = prepare_code2vec_input(ast)
        
        # Convert paths to string for tokenization
        paths_str = ' '.join([' '.join(path) for path in paths])
        
        # Tokenize contract
        contract_inputs = self.tokenizer(
            contract,
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        
        # Tokenize paths
        path_inputs = self.tokenizer(
            paths_str,
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        
        return {
            'input_ids': contract_inputs['input_ids'].squeeze(0),
            'attention_mask': contract_inputs['attention_mask'].squeeze(0),
            'path_input_ids': path_inputs['input_ids'].squeeze(0),
            'path_attention_mask': path_inputs['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.float)
        }

In [9]:
# Initialize tokenizer and code2vec model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
code2vec_model = AutoModel.from_pretrained('microsoft/codebert-base').cuda()

# Create datasets
train_dataset = SmartContractDatasetWithPaths(
    train_contracts, 
    train_labels,
    tokenizer,
    code2vec_model
)

val_dataset = SmartContractDatasetWithPaths(
    val_contracts,
    val_labels,
    tokenizer,
    code2vec_model
)

# Create dataloaders
train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False
)

In [None]:
# Initialize trainer
from train import VulnerabilityDetectionTrainer
import time
from datetime import datetime
import torch
import os

# Create a directory for checkpoints
checkpoint_dir = 'v4-checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

# Initialize trainer
trainer = VulnerabilityDetectionTrainer(
    model,
    train_dataloader,
    val_dataloader
)

# Training loop
num_epochs = 100
best_val_loss = float('inf')

for epoch in range(num_epochs):
    # Start timer for this epoch
    epoch_start_time = time.time()
    
    # Training
    g_loss, d_loss, decoder_loss = trainer.train_epoch()
    val_loss = trainer.validate()
    
    # Calculate epoch time
    epoch_time = time.time() - epoch_start_time
    
    # Print training progress
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Generator Loss: {g_loss:.4f}")
    print(f"Discriminator Loss: {d_loss:.4f}")
    print(f"Decoder Loss: {decoder_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Epoch Time: {epoch_time:.2f}s")
    
    # Save model checkpoint
    if (epoch + 1) % 10 == 0:
        checkpoint = {
            # Model states
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'generator_state_dict': model.generator.state_dict(),
            'discriminator_state_dict': model.discriminator.state_dict(),
            'decoder_state_dict': model.decoder.state_dict(),
    
            # Optimizer states
            'optimizer_G_state_dict': trainer.optimizer_G.state_dict(),
            'optimizer_D_state_dict': trainer.optimizer_D.state_dict(),
            'optimizer_decoder_state_dict': trainer.optimizer_decoder.state_dict(),
    
            # Loss values
            'g_loss': g_loss,
            'd_loss': d_loss,
            'decoder_loss': decoder_loss,
            'val_loss': val_loss,
    
            # Model configuration
            'model_config': {
                'd_model': model.d_model,
                'vocab_size': model.vocab_size,
                'max_length': model.max_length
            },
    
            # Training metadata
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'epoch_time': epoch_time
        }
    
        # Save regular checkpoint
        checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch+1}_model_v4.pt')
        torch.save(checkpoint, checkpoint_path)
        print(f"Saved checkpoint to {checkpoint_path}")
    
        # Save best model if validation loss improved
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_path = os.path.join(checkpoint_dir, 'best_model_v4.pt')
            torch.save(checkpoint, best_model_path)
            print(f"New best model saved with validation loss: {val_loss:.4f}")


print("\nTraining completed!")
print(f"Best validation loss: {best_val_loss:.4f}")


Starting training epoch...


In [None]:
print('Done')

## Re-train model:

In [11]:
# Initialize trainer
from train import VulnerabilityDetectionTrainer
import time
from datetime import datetime
import torch
import os

# Create a directory for checkpoints
checkpoint_dir = 'v3-checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

# Initialize trainer
trainer = VulnerabilityDetectionTrainer(
    model,
    train_dataloader,
    val_dataloader
)

In [14]:
checkpoint_path

'v3-checkpoints/checkpoint_epoch_30_model_v3.pt'

In [16]:
# Load checkpoint
checkpoint_path = os.path.join(checkpoint_dir, 'checkpoint_epoch_20_model_v3.pt')  # Change this to your checkpoint file
checkpoint = torch.load(checkpoint_path)

# Load model states
model.load_state_dict(checkpoint['model_state_dict'])
model.generator.load_state_dict(checkpoint['generator_state_dict'])
model.discriminator.load_state_dict(checkpoint['discriminator_state_dict'])
model.decoder.load_state_dict(checkpoint['decoder_state_dict'])

# Load optimizer states
trainer.optimizer_G.load_state_dict(checkpoint['optimizer_G_state_dict'])
trainer.optimizer_D.load_state_dict(checkpoint['optimizer_D_state_dict'])
trainer.optimizer_decoder.load_state_dict(checkpoint['optimizer_decoder_state_dict'])

# Get the epoch to start from and best validation loss
start_epoch = checkpoint['epoch']
best_val_loss = checkpoint['val_loss']

print(f"Loaded checkpoint from epoch {start_epoch + 1}")
print(f"Previous validation loss: {best_val_loss:.4f}")

Loaded checkpoint from epoch 20
Previous validation loss: 0.0070


In [None]:
# Training loop - start from the next epoch
num_epochs = 120

for epoch in range(start_epoch + 1, num_epochs):  # Start from the next epoch
    # Start timer for this epoch
    epoch_start_time = time.time()
    
    # Training
    g_loss, d_loss, decoder_loss = trainer.train_epoch()
    val_loss = trainer.validate()
    
    # Calculate epoch time
    epoch_time = time.time() - epoch_start_time
    
    # Print training progress
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Generator Loss: {g_loss:.4f}")
    print(f"Discriminator Loss: {d_loss:.4f}")
    print(f"Decoder Loss: {decoder_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Epoch Time: {epoch_time:.2f}s")
    
    # Save model checkpoint
    if (epoch + 1) % 10 == 0:
        checkpoint = {
            # Model states
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'generator_state_dict': model.generator.state_dict(),
            'discriminator_state_dict': model.discriminator.state_dict(),
            'decoder_state_dict': model.decoder.state_dict(),
            
            # Optimizer states
            'optimizer_G_state_dict': trainer.optimizer_G.state_dict(),
            'optimizer_D_state_dict': trainer.optimizer_D.state_dict(),
            'optimizer_decoder_state_dict': trainer.optimizer_decoder.state_dict(),
            
            # Loss values
            'g_loss': g_loss,
            'd_loss': d_loss,
            'decoder_loss': decoder_loss,
            'val_loss': val_loss,
            
            # Model configuration
            'model_config': {
                'vocab_size': model.decoder.vocab_size,
                'max_length': model.decoder.max_length
            },
            
            # Training metadata
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'epoch_time': epoch_time
        }
        
        # Save regular checkpoint
        checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch+1}_model_v3.pt')
        torch.save(checkpoint, checkpoint_path)
        print(f"Saved checkpoint for epoch {epoch+1}")
        
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_path = os.path.join(checkpoint_dir, 'best_model_v3.pt')
            torch.save(checkpoint, best_model_path)
            print(f"New best model saved with validation loss: {val_loss:.4f}")

print("\nTraining completed!")
print(f"Best validation loss: {best_val_loss:.4f}")


Starting training epoch...


In [None]:
print('Done')

NOTES:

1. Input Processing:
Initial input: [32, 512] (batch_size=32, sequence_length=512)
After embedding: [32, 512, 512] (batch_size=32, sequence_length=512, embedding_dim=512)
This is correct because the embedding layer converts each token to a 512-dimensional vector

2. Path Embeddings Processing:
Initial path embeddings: [32, 768] (batch_size=32, code2vec_dim=768)
After path embedding layer: [32, 512] (batch_size=32, transformer_dim=512)
The linear layer converts from code2vec's 768 dimensions to transformer's 512 dimensions
After expansion: [32, 512, 512] (batch_size=32, sequence_length=512, transformer_dim=512)
The path embeddings are expanded to match the sequence length

3. Final Shape:
[32, 512, 512] (batch_size=32, sequence_length=512, transformer_dim=512)
This is the correct shape for the transformer layers


In [15]:
checkpoint = {
    # Model states
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'generator_state_dict': model.generator.state_dict(),
    'discriminator_state_dict': model.discriminator.state_dict(),
    'decoder_state_dict': model.decoder.state_dict(),
    
    # Optimizer states
    'optimizer_G_state_dict': trainer.optimizer_G.state_dict(),
    'optimizer_D_state_dict': trainer.optimizer_D.state_dict(),
    'optimizer_decoder_state_dict': trainer.optimizer_decoder.state_dict(),
    
    # Loss values
    'g_loss': g_loss,
    'd_loss': d_loss,
    'decoder_loss': decoder_loss,
    'val_loss': val_loss,
    
    # Model configuration
    'model_config': {
        #'d_model': model.d_model,
        'vocab_size': model.decoder.vocab_size,
        'max_length': model.decoder.max_length
    },
    
    # Training metadata
    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    'epoch_time': epoch_time
}

# Save regular checkpoint
checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch+1}_model_v3.pt')
torch.save(checkpoint, checkpoint_path)
print(f"Saved checkpoint for epoch {epoch+1}")

Saved checkpoint for epoch 10


In [25]:
print(f"Epoch [{epoch}/{num_epochs}]")
print(f"Generator Loss: {g_loss:.4f}")
print(f"Discriminator Loss: {d_loss:.4f}")
print(f"Validation Loss: {val_loss:.4f}")

Epoch [119/120]
Generator Loss: 10.5703
Discriminator Loss: 0.0062
Validation Loss: 0.0002


## Save model:

In [26]:
# After training loop
# Save the final model and training state
torch.save({
    'epoch': num_epochs,
    'model_state_dict': model.state_dict(),
    'optimizer_G_state_dict': trainer.optimizer_G.state_dict(),
    'optimizer_D_state_dict': trainer.optimizer_D.state_dict(),
    'g_loss': g_loss,
    'd_loss': d_loss,
    'val_loss': val_loss,
    'model_config': {
        'd_model': 768,
    }
}, 'final_model_v3.pt')

# If you want to save just the model for inference
torch.save(model.state_dict(), 'model_weights_v3.pt')

# If you want to save the entire model
torch.save(model, 'full_model_v3.pt')

In [27]:
# Save model with additional information
torch.save({
    'epoch': num_epochs,
    'model_state_dict': model.state_dict(),
    'optimizer_G_state_dict': trainer.optimizer_G.state_dict(),
    'optimizer_D_state_dict': trainer.optimizer_D.state_dict(),
    'g_loss': g_loss,
    'd_loss': d_loss,
    'val_loss': val_loss,
    'model_config': {
        'd_model': 768
    },
    'training_config': {
        'learning_rate': 0.0002,
        'beta1': 0.5,
        'batch_size': 32
    },
    'training_history': {
        'g_losses': g_loss,  # List of generator losses
        'd_losses': d_loss,  # List of discriminator losses
        'val_losses': val_loss  # List of validation losses
    }
}, 'final_model_v3_with_history.pt')

# Load Model:

In [10]:
# Load the full training state
checkpoint = torch.load('final_model_v3.pt')
model = SmartContractVulnerabilityGAN(**checkpoint['model_config'])
model.load_state_dict(checkpoint['model_state_dict'])
model = model.cuda() 

# Initialize trainer with loaded model
trainer = VulnerabilityDetectionTrainer(
    model,
    train_dataloader,
    val_dataloader
)

# Load optimizer states if needed
trainer.optimizer_G.load_state_dict(checkpoint['optimizer_G_state_dict'])
trainer.optimizer_D.load_state_dict(checkpoint['optimizer_D_state_dict'])


In [21]:
# Or if you just want to load the model weights
model = SmartContractVulnerabilityGAN(d_model=768)
model.load_state_dict(torch.load('model_weights.pt'))
model = model.cuda()  # Move to GPU if needed


In [None]:
# Or if you saved the entire model
model = torch.load('full_model.pt')
model = model.cuda()  # Move to GPU if needed

## Model Exploration:

In [13]:
print("Type of model:", type(model))

# If it's a SmartContractVulnerabilityGAN object, we can inspect its components directly
print("\nModel Components:")
print("-" * 50)

# Print model attributes
print("\nModel Attributes:")
for attr in dir(model):
    if not attr.startswith('_'):  # Skip private attributes
        print(f"- {attr}")

# Print model structure
print("\nModel Structure:")
print(model)


Type of model: <class 'model.SmartContractVulnerabilityGAN'>

Model Components:
--------------------------------------------------

Model Attributes:
- T_destination
- add_module
- apply
- bfloat16
- buffers
- call_super_init
- children
- codebert
- compile
- cpu
- cuda
- decode_embeddings
- decoder
- discriminator
- double
- dump_patches
- eval
- extra_repr
- float
- forward
- generate_code
- generator
- get_buffer
- get_extra_state
- get_parameter
- get_submodule
- half
- ipu
- load_state_dict
- modules
- named_buffers
- named_children
- named_modules
- named_parameters
- parameters
- register_backward_hook
- register_buffer
- register_forward_hook
- register_forward_pre_hook
- register_full_backward_hook
- register_full_backward_pre_hook
- register_load_state_dict_post_hook
- register_module
- register_parameter
- register_state_dict_pre_hook
- requires_grad_
- set_extra_state
- share_memory
- state_dict
- to
- to_empty
- tokenizer
- train
- training
- transformer
- type
- xpu
- zer

In [14]:
print("\nModel State Dict:")
for key in model.state_dict().keys():
    print(f"- {key}")


Model State Dict:
- codebert.embeddings.word_embeddings.weight
- codebert.embeddings.position_embeddings.weight
- codebert.embeddings.token_type_embeddings.weight
- codebert.embeddings.LayerNorm.weight
- codebert.embeddings.LayerNorm.bias
- codebert.encoder.layer.0.attention.self.query.weight
- codebert.encoder.layer.0.attention.self.query.bias
- codebert.encoder.layer.0.attention.self.key.weight
- codebert.encoder.layer.0.attention.self.key.bias
- codebert.encoder.layer.0.attention.self.value.weight
- codebert.encoder.layer.0.attention.self.value.bias
- codebert.encoder.layer.0.attention.output.dense.weight
- codebert.encoder.layer.0.attention.output.dense.bias
- codebert.encoder.layer.0.attention.output.LayerNorm.weight
- codebert.encoder.layer.0.attention.output.LayerNorm.bias
- codebert.encoder.layer.0.intermediate.dense.weight
- codebert.encoder.layer.0.intermediate.dense.bias
- codebert.encoder.layer.0.output.dense.weight
- codebert.encoder.layer.0.output.dense.bias
- codebert.e

In [15]:
print("\nParameter Shapes:")
for name, param in model.named_parameters():
    print(f"- {name}: {param.shape}")


Parameter Shapes:
- codebert.embeddings.word_embeddings.weight: torch.Size([50265, 768])
- codebert.embeddings.position_embeddings.weight: torch.Size([514, 768])
- codebert.embeddings.token_type_embeddings.weight: torch.Size([1, 768])
- codebert.embeddings.LayerNorm.weight: torch.Size([768])
- codebert.embeddings.LayerNorm.bias: torch.Size([768])
- codebert.encoder.layer.0.attention.self.query.weight: torch.Size([768, 768])
- codebert.encoder.layer.0.attention.self.query.bias: torch.Size([768])
- codebert.encoder.layer.0.attention.self.key.weight: torch.Size([768, 768])
- codebert.encoder.layer.0.attention.self.key.bias: torch.Size([768])
- codebert.encoder.layer.0.attention.self.value.weight: torch.Size([768, 768])
- codebert.encoder.layer.0.attention.self.value.bias: torch.Size([768])
- codebert.encoder.layer.0.attention.output.dense.weight: torch.Size([768, 768])
- codebert.encoder.layer.0.attention.output.dense.bias: torch.Size([768])
- codebert.encoder.layer.0.attention.output.La

In [21]:
print("\nModel Configuration:")
print(f"- d_model: {model.d_model if hasattr(model, 'd_model') else 'N/A'}")
print(f"- vocab_size: {model.decoder.vocab_size if hasattr(model, 'decoder') else 'N/A'}")
print(f"- max_length: {model.decoder.max_length if hasattr(model, 'decoder') else 'N/A'}")


Model Configuration:
- d_model: N/A
- vocab_size: 50000
- max_length: 512


In [22]:
if hasattr(model, 'generator'):
    print("\nGenerator Architecture:")
    print(model.CodeDecoder)


Generator Architecture:


AttributeError: 'SmartContractVulnerabilityGAN' object has no attribute 'CodeDecoder'

### This is a GAN (Generative Adversarial Network) combined with a Transformer architecture for smart contract vulnerability detection.

#### Here's the technical breakdown:
#### 1. Architecture Components:
-Transformer Encoder: Processes smart contract code using self-attention
-Generator: Creates synthetic vulnerable code patterns
-Discriminator: Distinguishes between real and synthetic vulnerabilities

#### 2. Input Processing:
-Takes smart contract code and its AST (Abstract Syntax Tree) paths
-Uses CodeBERT to generate embeddings (768-dimensional vectors)
-Processes both contract code and path information

#### 3. Training Process:
3.1. Generator Training:
-Takes random noise and contract embeddings
-Generates synthetic vulnerable code patterns
-Tries to fool the discriminator

3.2. Discriminator Training:
-Takes real contract embeddings and generator outputs
-Learns to distinguish real from synthetic vulnerabilities
-Uses binary classification (real/fake)

#### 4. Output:
-Vulnerability scores for input contracts
-Synthetic vulnerable code patterns for training
-Binary classification of real vs. synthetic vulnerabilities

#### The model essentially learns to:
-Understand code patterns through the transformer
-Generate realistic vulnerable code examples
-Detect vulnerabilities in real contracts
-Improve detection through adversarial training

#### This approach combines the strengths of:
Transformers for code understanding
GANs for synthetic data generation
Binary classification for vulnerability detection

# 3. Usage:

In [36]:
import torch
from transformers import AutoTokenizer, AutoModel
from datetime import datetime
import numpy as np

class VulnerabilityPredictor:
    def __init__(self, model_path):
        """
        Initialize the vulnerability predictor
        Args:
            model_path: Path to the trained model checkpoint
        """
        try:
            # Load model and tokenizer
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            
            # Load the checkpoint
            checkpoint = torch.load(model_path, map_location=self.device)
            print(f"Loaded checkpoint type: {type(checkpoint)}")
            
            # Initialize the model architecture
            self.model = SmartContractVulnerabilityGAN(
                d_model=768,  # CodeBERT's hidden size
            ).to(self.device)
            
            # Load the state dict
            if isinstance(checkpoint, dict):
                if 'model_state_dict' in checkpoint:
                    self.model.load_state_dict(checkpoint['model_state_dict'])
                    print("Loaded model from model_state_dict")
                else:
                    self.model.load_state_dict(checkpoint)
                    print("Loaded model from checkpoint dict")
            else:
                self.model = checkpoint
                print("Loaded full model from checkpoint")
                
            self.model.eval()
            
            # Initialize CodeBERT
            self.tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
            self.codebert = AutoModel.from_pretrained("microsoft/codebert-base").to(self.device)
            
            # Get decoder from model
            self.decoder = self.model.decoder
            
            print("Model loaded successfully")
            print(f"Using device: {self.device}")
            
        except Exception as e:
            print(f"Error initializing model: {str(e)}")
            raise
    
    def preprocess_contract(self, contract_code):
        """
        Preprocess contract code for model input
        """
        # Tokenize contract
        tokens = self.tokenizer(
            contract_code,
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )
        
        # Move to device
        input_ids = tokens['input_ids'].to(self.device)
        attention_mask = tokens['attention_mask'].to(self.device)
        
        return input_ids, attention_mask
    
    def predict_vulnerability(self, contract_code, threshold=0.5):
        """
        Predict if a contract is vulnerable and generate synthetic vulnerable code
        Returns: (is_vulnerable, vulnerability_score, synthetic_code)
        """
        with torch.no_grad():
            try:
                # Preprocess contract
                input_ids, attention_mask = self.preprocess_contract(contract_code)
                
                # Get model prediction
                outputs = self.model(
                    contract_ids=input_ids,
                    path_ids=input_ids,
                    contract_attention_mask=attention_mask,
                    path_attention_mask=attention_mask
                )
                
                # Get vulnerability score
                real_scores = outputs['real_scores']
                vulnerability_score = torch.sigmoid(real_scores).item()
                is_vulnerable = vulnerability_score > threshold
                
                # Generate synthetic vulnerable code if contract is vulnerable
                synthetic_code = None
                if is_vulnerable:
                    try:
                        # Get synthetic embeddings from generator
                        synthetic_embeddings = outputs['synthetic']
                        
                        # Decode synthetic embeddings to code
                        decoded_code = outputs['decoded_code']
                        tokens = torch.argmax(decoded_code, dim=-1)
                        
                        # Convert tokens to code
                        synthetic_code = self.tokenizer.decode(tokens[0])
                    except Exception as e:
                        print(f"Warning: Failed to generate synthetic code: {str(e)}")
                        synthetic_code = None
                
                return is_vulnerable, vulnerability_score, synthetic_code
                
            except Exception as e:
                print(f"Error in vulnerability prediction: {str(e)}")
                return False, 0.0, None
    
    def analyze_vulnerability(self, contract_code):
        """
        Comprehensive vulnerability analysis
        Returns: Dictionary containing vulnerability analysis results
        """
        try:
            is_vulnerable, score, synthetic_code = self.predict_vulnerability(contract_code)
            
            analysis = {
                'is_vulnerable': is_vulnerable,
                'vulnerability_score': score,
                'synthetic_vulnerable_code': synthetic_code if is_vulnerable else None,
                'original_code': contract_code,
                'analysis_timestamp': datetime.now().isoformat()
            }
            
            return analysis
        except Exception as e:
            print(f"Error in vulnerability analysis: {str(e)}")
            return {
                'is_vulnerable': False,
                'vulnerability_score': 0.0,
                'synthetic_vulnerable_code': None,
                'original_code': contract_code,
                'analysis_timestamp': datetime.now().isoformat(),
                'error': str(e)
            }
    
    def generate_synthetic_vulnerable_code(self, num_samples=1):
        """
        Generate synthetic vulnerable code samples
        Args:
            num_samples: Number of samples to generate
        Returns: List of generated code samples
        """
        with torch.no_grad():
            try:
                samples = []
                for _ in range(num_samples):
                    # Generate code using model's generate_code method
                    code = self.model.generate_code(num_samples=1)
                    samples.append(code)
                
                return samples
            except Exception as e:
                print(f"Error generating synthetic code: {str(e)}")
                return []

# Example usage
if __name__ == "__main__":
    # Initialize predictor
    predictor = VulnerabilityPredictor('final_model_v3.pt')
    
    # Example contract
    contract_code = """
    pragma solidity ^0.8.0;
    contract Example {
        uint256 private value;
        
        function setValue(uint256 _value) public {
            value = _value;
        }
        
        function getValue() public view returns (uint256) {
            return value;
        }
    }
    """
    
    # Get comprehensive analysis
    analysis = predictor.analyze_vulnerability(contract_code)
    print(f"Vulnerability Score: {analysis['vulnerability_score']}")
    print(f"Is Vulnerable: {analysis['is_vulnerable']}")
    if analysis['synthetic_vulnerable_code']:
        print("\nSynthetic Vulnerable Code:")
        print(analysis['synthetic_vulnerable_code'])
    
    # Generate synthetic vulnerable code
    synthetic_samples = predictor.generate_synthetic_vulnerable_code(num_samples=1)
    print("\nGenerated Synthetic Code:")
    for i, code in enumerate(synthetic_samples, 1):
        print(f"\nSample {i}:")
        print(code)

Loaded checkpoint type: <class 'dict'>
Loaded model from model_state_dict
Model loaded successfully
Using device: cuda
Vulnerability Score: 0.731036365032196
Is Vulnerable: True

Synthetic Vulnerable Code:
<s>pragma <pad><pad>

Generated Synthetic Code:

Sample 1:
<s>/** tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem tradem 

In [35]:
def analyze_contract(model, contract_code):
    # First, let's prepare the input
    inputs = model.tokenizer(
        contract_code,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )
    
    # Move inputs to the same device as the model
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Create path inputs with the same length as contract inputs
    path_inputs = model.tokenizer(
        "dummy path" * 100,  # Make it longer to match contract length
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=inputs['input_ids'].size(1)  # Match contract length
    )
    path_inputs = {k: v.to(device) for k, v in path_inputs.items()}
    
    # Print shapes for debugging
    print(f"Contract input shape: {inputs['input_ids'].shape}")
    print(f"Path input shape: {path_inputs['input_ids'].shape}")
    
    # Get the model outputs
    with torch.no_grad():
        # Get CodeBERT embeddings for contract
        contract_outputs = model.codebert(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask']
        )
        contract_embeddings = contract_outputs.last_hidden_state
        
        # Get CodeBERT embeddings for paths
        path_outputs = model.codebert(
            input_ids=path_inputs['input_ids'],
            attention_mask=path_inputs['attention_mask']
        )
        path_embeddings = path_outputs.last_hidden_state
        
        # Combine embeddings
        combined_embeddings = contract_embeddings + path_embeddings
        
        # Process through transformer
        transformed = model.transformer(combined_embeddings)
        
        # Get mean representation
        mean_embeddings = transformed.mean(dim=1)
        
        # Get vulnerability score from discriminator
        vulnerability_score = model.discriminator(mean_embeddings).item()
        
        # Generate synthetic code
        synthetic_embeddings = model.generator(mean_embeddings)
        decoded_code = model.decoder(synthetic_embeddings, transformed)
        tokens = torch.argmax(decoded_code, dim=-1)
        generated_code = model.tokenizer.decode(tokens[0])
    
    return {
        'vulnerability_score': vulnerability_score,
        'generated_code': generated_code
    }

# Example usage:
contract_code = """
pragma solidity ^0.8.0;

contract Example {
    mapping(address => uint) private balances;
    
    function deposit() public payable {
        balances[msg.sender] += msg.value;
    }
    
    function withdraw(uint amount) public {
        require(balances[msg.sender] >= amount);
        balances[msg.sender] -= amount;
        payable(msg.sender).transfer(amount);
    }
}
"""

# Make sure the model is in eval mode
model.eval()

# Get the analysis
results = analyze_contract(model, contract_code)

print(f"Vulnerability Score: {results['vulnerability_score']:.4f}")
print("\nGenerated Code:")
print(results['generated_code'])

Contract input shape: torch.Size([1, 146])
Path input shape: torch.Size([1, 146])
Vulnerability Score: 0.0000

Generated Code:
<s>/** tradem                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             


In [32]:
print('Done')

Done
