# Enhanced CodeBERT for Swift Code Understanding (Debug Version)

This is a modified version of the original notebook with debugging enhancements to troubleshoot training issues.

In [None]:
# Install required libraries
!pip install transformers datasets evaluate torch scikit-learn tqdm dropbox requests


In [None]:
# Important: These imports must be properly separated
import os
import json
import torch
import random
import numpy as np
import time
import gc
import re
import collections
import psutil  # Add psutil for memory monitoring
from tqdm.auto import tqdm
from datasets import load_dataset, ClassLabel
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    RobertaForSequenceClassification,
    Trainer, 
    TrainingArguments,
    set_seed,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    get_scheduler
)

# Import AdamW from torch.optim instead of transformers.optimization
from torch.optim import AdamW
from transformers.trainer_utils import get_last_checkpoint

# Seed the random number generators through transformers' `set_seed` helper
# so that repeated runs of this notebook are reproducible.
set_seed(42)

# Add memory management function with more detailed reporting
def cleanup_memory():
    """Run the garbage collector, release cached CUDA memory, and report the effect.

    Prints this process's resident memory before and after the cleanup,
    followed by the overall system memory usage.
    """
    this_process = psutil.Process(os.getpid())
    bytes_per_mb = 1024 ** 2

    # Resident set size before anything is released
    rss_before_mb = this_process.memory_info().rss / bytes_per_mb

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Resident set size once the collector and the CUDA cache have been flushed
    rss_after_mb = this_process.memory_info().rss / bytes_per_mb
    print(f"Memory cleaned up. Before: {rss_before_mb:.2f} MB, After: {rss_after_mb:.2f} MB, Freed: {rss_before_mb - rss_after_mb:.2f} MB")

    # Machine-wide view, independent of this process
    system_memory = psutil.virtual_memory()
    print(f"System memory: {system_memory.percent}% used, {system_memory.available / 1024 / 1024:.2f} MB available")


In [None]:
# Select the compute device. `torch` is already imported in the imports cell,
# so it is not re-imported here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Using CPU - Note: Training will be much slower on CPU")

# Set random seeds for reproducibility (Python, NumPy, PyTorch CPU and CUDA)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)


## Dataset and Model Configuration

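The cell below defines the dataset, the model, and the training hyperparameters. The values shown are the full-training settings; for a CPU-only or quick debugging run, lower `BATCH_SIZE`, `NUM_EPOCHS` and `MAX_LENGTH`, or set `DEBUG_MODE = True` to train on a small sample: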

In [None]:
# Dataset configuration
DATASET_ID = "mvasiliniuc/iva-swift-codeint"

# Model configuration
MODEL_NAME = "Ct1tz/Codebert-Base-B2D4G5"
# NOTE(review): self-attention memory grows quadratically with sequence length,
# so 10000-token inputs at this batch size are likely to exhaust memory on most
# hardware -- confirm this value is intended before a full run.
MAX_LENGTH = 10000  # Maximum tokens per example (the usual limit for this model family is 512)
BATCH_SIZE = 16  # Per-device batch size; lower it (e.g. 4) for CPU training
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
NUM_EPOCHS = 5  # Lower it (e.g. 1) for a quick debugging run
WARMUP_STEPS = 500  # Lower it (e.g. 100) for a quick debugging run
GRADIENT_ACCUMULATION_STEPS = 4  # Micro-batches accumulated before each optimizer step

# Debug switch: when True, only DEBUG_SAMPLE_SIZE training examples are loaded
DEBUG_MODE = False  # Set to True for a fast smoke test on a small sample
DEBUG_SAMPLE_SIZE = 500  # Number of examples to use in debug mode

# Report which configuration is actually active instead of always claiming debug mode
if DEBUG_MODE:
    print(f"DEBUG MODE: using a {DEBUG_SAMPLE_SIZE}-example sample of the dataset.")
else:
    print("Using full training configuration.")

In [None]:
# Function to load dataset with retry logic
def load_dataset_with_retry(dataset_id, max_retries=3, retry_delay=5):
    """Load a dataset, retrying after a delay when an attempt fails.

    Args:
        dataset_id (str): Identifier passed to `load_dataset`.
        max_retries (int): Total number of attempts; must be at least 1.
        retry_delay (int | float): Seconds to sleep between failed attempts.

    Returns:
        The object returned by `load_dataset`.

    Raises:
        ValueError: If `max_retries` is smaller than 1 (previously the loop
            was skipped and None was returned silently).
        Exception: The error from the final attempt, re-raised unchanged.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    for attempt in range(1, max_retries + 1):
        try:
            print(f"Loading dataset (attempt {attempt}/{max_retries})...")
            # NOTE(review): trust_remote_code=True lets the dataset repository's
            # loading script run locally -- only use with sources you trust.
            data = load_dataset(dataset_id, trust_remote_code=True)
        except Exception as e:
            print(f"Error loading dataset (attempt {attempt}/{max_retries}): {e}")
            if attempt == max_retries:
                print("Maximum retries reached. Could not load dataset.")
                raise
            print(f"Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
        else:
            # Reported outside the try block so that a dataset without a 'train'
            # split is not mistaken for a failed download and retried.
            if 'train' in data:
                print(f"Dataset loaded successfully with {len(data['train'])} examples")
            else:
                print("Dataset loaded successfully (no 'train' split present)")
            return data

# Fetch the dataset and, in debug mode, shrink the training split
try:
    print(f"Loading dataset: {DATASET_ID}")
    data = load_dataset_with_retry(DATASET_ID)
    print("Dataset structure:")
    print(data)

    if DEBUG_MODE and 'train' in data:
        print(f"DEBUG MODE: Sampling {DEBUG_SAMPLE_SIZE} examples from dataset")
        # Plain random subset: shuffle with a fixed seed, then keep the first
        # rows (capped at the split's size). This is not a stratified sample.
        debug_rows = range(min(DEBUG_SAMPLE_SIZE, len(data['train'])))
        data['train'] = data['train'].shuffle(seed=42).select(debug_rows)
        print(f"Reduced dataset size: {len(data['train'])} examples")

except Exception as load_error:
    # Surface the failure in the cell output, then let it propagate
    print(f"Fatal error loading dataset: {load_error}")
    raise


In [None]:
# Verify dataset structure and column names
def verify_dataset_structure(dataset):
    """Check that `dataset` has a 'train' split with the columns this notebook reads.

    Args:
        dataset: Mapping from split name to a split object exposing `column_names`.

    Returns:
        bool: True when the 'train' split exists and contains 'repo_name',
        'path' and 'content'; False otherwise (a warning is printed).
    """
    if 'train' not in dataset:
        print("WARNING: Dataset does not have a 'train' split.")
        return False

    available_columns = dataset['train'].column_names
    missing_columns = []
    for expected in ('repo_name', 'path', 'content'):
        if expected not in available_columns:
            missing_columns.append(expected)

    if len(missing_columns) > 0:
        print(f"WARNING: Dataset is missing required columns: {missing_columns}")
        return False

    print("Dataset structure verification passed.")
    return True

# Verify dataset structure.
# A failed check only prints a warning here; execution continues, so later
# cells that read the 'train' split or its columns may still fail.
dataset_valid = verify_dataset_structure(data)
if not dataset_valid:
    print("Dataset structure is not as expected. Proceeding with caution.")


In [None]:
# Load the tokenizer published with MODEL_NAME, allowing inputs up to MAX_LENGTH tokens
try:
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME,
        model_max_length=MAX_LENGTH,
    )
    print(
        f"Tokenizer vocabulary size: {len(tokenizer)}",
        f"Tokenizer type: {tokenizer.__class__.__name__}",
        sep="\n",
    )
except Exception as tokenizer_error:
    # Report the failure in the cell output, then re-raise it unchanged
    print(f"Error loading tokenizer: {tokenizer_error}")
    raise

In [None]:
def extract_file_type(path):
    """
    Map a Swift project file path to a coarse category by keyword matching.

    Matching is a case-insensitive substring test over the whole path. Rules
    are tried in the order Models (0), Views (1), Controllers (2),
    Utilities (3), Tests (4), Configuration (5), and the first match wins.

    NOTE(review): because of that order, a path that contains an earlier
    keyword is never labelled as a test or configuration file (for example
    'LoginViewTests.swift' is labelled 1), and short keywords match inside
    longer words ('ui' matches 'build'). Confirm this priority is intended.

    Args:
        path (str): The file path

    Returns:
        int: The category label (0-5); 3 (Utilities) when no keyword matches
    """
    lowered = path.lower()

    def mentions(*keywords):
        """Return True when any keyword occurs in the lower-cased path."""
        return any(keyword in lowered for keyword in keywords)

    # Category 0: Models - data structures and model definitions
    # ('data' only counts when 'class' is present as well)
    if mentions('model', 'struct', 'entity') or ('data' in lowered and 'class' in lowered):
        return 0

    # Category 1: Views - UI related files
    if mentions('view', 'ui', 'screen', 'page'):
        return 1

    # Category 2: Controllers - application logic
    if mentions('controller', 'manager', 'coordinator', 'service'):
        return 2

    # Category 3: Utilities - helper functions and extensions
    if mentions('util', 'helper', 'extension', 'common'):
        return 3

    # Category 4: Tests - test files
    if mentions('test', 'spec', 'mock'):
        return 4

    # Category 5: Configuration - package and configuration files
    if mentions('package.swift', 'config', 'settings', 'info.plist'):
        return 5

    # Nothing matched: fall back to Utilities
    return 3

# Apply the function to create labels
try:
    # Add a 'label' column derived from each file's path. `Dataset.map` merges
    # the returned dict into the example, so only the new column is returned.
    labeled_data = data['train'].map(
        lambda example: {'label': extract_file_type(example['path'])}
    )

    # Count the distribution of labels
    label_counts = collections.Counter(labeled_data['label'])
    if not label_counts:
        raise ValueError("No examples were labeled: the training split is empty.")

    # Define category names for better readability
    category_names = {
        0: "Models",
        1: "Views",
        2: "Controllers",
        3: "Utilities",
        4: "Tests",
        5: "Configuration"
    }

    print("Label distribution:")
    for label, count in sorted(label_counts.items()):
        category_name = category_names.get(label, f"Unknown-{label}")
        print(f"Label {label} ({category_name}): {count} examples ({count/len(labeled_data)*100:.2f}%)")

    # Get unique labels
    unique_labels = sorted(label_counts.keys())

    # Label ids are used directly as class indices by the classifier, so the
    # head must have (largest id + 1) outputs. Counting the distinct labels is
    # too small whenever a category has no examples (e.g. in a debug sample),
    # which would leave the higher ids out of range.
    num_labels = max(unique_labels) + 1

    print(f"\nTotal unique labels: {len(unique_labels)}")
    if len(unique_labels) != num_labels:
        print(f"Some categories have no examples; the classifier will use {num_labels} output classes.")
except Exception as e:
    print(f"Error in data preparation: {e}")
    raise


In [None]:
# Partition the labelled examples into train (80%), validation (10%) and test (rest)
try:
    # A fixed seed keeps the partition identical between runs
    shuffled_data = labeled_data.shuffle(seed=42)

    train_size = int(0.8 * len(shuffled_data))
    val_size = int(0.1 * len(shuffled_data))
    val_end = train_size + val_size

    # Consecutive slices of the shuffled data; the test split takes the remainder
    train_data = shuffled_data.select(range(train_size))
    val_data = shuffled_data.select(range(train_size, val_end))
    test_data = shuffled_data.select(range(val_end, len(shuffled_data)))

    for split_name, split in (("Training", train_data), ("Validation", val_data), ("Test", test_data)):
        print(f"{split_name} set size: {len(split)}")
        print(f"{split_name} set label distribution: {collections.Counter(split['label'])}")
except Exception as split_error:
    print(f"Error splitting data: {split_error}")
    raise


In [None]:
# Tokenize the data
def tokenize_function(examples):
    """Tokenize a batch of source files, truncating each to MAX_LENGTH tokens.

    No padding is applied here. Padding every example to MAX_LENGTH would
    store and process mostly pad tokens; instead the `DataCollatorWithPadding`
    given to the trainer pads each batch to its own longest example.
    `return_tensors` is omitted as well, since `Dataset.map` stores plain
    lists and tensors of unequal length could not be stacked anyway.

    Args:
        examples: Batch from `Dataset.map(..., batched=True)`; the 'content'
            column holds the source code strings.

    Returns:
        The tokenizer's output for the batch (one variable-length entry per
        example).
    """
    return tokenizer(
        examples['content'],
        truncation=True,
        max_length=MAX_LENGTH,
    )

try:
    # Tokenize every split the same way, dropping the raw text columns
    tokenized_train_data, tokenized_val_data, tokenized_test_data = (
        split.map(
            tokenize_function,
            batched=True,
            remove_columns=['repo_name', 'path', 'content'],
        )
        for split in (train_data, val_data, test_data)
    )

    print(f"Tokenized {len(tokenized_train_data)} training examples")
    print(f"Tokenized {len(tokenized_val_data)} validation examples")
    print(f"Tokenized {len(tokenized_test_data)} test examples")

    # Have each split return PyTorch tensors when indexed
    for tokenized_split in (tokenized_train_data, tokenized_val_data, tokenized_test_data):
        tokenized_split.set_format("torch")

    print("Data tokenization complete")
except Exception as tokenize_error:
    print(f"Error tokenizing data: {tokenize_error}")
    raise


In [None]:
def _extend_position_embeddings(model, max_tokens):
    """Grow a RoBERTa model's position-embedding table so `max_tokens` tokens fit.

    RoBERTa numbers positions starting at `padding_idx + 1`, so a sequence of
    `max_tokens` tokens needs `max_tokens + padding_idx + 1` rows (this is why
    the stock 512-token model has 514 rows). Pretrained rows are copied over,
    the new rows are Xavier-initialised, and the table's `padding_idx` is kept.
    The `position_ids` / `token_type_ids` buffers are resized too when the
    embeddings module has them, because the model slices them to the input
    length when those inputs are not supplied.

    Args:
        model: Sequence-classification model exposing `model.roberta.embeddings`.
        max_tokens (int): Longest tokenized sequence the model must accept.

    Returns:
        bool: True if the table was enlarged, False if it was already large enough.

    Raises:
        ValueError: If the model has no `roberta` backbone and its configured
            limit is smaller than `max_tokens`.
    """
    backbone = getattr(model, "roberta", None)
    if backbone is None:
        configured_limit = getattr(model.config, "max_position_embeddings", None)
        if configured_limit is not None and max_tokens > configured_limit:
            raise ValueError(
                f"Cannot extend position embeddings: {model.__class__.__name__} has no "
                f"'roberta' backbone and supports only {configured_limit} positions."
            )
        return False

    embeddings = backbone.embeddings
    old_table = embeddings.position_embeddings
    old_rows, hidden_size = old_table.weight.shape

    padding_idx = old_table.padding_idx
    position_offset = 0 if padding_idx is None else padding_idx + 1
    required_rows = max_tokens + position_offset
    if required_rows <= old_rows:
        return False

    table_device = old_table.weight.device
    new_table = torch.nn.Embedding(required_rows, hidden_size, padding_idx=padding_idx)
    new_table.to(device=table_device, dtype=old_table.weight.dtype)
    # Build the weights without autograd tracking so the new parameter is a
    # clean leaf tensor.
    with torch.no_grad():
        torch.nn.init.xavier_uniform_(new_table.weight[old_rows:])
        new_table.weight[:old_rows].copy_(old_table.weight)

    embeddings.position_embeddings = new_table
    model.config.max_position_embeddings = required_rows

    # Resize only the buffers this transformers version actually registers.
    existing_buffers = dict(embeddings.named_buffers(recurse=False))
    resized_buffers = {
        "position_ids": torch.arange(required_rows, device=table_device).expand((1, -1)),
        "token_type_ids": torch.zeros((1, required_rows), dtype=torch.long, device=table_device),
    }
    for buffer_name, buffer_value in resized_buffers.items():
        if buffer_name in existing_buffers:
            # Non-persistent: these are derived from the config, not learned.
            embeddings.register_buffer(buffer_name, buffer_value, persistent=False)
    return True


try:
    # Load the model with the correct number of labels
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
        problem_type="single_label_classification"
    )

    # Extend position embeddings to support longer sequences. This is done
    # before moving the model so every tensor lands on the same device.
    original_max_pos = model.config.max_position_embeddings
    if _extend_position_embeddings(model, MAX_LENGTH):
        print(f"Extended position embeddings from {original_max_pos} to {model.config.max_position_embeddings}")
        print("Successfully extended position embeddings")

    # Move model (including the resized table and buffers) to the appropriate device
    model.to(device)

    print(f"Model loaded with {num_labels} output classes")
    print(f"Model type: {model.__class__.__name__}")

    # Print model size
    model_size = sum(p.numel() for p in model.parameters())
    print(f"Model has {model_size:,} parameters")

    # Check memory usage
    process = psutil.Process(os.getpid())
    memory_info = process.memory_info()
    print(f"Current memory usage: {memory_info.rss / 1024 / 1024:.2f} MB")
except Exception as e:
    print(f"Error loading model: {e}")
    raise

In [None]:
# Compute class weights to handle imbalanced data
try:
    # Extract labels for computing class weights
    labels = train_data['label']
    label_array = np.asarray(list(labels))

    present_classes = np.unique(label_array)
    if present_classes.size == 0:
        raise ValueError("Training split is empty; cannot compute class weights.")
    if present_classes.min() < 0 or present_classes.max() >= num_labels:
        raise ValueError(
            f"Training labels {present_classes.tolist()} fall outside the "
            f"{num_labels} classes the model was created with."
        )

    # Balanced weights exist only for classes that occur in the training split
    present_weights = compute_class_weight('balanced', classes=present_classes, y=label_array)

    # Place each weight at its label id so that index i always refers to class i,
    # even when some class has no training examples. Absent classes keep a
    # neutral weight of 1.0; they never appear as targets during training.
    weight_per_class = np.ones(num_labels, dtype=np.float64)
    weight_per_class[present_classes] = present_weights
    class_weights = torch.tensor(weight_per_class, dtype=torch.float).to(device)

    print("Class weights:")
    for class_id, weight in enumerate(class_weights.tolist()):
        category_name = category_names.get(class_id, f"Unknown-{class_id}")
        print(f"Class {class_id} ({category_name}): {weight:.4f}")
except Exception as e:
    print(f"Error computing class weights: {e}")
    raise


In [None]:
# Define a custom trainer with weighted loss and progress tracking
class DebugTrainer(Trainer):
    """Trainer variant with a class-weighted loss and periodic memory reports.

    The loss reads the module-level `class_weights` tensor.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Number of compute_loss calls so far, and when a progress line was last printed
        self.step_counter = 0
        self.last_log_time = time.time()

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the class-weighted cross-entropy loss for one batch.

        Removes "labels" from `inputs`, runs the model on what remains, and
        weights the cross-entropy by `class_weights`. Extra keyword arguments
        are accepted and ignored.

        Returns:
            The loss, or a `(loss, outputs)` tuple when `return_outputs` is true.
        """
        self.step_counter += 1
        now = time.time()
        seconds_since_log = now - self.last_log_time

        # Report every 1000 calls, or sooner once more than 30 seconds have passed
        if self.step_counter % 1000 == 0 or seconds_since_log > 30:
            rss_mb = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
            print(f"Step {self.step_counter}: Memory usage: {rss_mb:.2f} MB, Time since last log: {seconds_since_log:.2f}s")
            self.last_log_time = now

        # The labels are taken out so the model does not compute its own loss
        labels = inputs.pop("labels")
        outputs = model(**inputs)

        loss = torch.nn.functional.cross_entropy(
            outputs.logits.view(-1, self.model.config.num_labels),
            labels.view(-1),
            weight=class_weights,
        )

        if return_outputs:
            return (loss, outputs)
        return loss

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE*2,
    warmup_steps=WARMUP_STEPS,
    weight_decay=WEIGHT_DECAY,
    learning_rate=LEARNING_RATE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    eval_strategy="steps",  # Evaluate on a step schedule rather than once per epoch
    eval_steps=1000,  # Evaluate every 1000 steps
    save_strategy="steps",
    save_steps=1000,  # Save a checkpoint every 1000 steps
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs",
    logging_steps=1000,  # Log every 1000 steps
    save_total_limit=2,
    # fp16 mixed precision is only supported on CUDA devices; enabling it
    # unconditionally breaks the CPU fallback selected earlier in the notebook.
    fp16=torch.cuda.is_available(),
    report_to="none",
    disable_tqdm=False,  # Show progress bars
    dataloader_num_workers=2,  # Two worker processes for data loading
    # Pinned host memory only helps when batches are copied to a GPU
    dataloader_pin_memory=torch.cuda.is_available()
)

# Define early stopping callback: stop after 3 evaluations in a row that fail
# to improve the tracked metric by more than 0.01
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.01
)

# Define compute_metrics function for evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Evaluation output exposing `label_ids` (true labels) and
            `predictions` (class scores, or a tuple whose first item is the
            class scores).

    Returns:
        dict: 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    scores = pred.predictions
    # Some models return extra outputs alongside the scores; keep the scores only
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = scores.argmax(-1)
    # zero_division=0 keeps the previous numeric result (0 for a class that is
    # never predicted) without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Collator that pads each batch up to the length of its longest example
data_collator = DataCollatorWithPadding(padding='longest', tokenizer=tokenizer)

# Assemble the trainer from the model, arguments, datasets and callbacks defined above
trainer = DebugTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
    callbacks=[early_stopping_callback],
    compute_metrics=compute_metrics,
)

print("Training setup complete")


In [None]:
# Function to monitor system resources during training
def monitor_resources():
    """Print a snapshot of CPU load, this process's memory, and system memory."""
    process_rss_mb = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
    system_memory = psutil.virtual_memory()
    # Sampled over a 0.1 second window
    cpu_load = psutil.cpu_percent(interval=0.1)

    print("\nSystem Resources:")
    print(f"CPU Usage: {cpu_load}%")
    print(f"Process Memory: {process_rss_mb:.2f} MB")
    print(f"System Memory: {system_memory.percent}% used, {system_memory.available / 1024 / 1024:.2f} MB available\n")

# Train the model, reporting resource usage before and after the run
try:
    print("Starting training...")

    print("Resources before training:")
    monitor_resources()

    # Wall-clock start of the run (recorded only; no timeout is enforced)
    start_time = time.time()

    train_result = trainer.train()

    print("Resources after training:")
    monitor_resources()

    # Summary figures reported by the trainer
    print(f"Training completed in {train_result.metrics['train_runtime']:.2f} seconds")
    print(f"Training loss: {train_result.metrics['train_loss']:.4f}")

    # Persist the trained model
    trainer.save_model("./final_model")
    print("Model saved to ./final_model")

    # Release memory that is no longer needed
    cleanup_memory()

except Exception as training_error:
    print(f"Error during training: {training_error}")

    # Show the full stack trace to help locate the failure
    import traceback
    traceback.print_exc()

    print("Resources after error:")
    monitor_resources()

    raise
