In [1]:
import os
import yaml
import json
import logging
from pathlib import Path
from typing import Dict, Any, Optional
from dataclasses import dataclass, asdict
from datetime import datetime

In [2]:
@dataclass
class ModelConfig:
    """Architecture settings for the language model.

    ``ModelBuilder.build_model`` loads pretrained weights by ``name`` when it
    is one of the GPT-2 checkpoints it recognises. For any other ``name`` it
    builds a fresh model and applies the remaining fields to the model config.
    """
    # Model identifier; decides between the pretrained and the custom branch.
    name: str = "gpt2"
    # Copied to ``hidden_size`` of the custom model config.
    hidden_size: int = 768
    # Copied to ``num_hidden_layers`` of the custom model config.
    num_layers: int = 12
    # Copied to ``num_attention_heads`` of the custom model config.
    num_heads: int = 12
    # Copied to ``vocab_size`` of the custom model config.
    vocab_size: int = 50257
    # Copied to ``max_position_embeddings`` of the custom model config.
    max_length: int = 1024

In [3]:
@dataclass
class TrainingConfig:
    """Optimisation hyper-parameters.

    ``TrainingPipeline.run_training`` forwards every field to the keyword of
    ``transformers.TrainingArguments`` named in the comments below.
    """
    # Forwarded as ``per_device_train_batch_size``.
    batch_size: int = 8
    # Forwarded as ``learning_rate``.
    learning_rate: float = 5e-5
    # Forwarded as ``num_train_epochs``.
    num_epochs: int = 3
    # Forwarded as ``warmup_steps``.
    warmup_steps: int = 500
    # Forwarded as ``weight_decay``.
    weight_decay: float = 0.01
    # Forwarded as ``gradient_accumulation_steps``.
    gradient_accumulation_steps: int = 1
    # Forwarded as ``max_grad_norm``.
    max_grad_norm: float = 1.0

In [4]:
@dataclass
class DataConfig:
    """Dataset selection and preprocessing settings.

    Read by ``DatasetManager`` (loading, sample limit, tokenization workers)
    and by ``TrainingPipeline`` (validation split).
    """
    # First positional argument of ``datasets.load_dataset``.
    dataset_name: str = "wikitext"
    # Second positional argument of ``datasets.load_dataset``.
    dataset_config: str = "wikitext-2-raw-v1"
    # Upper bound on the number of examples kept; a falsy value (None or 0)
    # disables the limit.
    max_samples: Optional[int] = None
    # Passed as ``test_size`` to ``train_test_split`` when greater than zero;
    # otherwise no evaluation set is created.
    validation_split: float = 0.1
    # Passed as ``num_proc`` to the tokenization ``map`` call.
    preprocessing_num_workers: int = 4

In [5]:
@dataclass
class ExperimentConfig:
    """Top-level configuration bundling model, training and data settings.

    ``model``, ``training`` and ``data`` have no defaults and must be supplied
    by the caller; the remaining fields describe the run itself.
    """
    # Nested section objects (required).
    model: ModelConfig
    training: TrainingConfig
    data: DataConfig
    # Name of the run; also the sub-directory created under ``output_dir``.
    experiment_name: str = "genai_experiment"
    # Root directory under which the experiment directory is created.
    output_dir: str = "./outputs"
    # Seed applied to torch, numpy and ``random`` by ``TrainingPipeline``.
    seed: int = 42
    # Whether ``ExperimentTracker`` starts a Weights & Biases run.
    use_wandb: bool = True
    # Project name passed to ``wandb.init``.
    wandb_project: str = "genai-experiments"

In [6]:
class ConfigManager:
    """Save and load ``ExperimentConfig`` objects as YAML files.

    Every configuration is stored as ``<config_dir>/<name>.yaml``.
    """

    def __init__(self, config_dir: str = "./configs"):
        """Create the manager and make sure ``config_dir`` exists.

        Args:
            config_dir: Directory holding the YAML files. Missing parent
                directories are created too.
        """
        self.config_dir = Path(config_dir)
        # parents=True so a nested location (e.g. "runs/exp1/configs") works
        # on a fresh checkout instead of raising FileNotFoundError.
        self.config_dir.mkdir(parents=True, exist_ok=True)

    def save_config(self, config: "ExperimentConfig", name: str) -> None:
        """Write ``config`` to ``<config_dir>/<name>.yaml``.

        Args:
            config: Configuration to serialise (converted with ``asdict``).
            name: File stem of the YAML file; an existing file is overwritten.
        """
        config_path = self.config_dir / f"{name}.yaml"
        # Explicit encoding keeps the file identical across platforms.
        with open(config_path, 'w', encoding='utf-8') as f:
            yaml.dump(asdict(config), f, default_flow_style=False)
        print(f"Configuration saved to {config_path}")

    def load_config(self, name: str) -> "ExperimentConfig":
        """Load ``<config_dir>/<name>.yaml`` into an ``ExperimentConfig``.

        Args:
            name: File stem of the YAML file to read.

        Returns:
            The reconstructed configuration, with the ``model``, ``training``
            and ``data`` sections turned back into their dataclasses.

        Raises:
            FileNotFoundError: If the YAML file does not exist.
            TypeError: If the file is empty or its root is not a mapping, or
                if a section contains a key its dataclass does not accept.
            KeyError: If one of the three sections is missing.
        """
        config_path = self.config_dir / f"{name}.yaml"
        with open(config_path, 'r', encoding='utf-8') as f:
            config_dict = yaml.safe_load(f)

        # safe_load returns None for an empty file; fail with a readable
        # message instead of "'NoneType' object is not subscriptable".
        if not isinstance(config_dict, dict):
            raise TypeError(
                f"{config_path} must contain a YAML mapping, "
                f"got {type(config_dict).__name__}"
            )

        section_types = {
            'model': ModelConfig,
            'training': TrainingConfig,
            'data': DataConfig,
        }
        sections = {
            key: section_cls(**config_dict[key])
            for key, section_cls in section_types.items()
        }
        # Everything outside the three sections is a top-level field.
        top_level = {
            key: value
            for key, value in config_dict.items()
            if key not in section_types
        }
        return ExperimentConfig(**sections, **top_level)

    def create_default_configs(self) -> None:
        """Write the small/medium/large template configurations to disk."""
        configs = {
            'small_experiment': ExperimentConfig(
                model=ModelConfig(name="distilgpt2", hidden_size=768, num_layers=6),
                training=TrainingConfig(batch_size=4, num_epochs=1),
                data=DataConfig(max_samples=1000)
            ),
            'medium_experiment': ExperimentConfig(
                model=ModelConfig(name="gpt2", hidden_size=768, num_layers=12),
                training=TrainingConfig(batch_size=8, num_epochs=3),
                data=DataConfig(max_samples=10000)
            ),
            'large_experiment': ExperimentConfig(
                # gpt2-large uses 20 attention heads; the default of 12 does
                # not even divide the hidden size of 1280.
                model=ModelConfig(
                    name="gpt2-large", hidden_size=1280, num_layers=36, num_heads=20
                ),
                training=TrainingConfig(batch_size=2, num_epochs=5, gradient_accumulation_steps=4),
                data=DataConfig(max_samples=None)
            )
        }

        for name, config in configs.items():
            self.save_config(config, name)

In [7]:
class ExperimentTracker:
    """Track one experiment run on disk and, optionally, in Weights & Biases.

    Creating a tracker makes ``<output_dir>/<experiment_name>/``, points the
    root logger at ``experiment.log`` inside it, optionally starts a W&B run
    and writes ``metadata.json``.
    """

    def __init__(self, config: "ExperimentConfig"):
        """Set up the experiment directory, logging, W&B and metadata.

        Args:
            config: Experiment configuration (must be a dataclass instance,
                since it is serialised with ``asdict``).
        """
        self.config = config
        self.experiment_dir = Path(config.output_dir) / config.experiment_name
        self.experiment_dir.mkdir(parents=True, exist_ok=True)

        # The wandb module once a run was started successfully; stays None
        # when W&B is disabled, not installed, or failed to initialise.
        self._wandb = None

        # Setup logging
        self.setup_logging()

        # Setup experiment tracking
        if config.use_wandb:
            self.setup_wandb()

        # Save experiment metadata
        self.save_experiment_metadata()

    def setup_logging(self):
        """Send root-logger output to ``experiment.log`` and the console."""
        log_file = self.experiment_dir / "experiment.log"

        # logging.basicConfig() does nothing once the root logger has
        # handlers, so a second tracker in the same process (typical in a
        # notebook) would keep writing to the first experiment's log file.
        # Drop the existing root handlers first; this mirrors
        # basicConfig(force=True) from Python 3.8 without requiring it.
        root_logger = logging.getLogger()
        for handler in list(root_logger.handlers):
            root_logger.removeHandler(handler)
            handler.close()

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file, encoding='utf-8'),
                logging.StreamHandler()
            ]
        )

        self.logger = logging.getLogger(__name__)
        self.logger.info("Experiment started: %s", self.config.experiment_name)

    def setup_wandb(self):
        """Start a Weights & Biases run; failures are logged, never raised."""
        try:
            import wandb

            wandb.init(
                project=self.config.wandb_project,
                name=self.config.experiment_name,
                config=asdict(self.config),
                dir=str(self.experiment_dir)
            )
        except ImportError:
            self.logger.warning("wandb not installed, skipping W&B setup")
        except Exception as e:
            self.logger.error(f"Failed to initialize wandb: {e}")
        else:
            # Remember the module only after a successful init so that
            # log_metrics() never talks to a run that does not exist.
            self._wandb = wandb
            self.logger.info("Weights & Biases initialized")

    def save_experiment_metadata(self):
        """Write ``metadata.json`` (name, start time, config, environment)."""
        # Local import: the file only imports ``os`` at module level and
        # ``os.sys`` is an implementation detail, not a public attribute.
        import sys

        metadata = {
            'experiment_name': self.config.experiment_name,
            'start_time': datetime.now().isoformat(),
            'config': asdict(self.config),
            'python_version': f"{sys.version_info.major}.{sys.version_info.minor}",
            'working_directory': str(Path.cwd())
        }

        metadata_file = self.experiment_dir / "metadata.json"
        with open(metadata_file, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)

    def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None):
        """Log metrics to the experiment log and, if active, to W&B.

        Args:
            metrics: Metric name to value mapping.
            step: Training step the metrics belong to. A non-integer label is
                still written to the log file but is not forwarded to W&B as
                a step, because ``wandb.log`` only accepts integer steps.
        """
        # Log to file
        self.logger.info(f"Step {step}: {metrics}")

        # Log to wandb only when this tracker actually started a run.
        wandb = getattr(self, "_wandb", None)
        if wandb is None:
            return

        wandb_step = step if isinstance(step, int) else None
        try:
            wandb.log(metrics, step=wandb_step)
        except Exception as e:
            # Tracking is best-effort: report the problem, keep training.
            self.logger.warning(f"Failed to log metrics to wandb: {e}")

    def save_model_checkpoint(self, model, tokenizer=None, step: Optional[int] = None):
        """Save model (and tokenizer) plus a small metadata file.

        Args:
            model: Object exposing ``save_pretrained(directory)``.
            tokenizer: Optional object exposing ``save_pretrained(directory)``.
            step: Training step used in the directory name ``step_<step>``;
                ``None`` selects ``step_final``.
        """
        # ``step or 'final'`` would send step 0 to "step_final"; only a
        # missing step means "final".
        step_label = 'final' if step is None else step
        checkpoint_dir = self.experiment_dir / "checkpoints" / f"step_{step_label}"
        checkpoint_dir.mkdir(parents=True, exist_ok=True)

        # Save model
        model.save_pretrained(checkpoint_dir)

        # Save tokenizer if provided
        if tokenizer is not None:
            tokenizer.save_pretrained(checkpoint_dir)

        # Save checkpoint metadata
        checkpoint_metadata = {
            'step': step,
            'save_time': datetime.now().isoformat(),
            'model_config': asdict(self.config.model)
        }

        with open(checkpoint_dir / "checkpoint_metadata.json", 'w', encoding='utf-8') as f:
            json.dump(checkpoint_metadata, f, indent=2)

        self.logger.info(f"Checkpoint saved to {checkpoint_dir}")

In [8]:
class DatasetManager:
    """Load a Hugging Face dataset and tokenize it for causal LM training."""

    def __init__(self, config: "DataConfig", cache_dir: str = "./data/cache"):
        """Store the configuration and create the dataset cache directory.

        Args:
            config: Dataset settings (name, config, sample limit, workers).
            cache_dir: Directory handed to ``load_dataset`` as its cache;
                created, including parents, if it does not exist.
        """
        self.config = config
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def load_and_preprocess_dataset(
        self,
        split: str = "train",
        tokenizer_name: str = "gpt2",
        max_length: int = 512,
    ):
        """Load one split of the configured dataset and tokenize it.

        Args:
            split: Dataset split to load.
            tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``.
            max_length: Length every example is truncated/padded to.

        Returns:
            A ``(tokenized_dataset, tokenizer)`` tuple.
        """
        from datasets import load_dataset
        from transformers import AutoTokenizer

        # Load dataset
        print(f"Loading dataset: {self.config.dataset_name}")
        # Without ``split`` load_dataset returns a DatasetDict, on which the
        # ``select``/``len``/``column_names`` calls below (and the pipeline's
        # later ``train_test_split``) do not behave like on a single Dataset.
        dataset = load_dataset(
            self.config.dataset_name,
            self.config.dataset_config,
            split=split,
            cache_dir=str(self.cache_dir)
        )

        # Apply sample limit if specified (None or 0 means "no limit").
        if self.config.max_samples:
            sample_count = min(self.config.max_samples, len(dataset))
            dataset = dataset.select(range(sample_count))

        # Initialize tokenizer; reuse EOS for padding only when the tokenizer
        # has no pad token of its own (the case for GPT-2).
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Tokenization function
        def tokenize_function(examples):
            # No return_tensors here: ``map`` stores plain lists, so building
            # torch tensors in every worker would only add overhead.
            return tokenizer(
                examples["text"],
                truncation=True,
                padding="max_length",
                max_length=max_length,
            )

        # Apply tokenization
        print("Tokenizing dataset...")
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            num_proc=self.config.preprocessing_num_workers,
            remove_columns=dataset.column_names
        )

        return tokenized_dataset, tokenizer


In [9]:
class ModelBuilder:
    """Turn a ``ModelConfig`` into a causal language model."""

    def __init__(self, config: ModelConfig):
        self.config = config

    def build_model(self):
        """Return a pretrained GPT-2 variant or a freshly initialised model.

        Recognised GPT-2 checkpoint names are loaded with their weights. Any
        other name yields a model built from the "gpt2" base config with the
        sizes taken from ``self.config``. A parameter summary is printed.
        """
        from transformers import AutoConfig, AutoModelForCausalLM

        settings = self.config
        pretrained_names = ("gpt2", "distilgpt2", "gpt2-medium", "gpt2-large")

        if settings.name not in pretrained_names:
            # Custom architecture: start from the GPT-2 config and override
            # the sizes one by one.
            base_config = AutoConfig.from_pretrained("gpt2")
            overrides = (
                ("hidden_size", settings.hidden_size),
                ("num_hidden_layers", settings.num_layers),
                ("num_attention_heads", settings.num_heads),
                ("vocab_size", settings.vocab_size),
                ("max_position_embeddings", settings.max_length),
            )
            for attribute, value in overrides:
                setattr(base_config, attribute, value)
            model = AutoModelForCausalLM.from_config(base_config)
        else:
            # Known checkpoint: load the published weights.
            model = AutoModelForCausalLM.from_pretrained(settings.name)

        # Parameter summary
        parameters = list(model.parameters())
        total = sum(tensor.numel() for tensor in parameters)
        trainable = sum(
            tensor.numel() for tensor in parameters if tensor.requires_grad
        )

        print(f"Model: {settings.name}")
        print(f"Total parameters: {total:,}")
        print(f"Trainable parameters: {trainable:,}")

        return model

In [10]:
class TrainingPipeline:
    """End-to-end training run: data, model, Hugging Face Trainer, tracking."""

    def __init__(self, config: "ExperimentConfig"):
        """Create the experiment tracker and seed all random generators.

        Args:
            config: Complete experiment configuration.
        """
        self.config = config
        self.tracker = ExperimentTracker(config)

        # Setup reproducibility
        self.setup_reproducibility()

    def setup_reproducibility(self):
        """Seed torch, numpy and ``random`` and make cuDNN deterministic."""
        import torch
        import random
        import numpy as np

        torch.manual_seed(self.config.seed)
        torch.cuda.manual_seed_all(self.config.seed)
        np.random.seed(self.config.seed)
        random.seed(self.config.seed)

        # For deterministic behavior (may reduce performance)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    def _split_dataset(self, dataset):
        """Return ``(train, eval)``; ``eval`` is None when no split is configured."""
        validation_fraction = self.config.data.validation_split
        if validation_fraction > 0:
            # Seed the split explicitly so it does not depend on how much
            # global RNG state was consumed before this call.
            parts = dataset.train_test_split(
                test_size=validation_fraction, seed=self.config.seed
            )
            return parts["train"], parts["test"]
        return dataset, None

    def _training_kwargs(self, args_cls, has_eval: bool) -> Dict[str, Any]:
        """Build the keyword arguments for ``TrainingArguments``.

        Args:
            args_cls: The ``TrainingArguments`` class, inspected to pick the
                evaluation-strategy keyword this transformers version accepts.
            has_eval: Whether an evaluation dataset is available.
        """
        training = self.config.training
        kwargs = {
            "output_dir": str(self.tracker.experiment_dir / "training_output"),
            "num_train_epochs": training.num_epochs,
            "per_device_train_batch_size": training.batch_size,
            "gradient_accumulation_steps": training.gradient_accumulation_steps,
            "learning_rate": training.learning_rate,
            "weight_decay": training.weight_decay,
            "warmup_steps": training.warmup_steps,
            "max_grad_norm": training.max_grad_norm,
            "logging_steps": 100,
            "save_steps": 1000,
            "eval_steps": 500,
            "save_total_limit": 3,
            # Selecting the best checkpoint needs evaluation results.
            "load_best_model_at_end": has_eval,
            # ``None`` falls back to "all installed integrations"; the string
            # "none" is what actually disables reporting.
            "report_to": "wandb" if self.config.use_wandb else "none",
            "run_name": self.config.experiment_name,
            "seed": self.config.seed,
        }

        # ``evaluation_strategy`` was renamed to ``eval_strategy`` in newer
        # transformers releases; use whichever field this version declares.
        declared_fields = getattr(args_cls, "__dataclass_fields__", {})
        strategy_key = (
            "eval_strategy" if "eval_strategy" in declared_fields
            else "evaluation_strategy"
        )
        # Evaluating every N steps without an eval dataset makes Trainer fail.
        kwargs[strategy_key] = "steps" if has_eval else "no"
        return kwargs

    def run_training(self):
        """Run the complete training pipeline and return the ``Trainer``."""
        import inspect
        from transformers import Trainer, TrainingArguments
        from transformers import DataCollatorForLanguageModeling

        # Load data
        dataset_manager = DatasetManager(self.config.data)
        dataset, tokenizer = dataset_manager.load_and_preprocess_dataset()

        # Build model
        model_builder = ModelBuilder(self.config.model)
        model = model_builder.build_model()

        # Split first: the training arguments depend on whether an
        # evaluation set exists.
        train_dataset, eval_dataset = self._split_dataset(dataset)
        has_eval = eval_dataset is not None

        # Setup training arguments
        training_args = TrainingArguments(
            **self._training_kwargs(TrainingArguments, has_eval)
        )

        # Data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,  # We're doing causal LM, not masked LM
        )

        # Initialize trainer. Newer transformers releases take the tokenizer
        # as ``processing_class``; older ones only know ``tokenizer``.
        trainer_kwargs = {
            "model": model,
            "args": training_args,
            "train_dataset": train_dataset,
            "eval_dataset": eval_dataset,
            "data_collator": data_collator,
        }
        trainer_params = inspect.signature(Trainer.__init__).parameters
        tokenizer_key = (
            "processing_class" if "processing_class" in trainer_params
            else "tokenizer"
        )
        trainer_kwargs[tokenizer_key] = tokenizer
        trainer = Trainer(**trainer_kwargs)

        # Start training
        self.tracker.logger.info("Starting training...")
        trainer.train()

        # Save final model
        trainer.save_model()
        self.tracker.save_model_checkpoint(model, tokenizer, step="final")

        # Log final metrics
        if has_eval:
            eval_results = trainer.evaluate()
            # Pass the real integer step: a string label such as "final" is
            # not a valid step for the W&B backend.
            self.tracker.log_metrics(eval_results, step=trainer.state.global_step)

        self.tracker.logger.info("Training completed!")
        return trainer

In [11]:
def create_development_workflow():
    """Bootstrap the project: configs, folders, helper scripts and a notebook.

    The steps run in a fixed order and each one prints a confirmation line;
    a short "next steps" guide is printed at the end.
    """
    separator = "=" * 50
    print("🚀 Setting up GenAI Development Workflow")
    print(separator)

    # Step 1: write the default configuration templates.
    manager = ConfigManager()
    manager.create_default_configs()
    print("✅ Configuration templates created")

    # Step 2: lay out the project folders.
    project_dirs = (
        "data/raw",
        "data/processed",
        "data/cache",
        "models/checkpoints",
        "models/final",
        "outputs",
        "configs",
        "scripts",
        "notebooks",
    )
    for relative_path in project_dirs:
        Path(relative_path).mkdir(parents=True, exist_ok=True)
    print("✅ Project structure created")

    # Step 3: generate the utility scripts.
    create_utility_scripts()
    print("✅ Utility scripts created")

    # Step 4: generate the example notebook.
    create_example_notebook()
    print("✅ Example notebook created")

    # Closing summary, one print per line exactly as listed.
    closing_lines = (
        "\n🎉 Development workflow setup complete!",
        "\nNext steps:",
        "1. Review configurations in ./configs/",
        "2. Run example training: python scripts/train_example.py",
        "3. Open example notebook: jupyter lab notebooks/example_training.ipynb",
    )
    for line in closing_lines:
        print(line)