# PDCLMBase Model Training Test - Faz-1


Bu notebook PDCLMBase modelinin Faz-1 pretraining işlemini test eder.

Hedefler:
1. Model import ve initialization
2. WikiText verisi ile 500 iterasyon training (artırıldı)
3. Loss tracking ve visualization
4. Validation loss tracking
5. Final loss < 0.5 hedefi
6. Convergence doğrulama

In [None]:
# Import libraries
import sys
import os
sys.path.append('..')

import torch
import torch.nn as nn
from torch.optim import AdamW
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# Import our model and utilities
from src.model import PDCLMBase, pretrain_step, create_batches
from src.utils import visualize_training_curve

print("‚úÖ Libraries imported successfully")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Optional dependency: Weights & Biases experiment tracking.
# `use_wandb` records whether the import succeeded so that later cells can
# skip every wandb call when the package is missing.
try:
    import wandb
except ImportError:
    use_wandb = False
    print("‚ö†Ô∏è WandB not available (optional)")
else:
    use_wandb = True
    print("‚úÖ WandB available")

In [None]:
# Load WikiText data
def split_train_val(text, val_chars=10_000, max_val_fraction=0.2):
    """Split *text* into a training prefix and a held-out validation suffix.

    The validation part is the last ``val_chars`` characters, capped at
    ``max_val_fraction`` of the text so that a short corpus is never used
    both for training and for validation. The two parts never overlap and
    always concatenate back to *text*.

    Args:
        text: Full corpus as a single string.
        val_chars: Desired number of trailing characters to hold out.
        max_val_fraction: Upper bound on the held-out share of the corpus.

    Returns:
        A ``(train_text, val_text)`` tuple. ``val_text`` is empty when the
        corpus is too short for the cap to yield at least one character.
    """
    val_size = min(val_chars, int(len(text) * max_val_fraction))
    split_at = len(text) - val_size
    return text[:split_at], text[split_at:]


data_path = "../data/raw/wikitext_sample.txt"
print(f"üìñ Loading data from: {data_path}")

if os.path.exists(data_path):
    with open(data_path, 'r', encoding='utf-8') as f:
        raw_text = f.read()
    print(f"‚úÖ Data loaded successfully")
    print(f"üìè Text length: {len(raw_text):,} characters")
    print(f"üìù Sample: {raw_text[:100]}...")
else:
    print(f"‚ùå Data file not found: {data_path}")
    # Fall back to a repeated sentence so the rest of the notebook can still run
    raw_text = "This is a sample text for testing the PDCLM model. " * 1000
    print(f"üìù Using synthetic data: {len(raw_text):,} characters")

# Hold out the tail of the corpus for validation (10k characters, or at most
# 20% of the corpus when it is shorter than 50k characters). The previous
# slicing reused the training text as validation text for corpora of 10k
# characters or fewer, which made the validation loss meaningless.
train_text, val_text = split_train_val(raw_text)

print(f"üìä Data split:")
print(f"  - Train: {len(train_text):,} characters")
print(f"  - Validation: {len(val_text):,} characters")

In [None]:
# Build the PDCLMBase model, place it on the best available device and,
# when WandB is importable, open a run and record the configuration.
print("ü§ñ Initializing PDCLMBase model...")

# Architecture hyperparameters (kept as individual names as well as a dict
# so the same values drive the printout, the constructor and the WandB config)
embed_dim, num_layers, heads, window_size = 256, 4, 4, 512
model_kwargs = {
    "embed_dim": embed_dim,
    "num_layers": num_layers,
    "heads": heads,
    "window_size": window_size,
}

print(f"Configuration:")
for option_name, option_value in model_kwargs.items():
    print(f"  - {option_name}: {option_value}")

# Instantiate the model with the keyword arguments listed above
model = PDCLMBase(**model_kwargs)

# Prefer CUDA when torch reports it as available, otherwise stay on CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model = model.to(device)

print(f"‚úÖ Model created and moved to {device}")
print(f"üìä Model parameters: {model.count_parameters():,}")
print(f"üìã Model info: {model.get_model_info()}")

# Start the WandB run; the training values are literals here because the
# corresponding variables are only defined in the later training cells.
if use_wandb:
    wandb.init(project="pdclm", name="faz1-pretraining")
    run_config = {
        **model_kwargs,
        "learning_rate": 1e-4,
        "batch_size": 10000,
        "num_iterations": 500,
    }
    wandb.config.update(run_config)

In [None]:
# Create the AdamW optimizer over all model parameters and echo its settings.
learning_rate = 1e-4
optimizer = AdamW(model.parameters(), lr=learning_rate)

optimizer_report = (
    f"üîß Optimizer configured:",
    f"  - Learning rate: {learning_rate}",
    f"  - Optimizer: AdamW",
    f"  - Device: {device}",
)
for report_line in optimizer_report:
    print(report_line)

In [None]:
# Hyperparameters of the training run
batch_size = 10000      # number of characters handed to the model per step
num_iterations = 500    # total optimisation steps (raised from 100)
log_interval = 50       # print progress every N steps (raised from 20)
val_interval = 50       # evaluate on the validation batch every N steps

config_report = (
    f"üéØ Training configuration:",
    f"  - Batch size: {batch_size:,} characters",
    f"  - Iterations: {num_iterations}",
    f"  - Log interval: {log_interval}",
    f"  - Validation interval: {val_interval}",
)
for report_line in config_report:
    print(report_line)

# Materialise every training batch up front so the loop can index into them
print(f"üì¶ Creating batches...")
train_batches = [*create_batches(train_text, batch_size=batch_size)]
print(f"‚úÖ Created {len(train_batches)} training batches")

# A single validation batch: the leading `batch_size` characters of val_text
val_batch = val_text[0:batch_size]
print(f"‚úÖ Created validation batch ({len(val_batch)} chars)")

In [None]:
# Training loop
#
# Runs `num_iterations` optimisation steps, cycling through `train_batches`,
# evaluating on `val_batch` every `val_interval` steps and printing progress
# every `log_interval` steps. The loop stops early on an exception or on a
# NaN training loss; the summary at the end reports which of these happened.
import traceback

print("üöÄ Starting Faz-1 pretraining...")
losses = []      # training loss of every completed step
val_losses = []  # validation loss, one entry per validation step

# Fail with a clear message instead of a ZeroDivisionError in the batch index
if not train_batches:
    raise ValueError(
        "No training batches available - check train_text and batch_size"
    )

stop_reason = None  # stays None when every iteration ran to completion

model.train()
for iteration in range(num_iterations):
    # Select batch (cycle through available batches)
    batch_text = train_batches[iteration % len(train_batches)]

    # Training step
    try:
        loss = pretrain_step(model, batch_text, optimizer, device)
        losses.append(loss)

        # Validation
        if iteration % val_interval == 0:
            model.eval()
            with torch.no_grad():
                val_loss = model(val_batch)
                val_loss_value = val_loss.item()
                val_losses.append(val_loss_value)
            model.train()

            # WandB logging
            if use_wandb:
                wandb.log({
                    "train_loss": loss,
                    "val_loss": val_loss_value,
                    "iteration": iteration
                })

        # Enhanced logging
        if iteration % log_interval == 0:
            # Fall back to the training loss while no validation value exists
            current_val_loss = val_losses[-1] if val_losses else loss
            print(f"Iteration {iteration:3d}/{num_iterations} | Loss: {loss:.6f} | Val Loss: {current_val_loss:.6f}")

        # Check for NaN (the NaN value stays recorded in `losses`)
        if np.isnan(loss):
            print(f"‚ùå NaN loss detected at iteration {iteration}")
            stop_reason = f"NaN loss at iteration {iteration}"
            break

    except Exception as e:
        print(f"‚ùå Error at iteration {iteration}: {str(e)}")
        # Keep the full traceback visible; the message alone rarely locates the bug
        traceback.print_exc()
        stop_reason = f"error at iteration {iteration}"
        break

# Only claim completion when the loop really ran to the end
if stop_reason is None:
    print(f"\n‚úÖ Training completed!")
else:
    print(f"\nTraining stopped early: {stop_reason}")
print(f"üìä Total iterations: {len(losses)}")
if losses:
    print(f"üìà Final loss: {losses[-1]:.6f}")
    print(f"üìâ Best loss: {min(losses):.6f}")
else:
    # The first step already failed, so there is nothing to summarise
    print("No successful training steps; loss statistics unavailable")
if val_losses:
    print(f"üîç Final validation loss: {val_losses[-1]:.6f}")
    print(f"üîç Best validation loss: {min(val_losses):.6f}")

In [None]:
# Enhanced loss visualization
#
# Draws the training loss (top panel) with the 0.5 / 1.0 reference lines and,
# when validation values exist, the validation loss (bottom panel). The figure
# is saved to `pretrain_loss.png`, shown, and optionally uploaded to WandB.
print("üìä Creating loss plot...")

# Nothing to plot if training never completed a step
if not losses:
    raise RuntimeError(
        "No training losses recorded - run the training cell successfully before plotting"
    )

plt.figure(figsize=(14, 8))

# Plot training loss
plt.subplot(2, 1, 1)
plt.plot(losses, 'b-', linewidth=2, label='Training Loss')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Faz-1 Next-Pattern Prediction Loss')
plt.grid(True, alpha=0.3)

# Add loss statistics
final_loss = losses[-1]
min_loss = min(losses)
plt.axhline(y=0.5, color='g', linestyle='--', alpha=0.7, label='Target Loss (0.5)')
# NOTE(review): the label says "Minimum" but 1.0 is used elsewhere in this
# notebook as the upper acceptable bound - confirm the intended wording.
plt.axhline(y=1.0, color='r', linestyle='--', alpha=0.7, label='Minimum Loss (1.0)')
plt.text(0.02, 0.98, f'Final: {final_loss:.4f}\nMin: {min_loss:.4f}',
         transform=plt.gca().transAxes, verticalalignment='top',
         bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
# Build the legend only after the reference lines exist; calling it earlier
# left their labels out of the legend.
plt.legend()

# Plot validation loss
if val_losses:
    plt.subplot(2, 1, 2)
    # Validation ran at iterations 0, val_interval, 2*val_interval, ...
    val_iterations = list(range(0, len(losses), val_interval))[:len(val_losses)]
    plt.plot(val_iterations, val_losses, 'r-', linewidth=2, label='Validation Loss')
    plt.xlabel('Iteration')
    plt.ylabel('Validation Loss')
    plt.title('Validation Loss Curve')
    plt.grid(True, alpha=0.3)
    plt.legend()

    # Check for overfitting: a training loss far below the validation loss
    overfit_ratio = final_loss / val_losses[-1]
    print(f"üîç Overfit analysis:")
    print(f"  - Train/Val ratio: {overfit_ratio:.3f}")
    print(f"  - Potential overfit: {'Yes' if overfit_ratio < 0.7 else 'No'}")

plt.tight_layout()

# Save plot (before plt.show(), while the figure is still the current one)
plot_path = "pretrain_loss.png"
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
print(f"üíæ Loss plot saved: {plot_path}")

plt.show()

# WandB log plot if available
if use_wandb:
    wandb.log({"loss_plot": wandb.Image(plot_path)})

In [None]:
# Convergence analysis: compare the final training loss with the target,
# look for a plateau in the most recent steps, then evaluate the model once
# more on the full validation text.
print("üîç Convergence Analysis...")

final_loss, min_loss = losses[-1], min(losses)

# Converged = final loss under 0.5; plateau = more than 100 recorded losses
# and a standard deviation below 0.01 over the last 50 of them.
converged = final_loss < 0.5
plateau_detected = len(losses) > 100 and np.std(losses[-50:]) < 0.01

target_mark = '‚úÖ' if converged else '‚ùå'
plateau_mark = '‚úÖ' if plateau_detected else '‚ùå'

print(f"üìä Convergence Results:")
print(f"  - Final loss: {final_loss:.6f}")
print(f"  - Target loss (< 0.5): {target_mark}")
print(f"  - Plateau detected: {plateau_mark}")
reduction_pct = (losses[0] - final_loss) / losses[0] * 100
print(f"  - Loss reduction: {reduction_pct:.1f}%")

# Print tuning hints only when the loss is still above 1.0
if final_loss > 1.0:
    for hint_line in (
        f"‚ö†Ô∏è Optimization suggestions:",
        f"  - Increase learning rate to 5e-4",
        f"  - Check PSE output variance (scale=5.0)",
        f"  - Add data diversity",
    ):
        print(hint_line)

# Model validation
print("\nüîç Final model validation...")

# NOTE: test_text is built here but not used by this cell; the evaluation
# below runs on val_text.
test_text = "This is a comprehensive test string for validation purposes. " * 50
model.eval()

with torch.no_grad():
    # final_val_loss is kept as returned by the model (later cells call .item())
    final_val_loss = model(val_text)
    val_loss_number = final_val_loss.item()
    print(f"‚úÖ Final validation loss: {val_loss_number:.6f}")

    # Sanity checks: strictly between 0 and 100, and not NaN
    is_reasonable = 0 < val_loss_number < 100
    has_nan = torch.isnan(final_val_loss)

    print(f"üìä Validation results:")
    print(f"  - Loss value: {val_loss_number:.6f}")
    print(f"  - Has NaN: {has_nan.item()}")
    print(f"  - Is reasonable (0-100): {is_reasonable}")
    print(f"  - Target achieved (loss < 0.5): {val_loss_number < 0.5}")

In [None]:
# Training summary
#
# Prints the final report for the run, records the outcome in WandB when it
# is available, and recommends the next step. The cell is safe to re-run and
# tolerates an empty `losses` list.
print("\n" + "="*60)
print("üèÅ FAZ-1 TRAINING SUMMARY")
print("="*60)

final_loss = losses[-1] if losses else float('inf')
min_loss = min(losses) if losses else float('inf')
# float() accepts both the one-element tensor left by the validation cell and
# the plain float left by an earlier run of this cell; the previous .item()
# call failed with AttributeError on the second run.
final_val_loss = float(final_val_loss) if val_losses else final_loss
# Single source of truth for "Faz-1 succeeded" (NaN never compares below 0.7)
success = final_loss < 0.7 and not np.isnan(final_loss)

# Loss reduction needs a non-zero first loss to divide by
if losses and losses[0] != 0:
    loss_reduction = f"{((losses[0] - final_loss) / losses[0] * 100):.1f}%"
else:
    loss_reduction = "n/a"

print(f"üìä Training Results:")
print(f"  - Completed iterations: {len(losses)}/{num_iterations}")
print(f"  - Final training loss: {final_loss:.6f}")
print(f"  - Best training loss: {min_loss:.6f}")
print(f"  - Final validation loss: {final_val_loss:.6f}")
print(f"  - Loss reduction: {loss_reduction}")

print(f"\nüéØ Faz-1 Goals:")
print(f"  - Final loss < 0.7: {'‚úÖ' if final_loss < 0.7 else '‚ùå'}")
print(f"  - Final loss < 0.5: {'‚úÖ' if final_loss < 0.5 else '‚ùå'}")
print(f"  - No NaN values: {'‚úÖ' if not np.isnan(final_loss) else '‚ùå'}")
print(f"  - Training completed: {'‚úÖ' if len(losses) == num_iterations else '‚ùå'}")

print(f"\nüèÜ Overall Assessment:")

if success:
    print("‚úÖ FAZ-1 TAMAM! Cognitive Loop'a ge√ßilmeli.")
    print("   Model ba≈üarƒ±lƒ± ≈üekilde pattern prediction √∂ƒüreniyor.")
elif final_loss < 1.0:
    print("‚ö†Ô∏è Faz-1 kabul edilebilir ancak optimize edilebilir.")
    print("   Cognitive Loop'a ge√ßilebilir, loss monitoring ile.")
else:
    print("‚ùå Loss d√º≈üm√ºyor: PSE output variance artƒ±r (scale=5.0)")
    print("   Veya data √ße≈üitlendir, learning rate optimize et.")

print(f"\nüîß Model Configuration Saved:")
print(f"  - PSE performance: 0.28s / 50k char")
print(f"  - Model: PDCLMBase (4 layer, 256 dim)")
# Report the number of steps that actually ran instead of a fixed "500"
print(f"  - {len(losses)} step loss: {final_loss:.6f}")
print(f"  - Convergence: {'Evet' if success else 'Hayƒ±r'}")

# Save WandB run if available
if use_wandb:
    wandb.run.summary["final_loss"] = final_loss
    wandb.run.summary["convergence_achieved"] = success
    wandb.finish()

print(f"\nüìà Next Steps:")
if success:
    print(f"‚úÖ Proceed to Faz-2: Cognitive Loop Implementation")
else:
    print(f"üîß Optimize Phase-1: Adjust hyperparameters and retry")