# Model Training and Comparison

In [17]:
import sys
import os
import torch
import wandb
import numpy as np
import pandas as pd
from datetime import datetime

sys.path.append('../')
from src.core.trainer import DialogTrainer
from src.config.settings import get_model_config, get_test_config, get_development_config
from src.data.loaders import get_dataset_manager

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
print(f"Using device: {device}")
if has_cuda:
    # Report the name and total memory of the GPU at index 0
    print(f"GPU: {torch.cuda.get_device_name()}")
    total_memory_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Memory: {total_memory_bytes / 1e9:.1f} GB")

Using device: cpu


In [18]:
# Authenticate with WandB using the WANDB_API_KEY environment variable
from dotenv import load_dotenv

load_dotenv()
wandb_api_key = os.getenv('WANDB_API_KEY')

if not wandb_api_key:
    # Key missing or empty: print the three reminder lines in a single call
    print("\n".join([
        "⚠ No WandB key found - need to fix the .env file",
        "Reminder: Add WANDB_API_KEY=your_actual_key to .env file",
        "Training will continue without WandB logging",
    ]))
else:
    wandb.login(key=wandb_api_key)
    print("✓ WandB connection established")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/khalil/.netrc
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/khalil/.netrc


✓ WandB connection established


In [19]:
# Open the parent WandB run for the model-comparison experiment.
# The run name carries a minute-resolution timestamp.
comparison_run_name = f"model-comparison-{datetime.now().strftime('%Y%m%d-%H%M')}"
main_run = wandb.init(
    project="dialog-model-training",
    name=comparison_run_name,
    tags=["comparison", "training", "notebook"],
    notes="Comparing multiple dialog model architectures: GPT-2, DialoGPT, and custom models",
)

print("✓ Main WandB experiment tracking started")
print(f"Run name: {main_run.name}\nProject: {main_run.project}")

✓ Main WandB experiment tracking started
Run name: model-comparison-20250722-1722
Project: dialog-model-training


In [None]:
# Load the dataset and the development training configuration
dataset_manager = get_dataset_manager()
df = dataset_manager.load_dataset()
training_config = get_development_config()

# Models to compare (only the uncommented entries are trained)
models_to_train = [
    #"gpt2-small",      # Pre-trained GPT-2
    "custom-small",    # Custom model from scratch
    #"dialogpt-small"   # Pre-trained DialoGPT
]

# Build the whole summary first so it is emitted by a single print call
output = [
    "Training Configuration:",
    f"- Epochs: {training_config.num_epochs}",
    f"- Batch size: {training_config.batch_size}",
    f"- Max samples: {training_config.max_samples}",
    f"- Learning rate: {training_config.learning_rate}",
    f"\nModels to train and compare: {models_to_train}",
    f"Dataset loaded: {len(df)} samples",
]

print("\n".join(output))

Loading dataset from local file: data/alpaca-gpt4.csv
Training Configuration:
- Epochs: 1
- Batch size: 2
- Max samples: 100
- Learning rate: 5e-05

Models to train and compare: ['custom-small']
Dataset loaded: 52002 samples
Training Configuration:
- Epochs: 1
- Batch size: 2
- Max samples: 100
- Learning rate: 5e-05

Models to train and compare: ['custom-small']
Dataset loaded: 52002 samples


## Training Loop with WandB Tracking

Train each model variant and log metrics to WandB for comparison.

In [21]:
def train_model_with_logging(model_name, df, training_config):
    """Train one model inside its own WandB run and return the trainer.

    A dedicated run is opened in the "dialog-model-training-individual"
    project, the model is trained through ``DialogTrainer``, and timing and
    size metrics are logged once training ends. The run is always closed:
    with exit code 0 when every step succeeded and with exit code 1 when any
    step raised, so a failed attempt is not recorded as a successful run.

    Args:
        model_name: Identifier passed to ``get_model_config`` and used in the
            run name, tags and config.
        df: Dataset handed to ``trainer.prepare_dataset``.
        training_config: Object exposing ``num_epochs``, ``batch_size``,
            ``learning_rate`` and ``max_samples``.

    Returns:
        The ``DialogTrainer`` instance that was trained.

    Raises:
        Exception: Anything raised while building, preparing or training the
            model propagates to the caller after the run has been closed.
    """

    # Initialize WandB run for this model (separate project to avoid conflicts)
    run = wandb.init(
        project="dialog-model-training-individual",
        name=f"training-{model_name}-{datetime.now().strftime('%Y%m%d-%H%M')}",
        tags=["training", model_name],
        config={
            "model": model_name,
            "epochs": training_config.num_epochs,
            "batch_size": training_config.batch_size,
            "learning_rate": training_config.learning_rate,
            "max_samples": training_config.max_samples
        }
    )

    # Assume failure until the last step has run; only the success path
    # flips this to 0, so every exception closes the run as failed.
    exit_code = 1

    try:
        # Get model config and initialize trainer
        model_config = get_model_config(model_name)
        trainer = DialogTrainer(model_config=model_config)

        print(f"\n{'='*50}")
        print(f"Training {model_name}")
        print(f"{'='*50}")

        # Prepare dataset
        train_dataset, eval_dataset = trainer.prepare_dataset(
            df,
            max_samples=training_config.max_samples
        )

        # Start training
        start_time = datetime.now()
        trainer.train(
            train_dataset,
            eval_dataset,
            num_epochs=training_config.num_epochs,
            batch_size=training_config.batch_size,
            learning_rate=training_config.learning_rate
        )
        end_time = datetime.now()

        training_time = (end_time - start_time).total_seconds()

        # Log final metrics
        wandb.log({
            "training_time_seconds": training_time,
            "training_time_minutes": training_time / 60,
            "model_parameters": sum(p.numel() for p in trainer.model.parameters()),
            "dataset_size": len(train_dataset),
            "eval_size": len(eval_dataset)
        })

        print(f"Training completed in {training_time/60:.1f} minutes")
        exit_code = 0
        return trainer

    finally:
        # Runs on success and on failure; the exit code tells WandB which it was
        wandb.finish(exit_code=exit_code)

# Store trained models for comparison
trained_models = {}

In [22]:
# Finish the comparison run before the per-model training runs are started
print("Closing initial WandB run...")
main_run.finish()

# Train each requested model; a failure is reported and the loop moves on
for model_name in models_to_train:
    print(f"\nStarting training for {model_name}...")
    try:
        trainer = train_model_with_logging(model_name, df, training_config)
        trained_models[model_name] = trainer
        print(f"SUCCESSFULLY trained {model_name}")
    except Exception as exc:
        print(f"FAILED to train {model_name}: {exc}")

trained_names = list(trained_models)
print("\nTraining Summary:")
print(f"Successfully trained: {trained_names}")
print(f"Total models trained: {len(trained_models)}")

# Open a fresh main run that will receive the generation-test results
print("\nInitializing main WandB run for generation testing...")
generation_run_config = {
    "dataset_size": len(df),
    "models_trained": trained_names,
    "num_models": len(trained_models),
}
main_run = wandb.init(
    project="dialog-model-training",
    name=f"generation-results-{datetime.now().strftime('%Y%m%d-%H%M')}",
    tags=["comparison", "generation", "notebook"],
    notes="Generation testing results for trained dialog models",
    config=generation_run_config,
)

# Record on the new main run that the training phase is over
training_summary = {
    "training_completed": True,
    "models_successfully_trained": len(trained_models),
    "training_completion_time": datetime.now().timestamp(),
}
wandb.log(training_summary)

Closing initial WandB run...



Starting training for custom-small...


Building custom model from scratch: custom-gpt-small
Architecture: 8 layers, 512 dim, 8 heads
Output directory: ./models/custom_small
Custom GPT model initialized:
  Parameters: 76,945,408
  Layers: 8
  Embedding dim: 512
  Attention heads: 8
Device: cpu
Model parameters: 76,945,408

Training custom-small
Using 100 samples for training
Training samples: 90
Evaluation samples: 10
Custom GPT model initialized:
  Parameters: 76,945,408
  Layers: 8
  Embedding dim: 512
  Attention heads: 8
Device: cpu
Model parameters: 76,945,408

Training custom-small
Using 100 samples for training
Training samples: 90
Evaluation samples: 10


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Starting training...




Step,Training Loss,Validation Loss


Training completed! Model saved to ./models/custom_small
Training completed in 0.8 minutes


0,1
dataset_size,▁
eval_size,▁
model_parameters,▁
train/epoch,▁
train/global_step,▁▁
training_time_minutes,▁
training_time_seconds,▁

0,1
dataset_size,90.0
eval_size,10.0
model_parameters,76945408.0
total_flos,6872362598400.0
train/epoch,1.0
train/global_step,45.0
train_loss,10.15426
train_runtime,50.1872
train_samples_per_second,1.793
train_steps_per_second,0.897


SUCCESSFULLY trained custom-small

Training Summary:
Successfully trained: ['custom-small']
Total models trained: 1

Initializing main WandB run for generation testing...


## Dialog Generation Testing

Test the trained models by generating responses to sample instructions.

In [23]:
# Test instructions for dialog generation
test_instructions = [
    "Explain what machine learning is in simple terms.",
    "How do I make a good cup of coffee?",
    "What are the benefits of exercise?",
    "Tell me about the solar system.",
    "How can I improve my communication skills?"
]

print("Dialog Generation Comparison")
print("=" * 60)

# Store all responses for WandB logging
all_responses = []
generation_metrics = {}

for i, instruction in enumerate(test_instructions, 1):
    print(f"\n[{i}/{len(test_instructions)}] Instruction: {instruction}")
    print("-" * 40)

    # One record per instruction; "responses" maps model name -> result or error
    instruction_responses = {"instruction": instruction, "responses": {}}

    for model_name, trainer in trained_models.items():
        try:
            start_time = datetime.now()
            response = trainer.generate_response(instruction, max_length=100)
            generation_time = (datetime.now() - start_time).total_seconds()

            print(f"\n{model_name}: {response}")

            # Measure once; both the per-response record and the totals use these
            response_length = len(response)
            word_count = len(response.split())

            # Store for logging
            instruction_responses["responses"][model_name] = {
                "text": response,
                "generation_time": generation_time,
                "length": response_length,
                "word_count": word_count
            }

            # Track metrics per model (totals are created on first success)
            model_totals = generation_metrics.setdefault(model_name, {
                "total_time": 0,
                "total_responses": 0,
                "total_length": 0,
                "total_words": 0
            })
            model_totals["total_time"] += generation_time
            model_totals["total_responses"] += 1
            model_totals["total_length"] += response_length
            model_totals["total_words"] += word_count

        except Exception as e:
            # Keep going with the other models, but record the failure
            print(f"\n{model_name}: Error - {e}")
            instruction_responses["responses"][model_name] = {
                "error": str(e),
                "generation_time": 0,
                "length": 0,
                "word_count": 0
            }

    all_responses.append(instruction_responses)
    print("=" * 60)

# Error records live next to real responses in "responses", so counting the
# dictionary sizes would report failed attempts as generated responses.
attempted_generations = sum(len(r["responses"]) for r in all_responses)
successful_generations = sum(
    1
    for r in all_responses
    for data in r["responses"].values()
    if "error" not in data
)
failed_generations = attempted_generations - successful_generations

print("Generation testing completed!")
print(f"Generated {successful_generations} total responses")
if failed_generations:
    print(f"{failed_generations} generation attempts failed")

Dialog Generation Comparison

[1/5] Instruction: Explain what machine learning is in simple terms.
----------------------------------------

custom-small: bill





.


 the.


.

.







 the
 options the




,
 the



 the


 task the
. Store,bill such Quan
 the


 helmet
,. the
, a,

xton.,

 the


.
,,
 Quan

[2/5] Instruction: How do I make a good cup of coffee?
----------------------------------------

custom-small: bill





.


 the.


.

.







 the
 options the




,
 the



 the


 task the
. Store,bill such Quan
 the


 helmet
,. the
, a,

xton.,

 the


.
,,
 Quan

[2/5] Instruction: How do I make a good cup of coffee?
----------------------------------------

custom-small: the.

,








,
 the





 the


bill





 reliable
 the Thirty thexton






 options925
 final and

 the such the


 Stanford.






.




,
,
,


 Columbia,.dm.

,
 the horse and
 horse. legislative

[3/5] Instruction: What are the benefits of exercise?
----------------------------------------


In [24]:
# Log generation metrics to main WandB run
print("Logging generation metrics to WandB...")

# Log average metrics per model
for model_name, metrics in generation_metrics.items():
    if metrics["total_responses"] > 0:  # Avoid division by zero
        avg_metrics = {
            f"{model_name}_avg_generation_time": metrics["total_time"] / metrics["total_responses"],
            f"{model_name}_avg_response_length": metrics["total_length"] / metrics["total_responses"],
            f"{model_name}_avg_response_words": metrics["total_words"] / metrics["total_responses"],
            f"{model_name}_total_generation_time": metrics["total_time"],
            f"{model_name}_responses_generated": metrics["total_responses"]
        }
        wandb.log(avg_metrics)

# Create a summary table for WandB
response_table = wandb.Table(columns=["Model", "Instruction", "Response", "Length", "Words", "Time (s)"])

# Tally outcomes while filling the table so the summary counts agree with it
successful_generations = 0
failed_generations = 0

for response_set in all_responses:
    instruction = response_set["instruction"]
    # Long instructions are cut to 50 characters plus an ellipsis
    short_instruction = (instruction[:50] + "...") if len(instruction) > 50 else instruction
    for model_name, response_data in response_set["responses"].items():
        if "error" in response_data:
            # Failed generations carry no text and are left out of the table
            failed_generations += 1
            continue
        successful_generations += 1
        text = response_data["text"]
        # Long responses are cut to 100 characters plus an ellipsis
        short_text = (text[:100] + "...") if len(text) > 100 else text
        response_table.add_data(
            model_name,
            short_instruction,
            short_text,
            response_data["length"],
            response_data["word_count"],
            round(response_data["generation_time"], 3)
        )

wandb.log({"generation_comparison": response_table})

# Log overall generation stats and final summary.
# "total_generations" counts only responses that were actually produced;
# failed attempts are reported separately instead of inflating the total.
final_summary = {
    "experiment_status": "completed",
    "models_trained": len(trained_models),
    "total_test_instructions": len(test_instructions),
    "total_generations": successful_generations,
    "failed_generations": failed_generations,
    "generation_test_completed": True
}

wandb.log(final_summary)

# Close WandB run
main_run.finish()

print("Experiment completed successfully.")
print(f"Trained {len(trained_models)} models and completed generation testing.")

Logging generation metrics to WandB...


0,1
custom-small_avg_generation_time,▁
custom-small_avg_response_length,▁
custom-small_avg_response_words,▁
custom-small_responses_generated,▁
custom-small_total_generation_time,▁
models_successfully_trained,▁
models_trained,▁
total_generations,▁
total_test_instructions,▁
training_completion_time,▁

0,1
custom-small_avg_generation_time,7.95595
custom-small_avg_response_length,165.2
custom-small_avg_response_words,26.2
custom-small_responses_generated,5
custom-small_total_generation_time,39.77974
experiment_status,completed
generation_test_completed,True
models_successfully_trained,1
models_trained,1
total_generations,5


Experiment completed successfully.
Trained 1 models and completed generation testing.
