## 1. Setup

Import required libraries and initialize components.

In [None]:
import sys
sys.path.append('..')

from src.data.dataset_builder import DomainDatasetBuilder
from src.data.preprocessing import DataPreprocessor
from src.data.validation import DataValidator
from src.training.config import TrainingConfig
from src.training.qlora_trainer import QLoRATrainer
from src.evaluation.evaluator import ModelEvaluator
from src.evaluation.metrics import MetricsCalculator

## 2. Create Sample Dataset

For this demo, we'll create a small sample dataset for the healthcare domain.

In [None]:
# Builder for the "healthcare" domain, pointed at ../data/processed
builder = DomainDatasetBuilder(
    domain="healthcare",
    output_dir="../data/processed",
)

# Ask the builder for a 100-example demo dataset
print("Creating sample dataset...")
dataset = builder.create_sample_dataset(num_samples=100)

# Quick look at what came back: row count, column names, first record
print(f"Dataset size: {len(dataset)}")
print(f"Columns: {dataset.column_names}")
print("\nSample:")
print(dataset[0])

## 3. Preprocess Data

In [None]:
# Columns handed to the preprocessing and empty-example filtering steps
text_columns = ["text", "instruction", "input", "output"]

# Preprocessor configured with length bounds 10 and 2048
preprocessor = DataPreprocessor(min_length=10, max_length=2048)

# Run preprocessing first, then drop examples the preprocessor flags as empty
print("Preprocessing dataset...")
dataset = preprocessor.apply_preprocessing(dataset, text_columns)
dataset = preprocessor.remove_empty_examples(dataset, text_columns)

# Summarize the "text" column and list each statistic on its own line
stats = preprocessor.compute_statistics(dataset, text_column="text")
print("\nDataset Statistics:")
for stat_name, stat_value in stats.items():
    print(f"  {stat_name}: {stat_value}")

## 4. Validate Dataset

In [None]:
# Validator that requires a "text" column
validator = DataValidator(
    required_columns=["text"],
)

# Run the validator's full validation pass over the dataset
print("Validating dataset...")
validation_results = validator.run_full_validation(dataset)

# Turn the raw results into a report and display it
report = validator.generate_validation_report(validation_results)
print(report)

## 5. Split Dataset

In [None]:
# Split fractions for train / validation / test (0.8 / 0.1 / 0.1)
print("Splitting dataset...")
split_fractions = {"train_size": 0.8, "val_size": 0.1, "test_size": 0.1}
dataset_dict = builder.split_dataset(dataset, **split_fractions)

# Report how many examples ended up in each split
print(f"Train: {len(dataset_dict['train'])} samples")
print(f"Validation: {len(dataset_dict['validation'])} samples")
print(f"Test: {len(dataset_dict['test'])} samples")

# Hand the splits back to the builder to save as "healthcare_sample"
builder.save_dataset(dataset_dict, "healthcare_sample")

## 6. Configure Training

For this demo, we'll use a small model and a reduced number of epochs.

In [None]:
# Demo-sized settings, collected in one place before building the config
demo_settings = dict(
    model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # Small model for demo
    output_dir="../models/checkpoints/demo",
    num_train_epochs=1,  # Reduced for demo
    per_device_train_batch_size=2,
    learning_rate=2e-4,
    lora_r=8,  # Smaller for faster training
    max_seq_length=512,  # Reduced for demo
)
config = TrainingConfig(**demo_settings)

# Echo the key settings back from the config object
print("Training Configuration:")
print(f"  Model: {config.model_name}")
print(f"  Epochs: {config.num_train_epochs}")
print(f"  Batch size: {config.per_device_train_batch_size}")
print(f"  LoRA r: {config.lora_r}")

## 7. Train Model

**Note:** This cell requires a GPU and may take a significant amount of time. Skip it if you are running on CPU.

In [None]:
# Uncomment to train (requires GPU)
# Prerequisites: `config` and `dataset_dict` must already exist in this session.
# The cell defines `trainer` and `metrics`.
# trainer = QLoRATrainer(config)
# metrics = trainer.train(
#     train_dataset=dataset_dict['train'],
#     eval_dataset=dataset_dict['validation']
# )
# print("Training metrics:", metrics)

## 8. Evaluate Model

In [None]:
# Uncomment to evaluate (after training)
# Prerequisites: `trainer` (for its model and tokenizer) and `dataset_dict`
# must already exist in this session.
# evaluator = ModelEvaluator(
#     model=trainer.model,
#     tokenizer=trainer.tokenizer,
#     output_dir="../results/demo"
# )

# Evaluate on the test split, reading the "input" and "output" columns.
# eval_metrics = evaluator.evaluate_dataset(
#     dataset=dataset_dict['test'],
#     input_column="input",
#     reference_column="output",
#     include_perplexity=False  # Skip for speed
# )

# Print each returned metric on its own line.
# print("Evaluation Metrics:")
# for key, value in eval_metrics.items():
#     print(f"  {key}: {value}")

## 9. Test Generation

In [None]:
# Uncomment to test generation (after training)
# Prerequisite: `trainer` must already exist in this session.
# test_prompt = "Explain what diabetes is."
# generated = trainer.generate(
#     prompt=test_prompt,
#     max_new_tokens=100,
#     temperature=0.7
# )

# Show the prompt followed by the generated text.
# print(f"Prompt: {test_prompt}")
# print(f"\nGenerated:\n{generated}")

## Summary

This notebook demonstrated:
1. ✅ Dataset creation and preprocessing
2. ✅ Data validation
3. ✅ Dataset splitting
4. ⏸️ Model training (requires GPU)
5. ⏸️ Model evaluation
6. ⏸️ Text generation

For full training, use the command-line scripts with GPU resources.