<a href="https://colab.research.google.com/github/yourusername/llama_dataset_optimizer/blob/main/test_colab_a100.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LLaMA Dataset Optimizer - A100 GPU Testing

This notebook tests the LLaMA Dataset Optimizer on Google Colab with A100 GPU.

**Requirements:**
- Runtime: GPU (A100 High-RAM recommended)
- Enable GPU acceleration in Runtime > Change runtime type

## 1. GPU Verification

In [None]:
import torch
import subprocess

# Report whether CUDA is usable and, if it is, describe device 0
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")

if not has_cuda:
    print("❌ No GPU available. Please enable GPU runtime.")
else:
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU count: {torch.cuda.device_count()}")

    # Look the device name up once; it is printed and then checked for "A100"
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU name: {gpu_name}")

    # total_memory is reported in bytes; convert to GB (1024**3) for display
    total_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU memory: {total_memory_gb:.1f} GB")

    # Any non-A100 device only produces a warning; execution continues
    if 'A100' not in gpu_name:
        print(f"⚠️  GPU is {gpu_name}, not A100")
    else:
        print("✅ A100 GPU detected!")

## 2. Repository Setup

In [None]:
# Clone the repository
!git clone https://github.com/yourusername/llama_dataset_optimizer.git
%cd llama_dataset_optimizer

In [None]:
# Install dependencies
!pip install -r requirements.txt

# Install flash-attention for A100 optimization (optional but recommended)
!pip install flash-attn --no-build-isolation

## 3. Basic Functionality Test

In [None]:
import sys

# Make the cloned repository importable. The membership check keeps sys.path
# free of duplicate entries when this cell is executed more than once.
REPO_ROOT = '/content/llama_dataset_optimizer'
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

from llama_dataset_optimizer import LlamaDatasetOptimizer
import yaml

print("✅ Successfully imported LlamaDatasetOptimizer")

In [None]:
# Load a configuration.
# The file is opened explicitly as UTF-8 so that decoding does not depend on
# the locale default of the runtime.
with open('configs/llama_3_2_instruct.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Echo a few settings as a sanity check that the file parsed as expected
print(f"Model: {config['model_family']}")
print(f"Quality filtering batch size: {config['batch_sizes']['quality_filtering']}")
print(f"Similarity threshold: {config['deduplication']['similarity_threshold']}")

## 4. Test with Sample Data

In [ ]:
# Build a small JSONL fixture for the optimizer smoke test
import json

sample_records = [
    {"instruction": "What is the capital of France?", "response": "The capital of France is Paris."},
    {"instruction": "Explain machine learning in simple terms.", "response": "Machine learning is a type of artificial intelligence where computers learn patterns from data to make predictions or decisions without being explicitly programmed for each task."},
    {"instruction": "What is 2+2?", "response": "2+2 equals 4."},
    # Repeats the first instruction with a reworded response, for the deduplication test
    {"instruction": "What is the capital of France?", "response": "Paris is the capital city of France."},
]

# JSONL layout: one JSON object per line
with open('sample_data.jsonl', 'w') as out_file:
    out_file.writelines(json.dumps(record) + '\n' for record in sample_records)

print("Created sample_data.jsonl with 4 examples")

In [ ]:
# Smoke-test the optimizer end to end on the small sample file
optimizer = LlamaDatasetOptimizer()

print("Starting optimization...")

# TinyLlama is used here to keep the test fast.
# NOTE(review): the config name below differs from the
# configs/llama_3_2_instruct.yaml file loaded earlier in the notebook —
# confirm which one is intended.
sample_run_options = dict(
    output_dir="output",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    config="llama_3_2_instruct_optimized",
    top_k=3,
    skip_deduplication=False,
)
optimized_data = optimizer.optimize("sample_data.jsonl", **sample_run_options)

print("✅ Optimization completed successfully!")

## 5. Performance Test

In [None]:
# Snapshot of host-process and GPU memory at this point in the notebook
import os
import psutil

process = psutil.Process(os.getpid())
memory_info = process.memory_info()

# All figures are byte counts; divide by 1024**2 to report them in MB
rss_mb = memory_info.rss / 1024**2
gpu_allocated_mb = torch.cuda.memory_allocated() / 1024**2
gpu_reserved_mb = torch.cuda.memory_reserved() / 1024**2

print(f"Memory usage: {rss_mb:.1f} MB")
print(f"GPU memory allocated: {gpu_allocated_mb:.1f} MB")
print(f"GPU memory cached: {gpu_reserved_mb:.1f} MB")

In [ ]:
# Speed test with larger dataset
import time
import json

# Build the benchmark records in memory before writing them out. The results
# summary cell reads len(large_sample), so the list must exist under that name.
print("Creating larger test dataset...")
large_sample = [
    {
        "instruction": f"Question {i}: What is {i} + {i}?",
        "response": f"The answer is {i*2}."
    }
    for i in range(100)
]
with open('large_sample_data.jsonl', 'w') as f:
    f.writelines(json.dumps(item) + '\n' for item in large_sample)

# perf_counter() is the clock intended for measuring elapsed intervals; the
# start_time / end_time names are kept because the summary cell reuses them.
start_time = time.perf_counter()
large_optimized = optimizer.optimize(
    "large_sample_data.jsonl",
    output_dir="output_large",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    config="llama_3_2_instruct_optimized",
    top_k=50,
    skip_deduplication=True  # Skip for speed test
)
end_time = time.perf_counter()

# Derive the reported size from the data itself instead of repeating a literal
elapsed_seconds = end_time - start_time
print(f"\nLarge dataset optimization:")
print(f"Dataset size: {len(large_sample)} examples")
print(f"Processing time: {elapsed_seconds:.2f} seconds")
print(f"Speed: {len(large_sample) / elapsed_seconds:.1f} examples/second")

## 6. Results Summary

In [None]:
# Final report: hardware facts plus the throughput measured by the speed test.
# Relies on `config`, `start_time` and `end_time` from earlier cells and on a
# CUDA device being present (device 0 is queried unconditionally).
print("\n" + "="*50)
print("LLAMA DATASET OPTIMIZER - A100 TEST RESULTS")
print("="*50)
print(f"✅ GPU: {torch.cuda.get_device_name(0)}")
print(f"✅ CUDA Version: {torch.version.cuda}")
print(f"✅ PyTorch Version: {torch.__version__}")
print(f"✅ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# The config cell only reads 'model_family', so 'model_name' may be absent from
# the YAML; fall back rather than raise KeyError — TODO confirm the schema.
model_label = config.get('model_name', config.get('model_family', 'unknown'))
print(f"✅ Model: {model_label}")

# The speed test generates exactly 100 examples (its range(100)) and reports
# speed as 100 / elapsed; use the same size here so this line cannot fail on
# an undefined name.
print(f"✅ Processing Speed: {100 / (end_time - start_time):.1f} examples/second")
print(f"✅ Memory Efficient: {torch.cuda.memory_allocated() / 1024**2:.1f} MB GPU memory used")
print("\nThe optimizer is running successfully on A100 GPU! 🚀")