# A100 GPU Optimized - Hidden Letters Classification
## Optimizations Applied:
- Mixed Precision Training (FP16)
- XLA Compiler
- Increased Batch Size (8 -> 64)
- tf.data API with prefetch
- cuDNN Auto-tune

# 1. Environment Setup & GPU Optimization

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Turn on memory growth for every visible GPU so device memory is claimed
# on demand instead of all at once (intended to prevent OOM).
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        print(f"GPU detected: {len(gpus)} device(s)")
    except RuntimeError as err:
        # Reported rather than raised so the remaining setup still runs.
        print(err)

# Request XLA JIT compilation from the TensorFlow optimizer.
tf.config.optimizer.set_jit(True)
print("XLA JIT Compilation: Enabled")

# Install 'mixed_float16' as the global Keras dtype policy and echo its
# name, compute dtype and variable dtype.
from tensorflow.keras import mixed_precision
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)
for label, value in (
    ("Mixed Precision Policy", policy.name),
    ("Compute dtype", policy.compute_dtype),
    ("Variable dtype", policy.variable_dtype),
):
    print(f"{label}: {value}")

# cuDNN-related environment flags.
# NOTE(review): these are written after TensorFlow has already been imported
# and the GPUs enumerated -- confirm the runtime still honours them at this
# point, and that they actually control auto-tuning as the message claims.
os.environ.update({
    'TF_CUDNN_DETERMINISTIC': '0',
    'TF_ENABLE_CUDNN_TENSOR_OP_MATH_FP32': '1',
})
print("cuDNN Auto-tune: Enabled")

# 2. Load Data

In [None]:
# Mount Google Drive at /content/drive so the dataset files referenced by
# the following cells are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Base path configuration.
# The folder under "Othercomputers" is the Korean name "내 PC". It must be
# written with the real UTF-8 characters: a mis-decoded spelling such as
# 'ë‚´ PC' does not match any directory on the mounted Drive.
base_path = '/content/drive/Othercomputers/내 PC/ROKEY_2526/dacon_hidden_letters/'

# Read the training set, the test set and the submission template.
train = pd.read_csv(base_path + 'train.csv')
test = pd.read_csv(base_path + 'test.csv')
sub = pd.read_csv(base_path + 'submission.csv')

# Report the dimensions of each frame as a quick sanity check.
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Submission shape: {sub.shape}")
print("\nData loaded successfully!")

# 3. Data Preprocessing

In [None]:
# Count the training samples per 'digit' class and list them in label order.
print("Label Distribution:")
digit_counts = train['digit'].value_counts()
print(digit_counts.sort_index())

In [None]:
# Target vector: the 'digit' column of the training frame.
train_labels = train['digit'].values

# Every remaining column (after removing ids, labels and 'letter') is treated
# as pixel data: reshape each row to 28x28x1, cast to float32 and scale by
# 1/255. The mixed-precision policy takes care of any FP16 casting later.
image_shape = (-1, 28, 28, 1)
train_features = (
    train.drop(columns=['id', 'digit', 'letter'])
    .values.reshape(image_shape)
    .astype(np.float32)
    / 255.0
)
test_features = (
    test.drop(columns=['id', 'letter'])
    .values.reshape(image_shape)
    .astype(np.float32)
    / 255.0
)

# Echo the resulting array shapes.
for label, array in (
    ("Train features", train_features),
    ("Test features", test_features),
    ("Train labels", train_labels),
):
    print(f"{label} shape: {array.shape}")

In [None]:
# Draw one training image (index 100) in grayscale with its label as title.
sample_idx = 100
sample_image = train_features[sample_idx].reshape(28, 28)

plt.figure(figsize=(4, 4))
plt.imshow(sample_image, cmap='gray')
plt.title(f"Label: {train_labels[sample_idx]}")
plt.axis('off')
plt.show()

# 4. Optimized Data Pipeline (tf.data API)

In [None]:
# Training hyperparameters (chosen for an A100).
BATCH_SIZE = 64          # raised from 8 for better GPU utilization
N_SPLITS = 40            # number of stratified K-fold splits
EPOCHS = 2000            # upper bound on epochs per fold
LEARNING_RATE = 0.002    # initial Adam learning rate
AUTOTUNE = tf.data.AUTOTUNE  # lets tf.data pick parallelism / buffer sizes

# Echo the configuration.
for label, value in (
    ("Batch Size", BATCH_SIZE),
    ("K-Fold Splits", N_SPLITS),
    ("Max Epochs", EPOCHS),
    ("Learning Rate", LEARNING_RATE),
):
    print(f"{label}: {value}")

In [None]:
@tf.function
def augment_image(image, label):
    """Randomly translate a 28x28x1 image by up to one pixel per axis.

    The image is first placed in the centre of a 30x30 canvas (one pixel of
    padding on every side) and a 28x28x1 window is then cropped at a random
    position, which is equivalent to a small random shift.

    Args:
        image: A single image tensor; the crop size assumes 28x28x1.
        label: The label paired with the image; passed through untouched.

    Returns:
        A ``(shifted_image, label)`` tuple.
    """
    side = 28
    margin = 1
    canvas_side = side + 2 * margin  # 30

    padded = tf.image.pad_to_bounding_box(
        image, margin, margin, canvas_side, canvas_side
    )
    shifted = tf.image.random_crop(padded, size=[side, side, 1])
    return shifted, label

def create_train_dataset(x, y, batch_size, augment=True):
    """Build the shuffled, batched and prefetched training pipeline.

    Args:
        x: Training inputs; sliced along the first axis.
        y: Labels aligned with ``x``.
        batch_size: Number of samples per batch.
        augment: When true, ``augment_image`` is applied to every sample
            before batching.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, labels)`` batches.
    """
    # The shuffle buffer spans the whole set and is redrawn every epoch.
    pipeline = tf.data.Dataset.from_tensor_slices((x, y)).shuffle(
        buffer_size=len(x),
        reshuffle_each_iteration=True,
    )
    if augment:
        pipeline = pipeline.map(augment_image, num_parallel_calls=AUTOTUNE)
    return pipeline.batch(batch_size).prefetch(AUTOTUNE)

def create_valid_dataset(x, y, batch_size):
    """Build the validation pipeline: batched and prefetched, in input order.

    Args:
        x: Validation inputs; sliced along the first axis.
        y: Labels aligned with ``x``.
        batch_size: Number of samples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, labels)`` batches.
    """
    pairs = tf.data.Dataset.from_tensor_slices((x, y))
    return pairs.batch(batch_size).prefetch(AUTOTUNE)

def create_test_dataset(x, batch_size):
    """Build the label-free inference pipeline: batched and prefetched.

    Args:
        x: Test inputs; sliced along the first axis.
        batch_size: Number of samples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding input batches in their original order.
    """
    samples = tf.data.Dataset.from_tensor_slices(x)
    return samples.batch(batch_size).prefetch(AUTOTUNE)

# Confirmation that the dataset helper functions in this cell are defined.
print("Data pipeline functions created.")

# 5. Model Architecture

In [None]:
def build_model():
    """Assemble the CNN classifier used for every cross-validation fold.

    The network takes 28x28x1 inputs, runs three convolutional stages
    (ReLU convolutions each followed by batch normalization, with 3x3 max
    pooling after the second and third stage and dropout of 0.3 after each
    stage), then two dense/batch-norm/dropout stages, and ends in a 10-way
    softmax layer that is explicitly kept in float32.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    def conv_bn(filters, kernel_size):
        # 'same'-padded ReLU convolution followed by batch normalization.
        return [
            Conv2D(filters, kernel_size, activation='relu', padding='same'),
            BatchNormalization(),
        ]

    def dense_bn_drop(units):
        # Fully connected ReLU layer, batch normalization, then dropout.
        return [
            Dense(units, activation='relu'),
            BatchNormalization(),
            Dropout(0.3),
        ]

    # Layers are instantiated strictly in network order.
    stack = [Input(shape=(28, 28, 1))]

    # Block 1
    stack += conv_bn(16, (3, 3))
    stack.append(Dropout(0.3))

    # Block 2: one 3x3 convolution followed by three 5x5 convolutions
    stack += conv_bn(32, (3, 3))
    for _ in range(3):
        stack += conv_bn(32, (5, 5))
    stack += [MaxPooling2D((3, 3)), Dropout(0.3)]

    # Block 3
    stack += conv_bn(64, (3, 3))
    stack += conv_bn(64, (5, 5))
    stack += [MaxPooling2D((3, 3)), Dropout(0.3)]

    # Dense layers
    stack.append(Flatten())
    stack += dense_bn_drop(128)
    stack += dense_bn_drop(64)

    # The softmax output stays in float32 for numerical stability under the
    # mixed-precision policy.
    stack.append(Dense(10, activation='softmax', dtype='float32'))

    return Sequential(stack)

# Build one model instance purely for inspection and print its
# layer-by-layer summary; the training loop builds its own model per fold.
sample_model = build_model()
sample_model.summary()

# 6. Training with K-Fold Cross Validation

In [None]:
%%time

# Stratified splitter: shuffled with a fixed seed so the folds are repeatable.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Callback instances shared by all folds.
reduce_lr = ReduceLROnPlateau(factor=0.5, patience=100, verbose=1)
early_stop = EarlyStopping(patience=160, restore_best_weights=True, verbose=1)

# Accumulators: best validation loss per fold, and the running average of
# the per-fold class probabilities predicted for the test set.
val_loss_min = []
result = np.zeros((len(test_features), 10), dtype=np.float32)

# The test pipeline does not depend on the fold, so it is built only once.
test_dataset = create_test_dataset(test_features, BATCH_SIZE)

print(f"Starting {N_SPLITS}-Fold Cross Validation...")
print("="*60)

fold_splits = skf.split(train_features, train_labels)
for nth, (train_index, valid_index) in enumerate(fold_splits, start=1):
    print(f"\n[Fold {nth}/{N_SPLITS}]")

    # Slice this fold's training and validation subsets.
    x_train, y_train = train_features[train_index], train_labels[train_index]
    x_valid, y_valid = train_features[valid_index], train_labels[valid_index]

    # Input pipelines: augmentation is applied to the training data only.
    train_dataset = create_train_dataset(x_train, y_train, BATCH_SIZE, augment=True)
    valid_dataset = create_valid_dataset(x_valid, y_valid, BATCH_SIZE)

    # Every fold starts from a freshly initialised network.
    # NOTE(review): no explicit loss scaling is configured here; presumably
    # Keras applies it automatically under the mixed_float16 policy --
    # confirm for the installed TensorFlow version.
    model = build_model()
    model.compile(
        optimizer=Adam(learning_rate=LEARNING_RATE, epsilon=1e-07),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    # Keep only the best epoch (by val_loss) of this fold on disk.
    weights_path = f'best_model_fold_{nth}.h5'
    checkpoint = ModelCheckpoint(
        weights_path,
        monitor='val_loss',
        save_best_only=True,
        verbose=0,
    )

    # Train
    history = model.fit(
        train_dataset,
        validation_data=valid_dataset,
        epochs=EPOCHS,
        callbacks=[early_stop, checkpoint, reduce_lr],
        verbose=1,
    )

    # Reload the checkpointed weights and add this fold's share of the
    # averaged test prediction.
    model.load_weights(weights_path)
    fold_pred = model.predict(test_dataset, verbose=0)
    result += fold_pred / N_SPLITS

    # Record the lowest validation loss reached in this fold.
    best_val_loss = min(history.history['val_loss'])
    val_loss_min.append(best_val_loss)
    print(f"Fold {nth} completed. Best val_loss: {best_val_loss:.4f}")

    # Drop Keras global state before the next fold to free memory.
    tf.keras.backend.clear_session()

print("\n" + "="*60)
print("Training Complete!")

# 7. Results Analysis

In [None]:
# Report the best validation loss of each fold, then the mean and standard
# deviation across folds.
print("Validation Loss per Fold:")
for fold_no, fold_loss in enumerate(val_loss_min, start=1):
    print(f"  Fold {fold_no:2d}: {fold_loss:.4f}")

loss_mean = np.mean(val_loss_min)
loss_std = np.std(val_loss_min)
print(f"\nMean Validation Loss: {loss_mean:.4f}")
print(f"Std Validation Loss: {loss_std:.4f}")

# 8. Submission

In [None]:
# The predicted digit is the class with the highest fold-averaged probability.
sub['digit'] = np.argmax(result, axis=1)

# Show how many test samples were assigned to each class, in label order.
print("Prediction Distribution:")
predicted_counts = sub['digit'].value_counts()
print(predicted_counts.sort_index())

In [None]:
# Render the first ten rows of the submission frame in the notebook.
print("Submission Preview:")
preview_rows = 10
display(sub.head(preview_rows))

In [None]:
# Write the submission frame to a CSV file in the current working directory.
# index=False keeps the DataFrame index out of the file.
output_filename = 'submission_optimized_A100.csv'
sub.to_csv(output_filename, index=False)
print(f"Submission saved to: {output_filename}")

# 9. Performance Comparison Notes

## Optimization Summary:

| Optimization | Original | Optimized | Expected Speedup |
|-------------|----------|-----------|------------------|
| Precision | FP32 | Mixed FP16 | ~2-3x |
| Batch Size | 8 | 64 | ~3-5x |
| Data Pipeline | ImageDataGenerator | tf.data + prefetch | ~1.5-2x |
| XLA Compilation | Disabled | Enabled | ~1.2-1.5x |

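**Total Expected Speedup: roughly 5-10x faster on A100 GPU** (an overall estimate; the per-optimization figures above overlap and are not meant to be multiplied together)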

## Notes:
- Mixed precision training uses Tensor Cores on A100 for FP16 matrix operations
- Output layer uses FP32 for numerical stability (softmax)
- Larger batch size improves GPU utilization efficiency
- tf.data prefetch allows overlapping data loading with GPU computation