# Strategy 2: Dense Networks with Progressive Reduction

**Based on Professor's Top Submissions (Avg Score: 73.2)**

## Architecture Features:
- Dense MLP with progressive width reduction
- NO attention layers (simpler, faster)
- NO residual connections (pure feedforward)
- BatchNorm only (no LayerNorm)
- Progressive dropout (0.3 → 0.2 → 0.1)
- Width: 1024→768→512→384→256→128
- AdamW optimizer with weight decay

## Data Strategy:
- **CRITICAL: Split FIRST, then augment ONLY training data**
- Heavy augmentation on training set: 3x multiplier (each original sample is kept and two augmented copies are added)
- Techniques: Gaussian noise, perturbations, SMOTE-like interpolation
- Validation set: UNTOUCHED (no augmentation)

In [None]:
import numpy as np
import pickle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import (
    Input, Dense, Embedding, Flatten, Concatenate,
    Dropout, BatchNormalization, Lambda
)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import (
    EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
)
from tensorflow.keras.optimizers import AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from collections import defaultdict
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Reproducibility: seed NumPy's global RNG and TensorFlow's global seed.
np.random.seed(42)
tf.random.set_seed(42)

# Banner, followed by the runtime environment details.
print(
    "="*70,
    "STRATEGY 2: DENSE NETWORKS WITH PROGRESSIVE REDUCTION",
    "="*70,
    sep="\n",
)
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {tf.config.list_physical_devices('GPU')}")

## Step 1: Load Data

In [None]:
print("\nSTEP 1: Loading Data", "-"*70, sep="\n")

# NOTE(security): pickle.load can execute arbitrary code while
# deserializing; only point these paths at trusted files.

# Model inputs: one entry per sample, indexed below as n, k, m, P.
with open('data/combined_final_n_k_m_P.pkl', 'rb') as f:
    inputs_raw = pickle.load(f)

# Regression targets paired with the inputs above.
with open('data/combined_final_mHeights.pkl', 'rb') as f:
    outputs_raw = pickle.load(f)

# Quick sanity summary: dataset size, first sample, and target range.
print(f"Raw samples: {len(inputs_raw)}")
print(f"Sample: n={inputs_raw[0][0]}, k={inputs_raw[0][1]}, m={inputs_raw[0][2]}, P={inputs_raw[0][3].shape}")
print(f"Target range: [{np.min(outputs_raw):.2f}, {np.max(outputs_raw):.2f}]")

## Step 2: Split Data FIRST (NO AUGMENTATION YET)

In [None]:
print("\nSTEP 2: Split Data FIRST (before augmentation)", "-"*70, sep="\n")

# One integer label per sample, combining k and m, so the split keeps
# the (k, m) proportions the same in both partitions.
stratify_labels = [k * 10 + m for _n, k, m, _P in inputs_raw]

# The split happens before any augmentation, so augmented copies of a
# training sample can never end up in the validation set.
inputs_train, inputs_val, outputs_train, outputs_val = train_test_split(
    inputs_raw,
    outputs_raw,
    test_size=0.15,
    random_state=42,
    stratify=stratify_labels,
)

print(f"Training samples (before augmentation): {len(inputs_train)}")
print(f"Validation samples (NO augmentation): {len(inputs_val)}")
print("\n⚠️  CRITICAL: Validation data will NOT be augmented (prevents data leakage)")

## Step 3: Augment ONLY Training Data (3x)

In [None]:
# Section header for the augmentation stage.
print("\nSTEP 3: Augment ONLY Training Data (3x multiplier)", "-"*70, sep="\n")

def augment_sample_gaussian(n, k, m, P, target, noise_level=0.03):
    """Return a copy of the sample with Gaussian noise added to ``P``.

    The noise is zero-mean with standard deviation ``noise_level`` and is
    scaled by the standard deviation of ``P`` itself, so the jitter is
    relative to the spread of the matrix. ``P`` is not modified.

    Args:
        n, k, m: Sample parameters, passed through unchanged.
        P: Array to perturb.
        target: Label for the sample, passed through unchanged.
        noise_level: Standard deviation of the noise before scaling.

    Returns:
        ``([n, k, m, P_noisy], target)``.
    """
    base = P.astype(np.float32)  # astype copies, leaving the caller's P intact
    jitter = np.random.normal(0, noise_level, size=base.shape)
    spread = np.std(base)
    return [n, k, m, base + jitter * spread], target

def augment_sample_perturbation(n, k, m, P, target, strength=0.02):
    """Return a copy of the sample with ``P`` rescaled element-wise.

    Every element of ``P`` is multiplied by an independent factor drawn
    uniformly from ``[1 - strength, 1 + strength)``. ``P`` is not modified.

    Args:
        n, k, m: Sample parameters, passed through unchanged.
        P: Array to perturb.
        target: Label for the sample, passed through unchanged.
        strength: Half-width of the relative perturbation.

    Returns:
        ``([n, k, m, P_scaled], target)``.
    """
    base = P.astype(np.float32)  # astype copies, leaving the caller's P intact
    factors = 1 + np.random.uniform(-strength, strength, size=base.shape)
    return [n, k, m, base * factors], target

def augment_sample_interpolation(sample1, sample2, target1, target2):
    """Blend two samples that share the same ``k`` and ``m``.

    A mixing weight ``alpha`` is drawn uniformly from ``[0.3, 0.7)``; the
    new ``P`` and the new target are the same convex combination of the
    two inputs. ``n``, ``k`` and ``m`` are taken from ``sample1``.

    Args:
        sample1, sample2: Samples indexed as ``[n, k, m, P]``.
        target1, target2: Labels of the two samples.

    Returns:
        ``([n, k, m, P_mixed], target_mixed)``, or ``(None, None)`` when
        the two samples differ in ``k`` or ``m``.
    """
    # Drawn before the compatibility check, so the global RNG advances
    # by one draw on every call, including rejected pairs.
    alpha = np.random.uniform(0.3, 0.7)
    n, k, m, P1 = sample1[0], sample1[1], sample1[2], sample1[3]
    P2 = sample2[3]

    if k != sample2[1] or m != sample2[2]:
        return None, None

    beta = 1 - alpha
    P_mixed = alpha * P1.astype(np.float32) + beta * P2.astype(np.float32)
    target_mixed = alpha * target1 + beta * target2
    return [n, k, m, P_mixed], target_mixed

# Index the TRAINING samples by their (k, m) pair so interpolation
# partners can be looked up per group.
train_groups = defaultdict(list)
for i, sample in enumerate(inputs_train):
    train_groups[(sample[1], sample[2])].append(i)

inputs_train_aug = []
outputs_train_aug = []

# Every training sample contributes three entries, in this order:
# the original, a Gaussian-noise copy, and a perturbed or interpolated copy.
for i, (sample, target) in enumerate(zip(inputs_train, outputs_train)):
    n, k, m, P = sample

    # 1) Original sample, untouched.
    inputs_train_aug.append(sample)
    outputs_train_aug.append(target)

    # 2) Gaussian-noise copy.
    aug1, tgt1 = augment_sample_gaussian(n, k, m, P, target)
    inputs_train_aug.append(aug1)
    outputs_train_aug.append(tgt1)

    # 3) Coin flip: below 0.5 perturb, otherwise try to interpolate with
    #    another sample from the same (k, m) group.
    aug2, tgt2 = None, None
    if np.random.rand() >= 0.5:
        group_indices = train_groups[(k, m)]
        if len(group_indices) > 1:
            j = np.random.choice([idx for idx in group_indices if idx != i])
            aug2, tgt2 = augment_sample_interpolation(
                sample, inputs_train[j], target, outputs_train[j]
            )

    # Perturbation is the primary choice and also the fallback whenever
    # interpolation was not possible (singleton group or rejected pair).
    if aug2 is None:
        aug2, tgt2 = augment_sample_perturbation(n, k, m, P, target)
    inputs_train_aug.append(aug2)
    outputs_train_aug.append(tgt2)

print(f"Training samples after augmentation: {len(inputs_train_aug)}")
print(f"Augmentation ratio: {len(inputs_train_aug) / len(inputs_train):.2f}x")
print(f"Validation samples (unchanged): {len(inputs_val)}")
print("\n✅ NO DATA LEAKAGE: Validation set is completely independent")

## Step 4: Prepare Data for Training

In [None]:
# Section header for the array-preparation stage.
print("\nSTEP 4: Prepare Data for Training", "-"*70, sep="\n")

def prepare_data(inputs, outputs, p_size=None):
    """Convert raw samples into the dense arrays fed to the model.

    Args:
        inputs: Sequence of samples indexed as ``[n, k, m, P]``, where
            ``P`` is array-like and gets flattened.
        outputs: Sequence of targets, one per sample.
        p_size: Width to zero-pad every flattened ``P`` to. When ``None``
            (the default) the longest flattened ``P`` in ``inputs`` is
            used. Pass the training width when preparing validation or
            test data so all splits share one feature dimension; otherwise
            each call derives its own width and the splits can disagree.

    Returns:
        Tuple ``(n_vals, k_vals, m_vals, P_arr, outputs_arr)``:
        ``n_vals`` is float32 of shape ``(N, 1)``; ``k_vals`` and
        ``m_vals`` are int32 of shape ``(N, 1)``; ``P_arr`` is float32 of
        shape ``(N, width)`` with trailing zero padding; ``outputs_arr``
        is float32 of shape ``(N,)`` with every value clamped to be at
        least 1.0.

    Raises:
        ValueError: If a flattened ``P`` is longer than ``p_size``.
    """
    n_vals = np.array([s[0] for s in inputs], dtype=np.float32).reshape(-1, 1)
    k_vals = np.array([s[1] for s in inputs], dtype=np.int32).reshape(-1, 1)
    m_vals = np.array([s[2] for s in inputs], dtype=np.int32).reshape(-1, 1)
    P_flat = [np.asarray(s[3], dtype=np.float32).ravel() for s in inputs]

    # default=0 keeps an empty dataset from crashing max().
    longest = max((p.size for p in P_flat), default=0)
    if p_size is None:
        width = longest
    elif longest > p_size:
        raise ValueError(
            f"flattened P of length {longest} does not fit in p_size={p_size}"
        )
    else:
        width = p_size

    # Zero-pad by writing each flattened P into a preallocated zero row.
    P_arr = np.zeros((len(P_flat), width), dtype=np.float32)
    for row, p in zip(P_arr, P_flat):
        row[:p.size] = p

    # Clamp targets to >= 1.0, which keeps log2(target) non-negative.
    outputs_arr = np.maximum(np.array(outputs, dtype=np.float32), 1.0)

    return n_vals, k_vals, m_vals, P_arr, outputs_arr

# Build model-ready arrays for the augmented training set and for the
# untouched validation set.
(n_train, k_train, m_train,
 P_train, y_train) = prepare_data(inputs_train_aug, outputs_train_aug)
(n_val, k_val, m_val,
 P_val, y_val) = prepare_data(inputs_val, outputs_val)

# Standardize P column-wise. Statistics come from the training data only
# and are then applied to both splits.
scaler = StandardScaler().fit(P_train)
P_train = scaler.transform(P_train)
P_val = scaler.transform(P_val)

print(f"Training: n={n_train.shape}, k={k_train.shape}, m={m_train.shape}, P={P_train.shape}, y={y_train.shape}")
print(f"Validation: n={n_val.shape}, k={k_val.shape}, m={m_val.shape}, P={P_val.shape}, y={y_val.shape}")
print(f"P train: mean={P_train.mean():.4f}, std={P_train.std():.4f}")
print(f"P val: mean={P_val.mean():.4f}, std={P_val.std():.4f}")

## Step 5: Build Model (Dense Progressive)

In [None]:
# Section header for the model-construction stage.
print("\nSTEP 5: Building Dense Progressive Model", "-"*70, sep="\n")

def build_model(p_shape, k_vocab_size=7, m_vocab_size=6):
    """Build the dense, progressively narrowing regression network.

    Structure:
      * ``k`` and ``m`` each pass through a 32-dimensional embedding.
      * The flattened ``P`` vector passes through two dense stages
        (256, then 512 units).
      * ``n``, both embeddings and the processed ``P`` are concatenated
        and fed through six dense stages: 1024→768→512→384→256→128.
      * Each dense stage is Dense(gelu) → BatchNormalization → Dropout.
        There are no attention layers, residual connections or LayerNorm.
      * The head predicts a value ``z``, applies softplus, and outputs
        ``2 ** softplus(z)``.

    Args:
        p_shape: Length of the flattened (padded) ``P`` vector.
        k_vocab_size: Number of rows in the ``k`` embedding table.
        m_vocab_size: Number of rows in the ``m`` embedding table.

    Returns:
        An uncompiled Keras ``Model`` taking ``[n, k, m, P_flat]``.
    """
    def dense_stage(tensor, units, drop_rate):
        # One Dense(gelu) → BatchNorm → Dropout stage.
        tensor = Dense(units, activation='gelu')(tensor)
        tensor = BatchNormalization()(tensor)
        return Dropout(drop_rate)(tensor)

    n_input = Input(shape=(1,), name='n')
    k_input = Input(shape=(1,), name='k', dtype=tf.int32)
    m_input = Input(shape=(1,), name='m', dtype=tf.int32)
    P_input = Input(shape=(p_shape,), name='P_flat')

    # Embeddings for the integer-valued k and m inputs.
    k_embed = Embedding(k_vocab_size, 32)(k_input)
    k_embed = Flatten()(k_embed)
    m_embed = Embedding(m_vocab_size, 32)(m_input)
    m_embed = Flatten()(m_embed)

    # P processing (simple, no attention).
    x = P_input
    for units, drop_rate in ((256, 0.3), (512, 0.2)):
        x = dense_stage(x, units, drop_rate)

    # Combine all features.
    x = Concatenate()([n_input, k_embed, m_embed, x])

    # Dense progressive reduction with dropout tapering from 0.3 to 0.1.
    trunk = (
        (1024, 0.3),
        (768, 0.3),
        (512, 0.2),
        (384, 0.2),
        (256, 0.1),
        (128, 0.1),
    )
    for units, drop_rate in trunk:
        x = dense_stage(x, units, drop_rate)

    # Log-space prediction (same as strategy 1): softplus keeps the
    # exponent positive before it is raised as a power of two.
    log2_pred = Dense(1, activation='linear')(x)
    log2_positive = Lambda(lambda z: tf.nn.softplus(z))(log2_pred)
    m_height = Lambda(lambda z: tf.pow(2.0, z))(log2_positive)

    return Model(
        inputs=[n_input, k_input, m_input, P_input],
        outputs=m_height,
        name='strategy2_dense_progressive',
    )

# Size the embedding tables from the largest k and m seen in training.
model = build_model(
    P_train.shape[1],
    k_vocab_size=k_train.max()+1,
    m_vocab_size=m_train.max()+1,
)
print(f"Parameters: {model.count_params():,}")
model.summary()

## Step 6: Train

In [None]:
# Section header for the training stage.
print("\nSTEP 6: Compile and Train", "-"*70, sep="\n")

def log2_mse_loss(y_true, y_pred):
    """Mean squared error between ``log2(y_true)`` and ``log2(y_pred)``.

    Both tensors are flattened before they are compared. Without this, a
    rank mismatch such as targets of shape ``(batch,)`` against
    predictions of shape ``(batch, 1)`` would broadcast the difference to
    ``(batch, batch)`` and silently produce a wrong value. For inputs
    whose shapes already match, the result is unchanged.

    Args:
        y_true: Ground-truth values; cast to the dtype of ``y_pred``.
        y_pred: Predicted values.

    Returns:
        Scalar tensor holding the mean squared log2 error.
    """
    epsilon = 1e-7
    y_pred = tf.reshape(tf.convert_to_tensor(y_pred), [-1])
    y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), [-1])

    # Clamp away from zero so the logarithms stay finite.
    y_true = tf.maximum(y_true, epsilon)
    y_pred = tf.maximum(y_pred, epsilon)

    # log2(x) = ln(x) / ln(2)
    ln2 = tf.math.log(2.0)
    log2_true = tf.math.log(y_true) / ln2
    log2_pred = tf.math.log(y_pred) / ln2
    return tf.reduce_mean(tf.square(log2_true - log2_pred))

# AdamW with decoupled weight decay; gradients are clipped to unit norm.
optimizer = AdamW(
    learning_rate=1e-3,
    weight_decay=1e-4,
    clipnorm=1.0,
)
model.compile(
    optimizer=optimizer,
    loss=log2_mse_loss,
    metrics=[log2_mse_loss],
)

# All three callbacks watch the validation loss.
callbacks = [
    # Stop after 30 epochs without improvement and roll back to the best weights.
    EarlyStopping(
        monitor='val_loss',
        patience=30,
        restore_best_weights=True,
        verbose=1,
    ),
    # Multiply the learning rate by 0.7 after 15 stagnant epochs, down to 1e-6.
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.7,
        patience=15,
        min_lr=1e-6,
        verbose=1,
    ),
    # Save to disk whenever the validation loss improves.
    ModelCheckpoint(
        'strategy2_best_model.h5',
        monitor='val_loss',
        save_best_only=True,
        verbose=1,
    ),
]

history = model.fit(
    [n_train, k_train, m_train, P_train],
    y_train,
    validation_data=([n_val, k_val, m_val, P_val], y_val),
    epochs=200,
    batch_size=256,
    callbacks=callbacks,
    verbose=1,
)

print("\nTraining completed!")

## Step 7: Evaluate

In [None]:
print("\nSTEP 7: Evaluation", "-"*70, sep="\n")

# Reload the checkpoint written during training, then predict on the
# validation set and flatten the result to a 1-D array.
model.load_weights('strategy2_best_model.h5')
y_pred_val = model.predict(
    [n_val, k_val, m_val, P_val],
    verbose=0,
).flatten()

def compute_log2_mse(y_true, y_pred):
    """Return the mean squared difference of ``log2`` values (NumPy).

    Both inputs are clamped to at least ``1e-7`` before the logarithm so
    that zero or negative entries do not yield ``-inf`` or ``nan``.

    Args:
        y_true: Array of ground-truth values.
        y_pred: Array of predicted values.

    Returns:
        Mean of ``(log2(y_true) - log2(y_pred)) ** 2``.
    """
    floor = 1e-7
    log_true = np.log2(np.maximum(y_true, floor))
    log_pred = np.log2(np.maximum(y_pred, floor))
    residual = log_true - log_pred
    return np.mean(residual ** 2)

# Overall validation metric.
val_log2_mse = compute_log2_mse(y_val, y_pred_val)

print(f"\nValidation log2-MSE: {val_log2_mse:.6f}")
print(f"Prediction range: [{y_pred_val.min():.2f}, {y_pred_val.max():.2f}]")

# Bucket the true and predicted values by their (k, m) group.
group_metrics = defaultdict(lambda: {'true': [], 'pred': []})
for k, m, true_value, pred_value in zip(k_val[:, 0], m_val[:, 0], y_val, y_pred_val):
    bucket = group_metrics[(k, m)]
    bucket['true'].append(true_value)
    bucket['pred'].append(pred_value)

print("\nPer-Group Performance:")
print(f"{'Group':<12} {'n_val':<8} {'log2-MSE':<12}")
print("-"*40)
# One row per group, in ascending (k, m) order.
for k, m in sorted(group_metrics):
    data = group_metrics[(k, m)]
    group_mse = compute_log2_mse(np.array(data['true']), np.array(data['pred']))
    print(f"k={k}, m={m}    {len(data['true']):6d}   {group_mse:.6f}")

print("\n" + "="*70)
print("STRATEGY 2 COMPLETE")
print("="*70)