# In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.utils import class_weight

# Seed both the NumPy and TensorFlow random number generators with the
# same fixed value so repeated runs start from the same RNG state.
for seed_fn in (np.random.seed, tf.random.set_seed):
    seed_fn(42)

# ==========================================
# 1. LOAD DATA
# ==========================================
# Both CSV files are read from the current working directory.
train_df, test_df = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# ==========================================
# 2. ADVANCED FEATURE ENGINEERING
# ==========================================
def create_features(df):
    """Return a copy of ``df`` extended with five derived columns.

    The input frame is not modified. The new columns are appended in this
    order:

    * ``activity_score``      - hobby + physical activity + creative expression
    * ``support_total``       - support environment + external guidance
      + upbringing influence
    * ``efficiency``          - consistency / (focus intensity + 1.0)
    * ``focus_per_support``   - focus intensity / (support environment + 1.0)
    * ``consistency_per_age`` - consistency / age group (no offset applied)
    """
    enriched = df.copy()

    # Columns that feed more than one derived feature.
    focus = enriched['focus_intensity']
    consistency = enriched['consistency_score']
    support_env = enriched['support_environment_score']

    derived = {
        # Additive composites.
        'activity_score': enriched['hobby_engagement_level']
        .add(enriched['physical_activity_index'])
        .add(enriched['creative_expression_index']),
        'support_total': support_env
        .add(enriched['external_guidance_usage'])
        .add(enriched['upbringing_influence']),
        # Ratios whose denominators are shifted by 1.0.
        'efficiency': consistency.div(focus.add(1.0)),
        'focus_per_support': focus.div(support_env.add(1.0)),
        # Plain ratio: the denominator is used as-is.
        'consistency_per_age': consistency.div(enriched['age_group']),
    }

    # Dict order is insertion order, so columns land in the order above.
    for column_name, values in derived.items():
        enriched[column_name] = values

    return enriched

print("Generating Base Features...")
# Rebind both frames to their feature-augmented copies (train first, then test).
train_df, test_df = (create_features(frame) for frame in (train_df, test_df))

# ==========================================
# 3. PCA-GUIDED CLUSTERING FEATURES
# ==========================================
print("Generating PCA-Cluster Features...")

# Stack train and test rows (ids and target removed) so the scaler, PCA and
# KMeans models below are all fitted on the combined data. Train rows come
# first, which is what the positional split further down relies on.
train_temp = train_df.drop(columns=['personality_cluster', 'participant_id'])
test_temp = test_df.drop(columns=['participant_id'])
full_data = pd.concat([train_temp, test_temp], axis=0, ignore_index=True)

# Features the cluster structure is learned from.
cluster_cols = [
    'focus_intensity',
    'consistency_score',
    'efficiency',
    'activity_score',
    'support_total',
    'focus_per_support',
]

# Standardise, then project onto the components covering 95% of the variance.
scaler_pca = StandardScaler()
full_scaled = scaler_pca.fit_transform(full_data[cluster_cols])

pca = PCA(n_components=0.95, random_state=42)
full_pca = pca.fit_transform(full_scaled)

# Two granularities: 5 clusters (target aligned) and 8 clusters (sub-types).
kmeans_5 = KMeans(n_clusters=5, random_state=42, n_init=20)
kmeans_8 = KMeans(n_clusters=8, random_state=42, n_init=20)

n_train = len(train_df)
for cluster_col, clusterer in (('cluster_pca_5', kmeans_5),
                               ('cluster_pca_8', kmeans_8)):
    cluster_ids = clusterer.fit_predict(full_pca)
    full_data[cluster_col] = cluster_ids
    # First n_train labels belong to train rows, the remainder to test rows.
    train_df[cluster_col] = cluster_ids[:n_train]
    test_df[cluster_col] = cluster_ids[n_train:]

# ==========================================
# 4. PREPROCESSING
# ==========================================
# Separate features from the id and target columns.
X = train_df.drop(['participant_id', 'personality_cluster'], axis=1)
y = train_df['personality_cluster']
X_test = test_df.drop(['participant_id'], axis=1)

# Encode Target: map the original labels to integer class indices
# (le.inverse_transform maps predictions back later).
le = LabelEncoder()
y_enc = le.fit_transform(y)
num_classes = len(le.classes_)

# Identify Feature Groups: the cluster ids are treated as categories, and
# everything that is not listed as categorical is standard-scaled.
categorical_cols = ['cultural_background', 'cluster_pca_5', 'cluster_pca_8']
numeric_cols = [c for c in X.columns if c not in categorical_cols]

# Build Transformer
# sparse_threshold=0 forces a dense ndarray. With the default (0.3) the
# output silently becomes a scipy sparse matrix whenever the one-hot block
# makes the overall density low enough, so the array type handed to the
# fold indexing and to Keras would depend on category cardinality.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    sparse_threshold=0
)

print("Preprocessing Data...")
# Fit on the training features only; the test set is transformed with the
# statistics and categories learned there.
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test)

# ==========================================
# 5. NEURAL NETWORK ENSEMBLING (K-FOLD)
# ==========================================
# Instead of one split, we use 5 splits and average the results.
N_FOLDS = 5
kfold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# Running sum of per-fold test probabilities (averaged after the loop).
test_probs_sum = np.zeros((X_test_processed.shape[0], num_classes))
# Out-of-fold predicted class indices: each training row is filled in by
# the one fold in which it was held out for validation.
oof_preds = np.zeros(X_processed.shape[0], dtype=int)
cv_scores = []

# The full set of encoded classes does not change between folds.
all_classes = np.unique(y_enc)

print(f"\nStarting {N_FOLDS}-Fold Cross-Validation Ensemble...")

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_processed, y_enc)):
    # 1. Split Data
    X_train_fold, X_val_fold = X_processed[train_idx], X_processed[val_idx]
    y_train_fold, y_val_fold = y_enc[train_idx], y_enc[val_idx]

    # 2. Compute Class Weights for this fold
    # 'balanced' weights are derived from this fold's training labels only.
    class_weights = class_weight.compute_class_weight(
        'balanced', classes=all_classes, y=y_train_fold
    )
    # Keras expects a {class_index: weight} mapping.
    class_weight_dict = dict(enumerate(class_weights))

    # 3. Define Model Architecture (Fresh for each fold)
    model = keras.Sequential([
        layers.Input(shape=(X_train_fold.shape[1],)),

        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.4),

        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),

        layers.Dense(32, activation='relu'),

        layers.Dense(num_classes, activation='softmax')
    ])

    # Sparse categorical cross-entropy matches the integer-encoded targets.
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # 4. Train with Early Stopping
    # Stops after 10 epochs without val_loss improvement and restores the
    # weights from the best epoch.
    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=10, restore_best_weights=True, verbose=0
    )

    print(f"Training Fold {fold+1}...")
    model.fit(
        X_train_fold, y_train_fold,
        validation_data=(X_val_fold, y_val_fold),
        epochs=60,
        batch_size=32,
        class_weight=class_weight_dict,
        callbacks=[early_stopping],
        verbose=0 # Silent training
    )

    # 5. Evaluate on Validation
    val_probs = model.predict(X_val_fold, verbose=0)
    val_pred_labels = np.argmax(val_probs, axis=1)
    # Record the held-out predictions; previously oof_preds was allocated
    # but never written, so it stayed all zeros.
    oof_preds[val_idx] = val_pred_labels
    score = f1_score(y_val_fold, val_pred_labels, average='macro')
    cv_scores.append(score)
    print(f"  Fold {fold+1} F1 Score: {score:.4f}")

    # 6. Predict on Test (Accumulate Probabilities)
    test_probs_sum += model.predict(X_test_processed, verbose=0)

# Macro-F1 over all out-of-fold predictions pooled together (complements
# the per-fold scores collected in cv_scores).
print(f"Overall OOF F1 Score: {f1_score(y_enc, oof_preds, average='macro'):.4f}")

# ==========================================
# 6. FINAL PREDICTION
# ==========================================
print(f"\nAverage CV F1 Score: {np.mean(cv_scores):.4f}")

# Turn the accumulated per-fold probabilities into a mean, pick the most
# probable class per row, and map class indices back to the original labels.
avg_test_probs = test_probs_sum / N_FOLDS
final_preds = avg_test_probs.argmax(axis=1)
final_labels = le.inverse_transform(final_preds)

# Save Submission: one row per test participant, id column first.
filename = 'submission_nn_ensemble_advanced.csv'
submission = test_df[['participant_id']].assign(personality_cluster=final_labels)
submission.to_csv(filename, index=False)
print(f"Ensemble Submission Saved: {filename}")