# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.utils import class_weight
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import f1_score
import tensorflow as tf

# Reproducibility: seed both the NumPy and the TensorFlow global RNGs
# with the same value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# 1. Load Data
# Both CSV files are read from the current working directory.
train_df, test_df = pd.read_csv('train.csv'), pd.read_csv('test.csv')

# ==========================================
# 4.6 FEATURE ENGINEERING (NEW SECTION)
# ==========================================
# New features are added to the full train and test frames BEFORE X and y are
# separated, so both datasets end up with the same engineered columns.
def create_features(df):
    """Return a copy of ``df`` with two engineered columns appended.

    The input frame is left untouched. The added columns are:

    * ``total_activity_score`` -- element-wise sum of
      ``hobby_engagement_level``, ``physical_activity_index`` and
      ``creative_expression_index``.
    * ``support_x_guidance`` -- element-wise product of
      ``support_environment_score`` and ``external_guidance_usage``
      (an interaction term between support and guidance usage).

    Raises ``KeyError`` if any of the five source columns is missing.
    """
    activity_total = (
        df['hobby_engagement_level']
        + df['physical_activity_index']
        + df['creative_expression_index']
    )
    support_guidance = (
        df['support_environment_score'] * df['external_guidance_usage']
    )
    # assign() works on a copy, so the caller's frame is never mutated.
    return df.assign(
        total_activity_score=activity_total,
        support_x_guidance=support_guidance,
    )

# Run the same feature engineering over the train and the test frame so
# both carry identical engineered columns.
train_df, test_df = (create_features(frame) for frame in (train_df, test_df))

# Quick visual check of the engineered columns on the training data.
print("--- Added New Features ---")
engineered_cols = ['total_activity_score', 'support_x_guidance']
print(train_df[engineered_cols].head())

# 2. Preprocessing
# Separate features and target. The identifier column carries no signal
# and is dropped from both feature frames.
X = train_df.drop(['participant_id', 'personality_cluster'], axis=1)
y = train_df['personality_cluster']
X_test_orig = test_df.drop(['participant_id'], axis=1)

# Encode target labels as consecutive integers; `le` is kept so that
# predictions can be mapped back to the original label strings later.
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Define Feature Groups
# The engineered features are listed here so they get scaled as well.
numeric_cols = ['age_group', 'upbringing_influence', 'focus_intensity',
                'consistency_score', 'support_environment_score',
                'total_activity_score', 'support_x_guidance']

categorical_cols = ['cultural_background']
# Every column not listed above is passed through unchanged.

# Build Preprocessing Pipeline
# sparse_threshold=0 forces a dense ndarray: OneHotEncoder emits sparse
# output, and the Dense network downstream expects dense input.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough',
    sparse_threshold=0
)

# Split into Train and Validation Sets (stratified to keep class ratios).
# The split happens on the RAW frame, before any fitting, so that the
# scaler statistics and encoder categories are learned from the training
# rows only and the validation score is not contaminated.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

# Fit on the training split only; every other set is transform-only.
X_train = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)
X_test_processed = preprocessor.transform(X_test_orig)

# Full training matrix, kept for any later code that still refers to
# X_processed. It is produced with the train-split-fitted preprocessor.
X_processed = preprocessor.transform(X)

# 3. Handle Class Imbalance
# 'balanced' weights are inversely proportional to class frequency.
# LabelEncoder yields labels 0..n_classes-1, so enumerate() gives the
# {class_index: weight} mapping that Keras expects.
class_weights = class_weight.compute_class_weight(
    'balanced', classes=np.unique(y_enc), y=y_enc
)
class_weight_dict = dict(enumerate(class_weights))
print("Class Weights:", class_weight_dict)

# 4. Build Pure Neural Network Model
# Input width follows the preprocessed feature matrix; output width
# follows the number of classes seen by the label encoder.
n_features = X_train.shape[1]
n_classes = len(le.classes_)

network_layers = [layers.Input(shape=(n_features,))]

# Hidden Layers 1 and 2: Dense -> BatchNormalization -> Dropout(0.3)
for units in (128, 64):
    network_layers += [
        layers.Dense(units, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
    ]

# Hidden Layer 3: plain Dense, no normalisation or dropout
network_layers.append(layers.Dense(32, activation='relu'))

# Output Layer: one softmax unit per class
network_layers.append(layers.Dense(n_classes, activation='softmax'))

model = keras.Sequential(network_layers)

# The sparse loss matches the integer-encoded targets (no one-hot needed).
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# 5. Train the Model
# class_weight re-weights the loss per class; this matters because the
# model is judged on macro F1 below.
fit_options = {
    'validation_data': (X_val, y_val),
    'epochs': 50,
    'batch_size': 32,
    'class_weight': class_weight_dict,
    'verbose': 1,
}
history = model.fit(X_train, y_train, **fit_options)

# Check Validation Score Locally
# The predicted class is the index of the highest softmax probability.
val_probabilities = model.predict(X_val)
val_preds = val_probabilities.argmax(axis=1)
val_f1 = f1_score(y_val, val_preds, average='macro')
print(f"Local Validation Macro F1: {val_f1:.4f}")

# 6. Generate Predictions
# Class probabilities -> winning class index -> original label string.
test_preds_prob = model.predict(X_test_processed)
test_preds = test_preds_prob.argmax(axis=1)
test_labels = le.inverse_transform(test_preds)

# 7. Create Submission File
# Two columns: the participant id from the test set and its predicted label.
submission = test_df[['participant_id']].assign(personality_cluster=test_labels)
submission_path = 'submission_aa_nn_updated.csv'
submission.to_csv(submission_path, index=False)
print("Submission saved successfully!")