In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score

# ==========================================
# 1. LOAD DATA
# ==========================================
# Read both competition files; the relative paths are resolved against the
# current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# ==========================================
# 2. FEATURE ENGINEERING
# ==========================================
def create_features(df):
    """Return a copy of ``df`` with derived score and ratio columns added.

    The input frame is left untouched; all new columns are written to a copy.

    Added columns:
        activity_score: ``hobby_engagement_level + physical_activity_index
            + creative_expression_index``.
        support_total: ``support_environment_score + external_guidance_usage
            + upbringing_influence``.
        efficiency: ``consistency_score / (focus_intensity + 1.0)``.
        focus_per_support: ``focus_intensity /
            (support_environment_score + 1.0)``.
        consistency_per_age: ``consistency_score / age_group``. Rows whose
            ``age_group`` is exactly 0 get NaN rather than +/-inf.

    Args:
        df: DataFrame holding every source column named above.

    Returns:
        A new DataFrame with the original columns plus the five derived ones.

    Raises:
        KeyError: If any of the required source columns is missing.
    """
    df = df.copy()

    # 1. Activity Score
    df['activity_score'] = (df['hobby_engagement_level'] +
                            df['physical_activity_index'] +
                            df['creative_expression_index'])

    # 2. Support Score
    df['support_total'] = (df['support_environment_score'] +
                           df['external_guidance_usage'] +
                           df['upbringing_influence'])

    # 3. Efficiency Ratios
    # The +1.0 offset keeps the denominator away from zero for
    # non-negative inputs.
    df['efficiency'] = df['consistency_score'] / (df['focus_intensity'] + 1.0)
    df['focus_per_support'] = df['focus_intensity'] / (df['support_environment_score'] + 1.0)

    # 4. Age Norms
    # This ratio has no offset, so a zero age_group would produce +/-inf.
    # Mask zeros to NaN so the result is a missing value instead.
    age_group = df['age_group']
    df['consistency_per_age'] = df['consistency_score'] / age_group.where(age_group != 0)

    return df

print("Feature Engineering...")
# Apply the same derived-feature function to both frames, train first.
train_eng, test_eng = (create_features(frame) for frame in (train, test))

# ==========================================
# 3. PCA-GUIDED CLUSTERING (Crucial Step)
# ==========================================
print("Generating PCA-Cluster Features...")

# Stack train (minus its target column) on top of test so that the scaler,
# PCA and k-means are all fitted on both sets together.
n_train = len(train)
full_data = pd.concat(
    [train_eng.drop(columns='personality_cluster'), test_eng],
    axis=0,
    ignore_index=True,
)

# Columns used to learn the cluster structure.
cluster_cols = [
    'focus_intensity',
    'consistency_score',
    'efficiency',
    'activity_score',
    'support_total',
    'focus_per_support',
]
scaler = StandardScaler()
full_scaled = scaler.fit_transform(full_data[cluster_cols])

# A fractional n_components asks PCA for enough components to reach that
# share of explained variance (95% here).
pca = PCA(n_components=0.95, random_state=42)
full_pca = pca.fit_transform(full_scaled)

# Two k-means granularities over the same PCA projection.
kmeans_5 = KMeans(n_clusters=5, random_state=42, n_init=50)
kmeans_8 = KMeans(n_clusters=8, random_state=42, n_init=50)

for cluster_feature, km in (('cluster_pca_5', kmeans_5), ('cluster_pca_8', kmeans_8)):
    labels = km.fit_predict(full_pca)
    full_data[cluster_feature] = labels
    # Rows were concatenated train-first, so a positional split recovers
    # each frame's labels.
    train_eng[cluster_feature] = labels[:n_train]
    test_eng[cluster_feature] = labels[n_train:]

# ==========================================
# 4. PREPARE DATA FOR LIGHTGBM
# ==========================================
target_col = 'personality_cluster'
drop_cols = ['participant_id', target_col]

# Separate the label from the predictors; the id column is not a feature.
y = train_eng[target_col]
X = train_eng.drop(columns=drop_cols)
X_test = test_eng.drop(columns=['participant_id'])

# Map the target labels to consecutive integer codes.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Columns handed to LightGBM as pandas 'category' dtype so it can use its
# native categorical handling instead of one-hot encoding.
cat_cols = ['cultural_background', 'cluster_pca_5', 'cluster_pca_8', 'identity_code']
for frame in (X, X_test):
    for col in cat_cols:
        frame[col] = frame[col].astype('category')

# ==========================================
# 5. STRATIFIED LIGHTGBM ENSEMBLE
# ==========================================
print("Training LightGBM Ensemble...")

# Booster configuration shared by every fold.
lgbm_params = {
    # Task
    'objective': 'multiclass',
    'num_class': len(np.unique(y_encoded)),
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    # Tree shape / step size
    'learning_rate': 0.015,
    'num_leaves': 31,
    'max_depth': 8,
    'min_child_samples': 20,
    # Row and column subsampling (80% of features per tree, 80% of rows,
    # re-drawn every iteration)
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    # L1 / L2 regularisation
    'lambda_l1': 1.0,
    'lambda_l2': 10.0,
    # Runtime
    'n_jobs': -1,
    'verbose': -1,
    'seed': 42
}

# 10 stratified, shuffled folds.
n_folds = 10
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Class-probability accumulators: out-of-fold for train, fold-averaged for test.
n_classes = len(le.classes_)
oof_preds = np.zeros((len(X), n_classes))
test_preds = np.zeros((len(X_test), n_classes))

cv_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_encoded)):
    X_train, y_train = X.iloc[train_idx], y_encoded[train_idx]
    X_val, y_val = X.iloc[val_idx], y_encoded[val_idx]

    train_set = lgb.Dataset(X_train, label=y_train)
    valid_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

    # Up to 3000 rounds, with early stopping after 100 rounds without
    # improvement; period=0 silences the per-iteration evaluation log.
    model = lgb.train(
        params=lgbm_params,
        train_set=train_set,
        num_boost_round=3000,
        valid_sets=[train_set, valid_set],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=0),
        ],
    )

    # Out-of-fold probabilities for this fold's held-out rows.
    val_prob = model.predict(X_val)
    oof_preds[val_idx] = val_prob

    # Macro-F1 on the held-out rows, using the arg-max class.
    val_pred_labels = val_prob.argmax(axis=1)
    fold_score = f1_score(y_val, val_pred_labels, average='macro')
    cv_scores.append(fold_score)

    # Each fold contributes 1/n_folds of the final test probabilities.
    test_preds += model.predict(X_test) / n_folds

    print(f"Fold {fold+1}/{n_folds} | F1 Macro: {fold_score:.5f}")

print(f"\nAverage CV F1: {np.mean(cv_scores):.5f} +/- {np.std(cv_scores):.5f}")

# ==========================================
# 6. SUBMISSION
# ==========================================
# Pick the most probable class per test row and map the integer codes back
# to the original target labels.
final_pred_indices = test_preds.argmax(axis=1)
final_pred_labels = le.inverse_transform(final_pred_indices)

filename = 'submission_pure_lgbm.csv'
submission = pd.DataFrame(
    {
        'participant_id': test['participant_id'],
        'personality_cluster': final_pred_labels,
    }
)

# Write without the index so the file holds only the two columns above.
submission.to_csv(filename, index=False)
print(f"Submission saved to {filename}")

Feature Engineering...
Generating PCA-Cluster Features...
Training LightGBM Ensemble...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[473]	training's multi_logloss: 0.469909	valid_1's multi_logloss: 0.777312
Fold 1/10 | F1 Macro: 0.53454
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[362]	training's multi_logloss: 0.522641	valid_1's multi_logloss: 0.745753
Fold 2/10 | F1 Macro: 0.62167
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[402]	training's multi_logloss: 0.504518	valid_1's multi_logloss: 0.732621
Fold 3/10 | F1 Macro: 0.57009
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[272]	training's multi_logloss: 0.578302	valid_1's multi_logloss: 0.760933
Fold 4/10 | F1 Macro: 0.52144
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[434]	t