In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PowerTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

# ==========================================
# 1. LOAD DATA
# ==========================================
# Read both splits from CSV files in the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# ==========================================
# 2. FEATURE ENGINEERING
# ==========================================
def create_features(df):
    """Return a copy of ``df`` extended with engineered features.

    The input frame is left untouched. Five columns are added:

    * ``activity_score``: ``hobby_engagement_level + physical_activity_index
      + creative_expression_index``.
    * ``support_total``: ``support_environment_score + external_guidance_usage
      + upbringing_influence``.
    * ``efficiency``: ``consistency_score / (focus_intensity + 1)``.
    * ``focus_per_support``: ``focus_intensity / (support_environment_score + 1)``.
    * ``consistency_per_age``: ``consistency_score / age_group``. Rows whose
      ``age_group`` equals 0 yield NaN rather than +/-inf.

    Args:
        df: DataFrame holding the numeric source columns named above.

    Returns:
        A new DataFrame with the original columns plus the five features.

    Raises:
        KeyError: If one of the required source columns is missing.
    """
    df = df.copy()

    # 1. Activity Score
    df['activity_score'] = (df['hobby_engagement_level'] +
                            df['physical_activity_index'] +
                            df['creative_expression_index'])

    # 2. Support Score
    df['support_total'] = (df['support_environment_score'] +
                           df['external_guidance_usage'] +
                           df['upbringing_influence'])

    # 3. Efficiency Ratios (the +1.0 keeps a zero denominator from dividing by zero)
    df['efficiency'] = df['consistency_score'] / (df['focus_intensity'] + 1.0)
    df['focus_per_support'] = df['focus_intensity'] / (df['support_environment_score'] + 1.0)

    # 4. Age Norms
    # Mask a zero age_group to NaN before dividing: a plain division would
    # produce +/-inf, which scikit-learn's imputers/transformers reject,
    # whereas NaN is handled like any other missing value downstream.
    age = df['age_group']
    df['consistency_per_age'] = df['consistency_score'] / age.where(age != 0)

    return df

# Run the same feature engineering on the train split, then on the test split.
print("Feature Engineering...")
train_eng, test_eng = map(create_features, (train, test))

# ==========================================
# 3. PCA-GUIDED CLUSTERING
# ==========================================
# Adds two unsupervised "structure" features: K-Means labels computed on a
# standardised, PCA-reduced view of selected columns, fitted on the train and
# test rows together.
print("Generating Structure Features...")

cluster_cols = [
    'focus_intensity',
    'consistency_score',
    'efficiency',
    'activity_score',
    'support_total',
    'focus_per_support',
]

# Train rows (minus the target) are stacked first, test rows after them; the
# split at the end of this section relies on that ordering.
full_data = pd.concat(
    [train_eng.drop(columns='personality_cluster'), test_eng],
    axis=0,
    ignore_index=True,
)

scaler = StandardScaler()
full_scaled = scaler.fit_transform(full_data[cluster_cols])

# PCA to remove noise: keep enough components to explain 95% of the variance.
pca = PCA(n_components=0.95, random_state=42)
full_pca = pca.fit_transform(full_scaled)

# Two granularities on the same PCA space: 5 clusters (presumably chosen to
# match the number of target classes - confirm) and 8 clusters (sub-types).
kmeans_5, kmeans_8 = (
    KMeans(n_clusters=k, random_state=42, n_init=50) for k in (5, 8)
)
full_data['cluster_pca_5'] = kmeans_5.fit_predict(full_pca)
full_data['cluster_pca_8'] = kmeans_8.fit_predict(full_pca)

# Split back: the first len(train) labels belong to train, the rest to test.
n_train = len(train)
for cluster_col in ('cluster_pca_5', 'cluster_pca_8'):
    cluster_labels = full_data[cluster_col].to_numpy()
    train_eng[cluster_col] = cluster_labels[:n_train]
    test_eng[cluster_col] = cluster_labels[n_train:]

# ==========================================
# 4. PREPROCESSING
# ==========================================
target_col = 'personality_cluster'
y = train_eng[target_col]
X = train_eng.drop(columns=[target_col, 'participant_id'])
X_test = test_eng.drop(columns=['participant_id'])

# Columns handled as categories; every other column of X goes down the
# numeric branch (original column order is preserved).
nominal_cols = ['cultural_background', 'cluster_pca_5', 'cluster_pca_8']
numeric_cols = X.columns[~X.columns.isin(nominal_cols)].tolist()

# Numeric branch: median imputation, then a Yeo-Johnson power transform.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('power', PowerTransformer(method='yeo-johnson')),
])

# Categorical branch: most-frequent imputation, then dense one-hot encoding
# that ignores categories not seen during fitting.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, nominal_cols),
])

print("Preprocessing Data...")
# Fit on the training features only; the test features reuse the fitted state.
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test)

# Integer-encode the target labels; `le` is kept to decode predictions later.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# ==========================================
# 5. THE "QUAD-FOREST" STACK
# ==========================================
print("Initializing Forest Models...")

# Forests 1 & 2: random forests that differ only in their split criterion
# (Gini impurity vs. entropy / information gain).
rf_gini, rf_entropy = (
    RandomForestClassifier(
        n_estimators=800,
        criterion=split_criterion,
        max_depth=10,
        min_samples_leaf=4,
        class_weight='balanced',
        random_state=42,
    )
    for split_criterion in ('gini', 'entropy')
)

# Forests 3 & 4: extra-trees counterparts, again one per split criterion.
# They are allowed a larger max_depth than the random forests and are grown
# without bootstrap resampling.
et_gini, et_entropy = (
    ExtraTreesClassifier(
        n_estimators=800,
        criterion=split_criterion,
        max_depth=12,
        bootstrap=False,
        class_weight='balanced',
        random_state=42,
    )
    for split_criterion in ('gini', 'entropy')
)

# Meta-learner: a class-balanced logistic regression trained on the
# cross-validated outputs of the four forests.
meta_learner = LogisticRegression(class_weight='balanced', C=0.5, max_iter=2000)

stacking_clf = StackingClassifier(
    estimators=[
        ('rf_g', rf_gini),
        ('rf_e', rf_entropy),
        ('et_g', et_gini),
        ('et_e', et_entropy),
    ],
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
)

# ==========================================
# 6. TRAIN & PREDICT
# ==========================================
print("Training Forest Stack... (This combines 3200 trees)")
stacking_clf.fit(X_processed, y_encoded)

print("Generating Predictions...")
y_pred_encoded = stacking_clf.predict(X_test_processed)
# Decode the integer predictions back to the original target labels.
y_pred_labels = le.inverse_transform(y_pred_encoded)

# ==========================================
# 7. SUBMISSION
# ==========================================
filename = 'submission_forest_stack.csv'

# Two-column frame: the test ids (in file order) and the predicted label.
submission = test[['participant_id']].copy()
submission['personality_cluster'] = y_pred_labels

submission.to_csv(filename, index=False)
print(f"SUCCESS! Forest-Based Submission Saved: {filename}")
print(submission.head())

Feature Engineering...
Generating Structure Features...
Preprocessing Data...
Initializing Forest Models...
Training Forest Stack... (This combines 3200 trees)
Generating Predictions...
SUCCESS! Forest-Based Submission Saved: submission_forest_stack.csv
   participant_id personality_cluster
0            1005           Cluster_E
1             197           Cluster_B
2            2343           Cluster_D
3            1709           Cluster_A
4             436           Cluster_E
