In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PowerTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Models: gradient boosting, voting ensemble, logistic regression
from sklearn.ensemble import HistGradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

# ==========================================
# 1. LOAD DATA
# ==========================================
# Read the training and test tables from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# ==========================================
# 2. FEATURE ENGINEERING
# ==========================================
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` with infinite results mapped to NaN.

    pandas does not raise on division by zero; it produces +/-inf instead.
    NaN is something a downstream imputer can fill, whereas inf typically
    is rejected by estimator input validation, so inf is converted to NaN.
    """
    ratio = numerator / denominator
    return ratio.replace([np.inf, -np.inf], np.nan)


def create_features(df):
    """Return a copy of *df* with derived feature columns added.

    The input frame is not modified. The columns ``altruism_score`` and
    ``identity_code`` are removed when present (silently skipped otherwise).

    Added columns:
        activity_score      -- hobby_engagement_level + physical_activity_index
                               + creative_expression_index
        support_total       -- support_environment_score
                               + external_guidance_usage + upbringing_influence
        efficiency          -- consistency_score / (focus_intensity + 1.0)
        focus_per_support   -- focus_intensity / (support_environment_score + 1.0)
        consistency_per_age -- consistency_score / age_group

    Ratio columns hold NaN (never +/-inf) where the denominator is zero.

    Raises:
        KeyError: if any of the source columns listed above is missing.
    """
    df = df.copy()

    # 1. Drop columns the original author reported as having near-zero
    #    importance in previous runs; errors='ignore' tolerates their absence.
    df = df.drop(columns=['altruism_score', 'identity_code'], errors='ignore')

    # 2. Activity Score
    df['activity_score'] = (df['hobby_engagement_level'] +
                            df['physical_activity_index'] +
                            df['creative_expression_index'])

    # 3. Support Score
    df['support_total'] = (df['support_environment_score'] +
                           df['external_guidance_usage'] +
                           df['upbringing_influence'])

    # 4. Efficiency Ratios (the +1.0 offset shifts the denominator; a value
    #    of exactly -1 in the source column would still give a zero divisor,
    #    which _safe_ratio turns into NaN).
    df['efficiency'] = _safe_ratio(df['consistency_score'],
                                   df['focus_intensity'] + 1.0)
    df['focus_per_support'] = _safe_ratio(df['focus_intensity'],
                                          df['support_environment_score'] + 1.0)

    # 5. Age Norms: no offset here, so age_group == 0 is a zero divisor.
    df['consistency_per_age'] = _safe_ratio(df['consistency_score'],
                                            df['age_group'])

    return df

print("Feature Engineering & Cleaning...")
# Apply the same feature function to both tables, train first.
train_eng, test_eng = map(create_features, (train, test))

# ==========================================
# 3. PCA-GUIDED CLUSTERING (The Secret Sauce)
# ==========================================
print("Generating Structure Features...")

# Stack train (label column removed) above test so that the scaler, PCA and
# KMeans below are fitted on the combined rows. ignore_index gives a fresh
# 0..n-1 index with the train rows first.
full_data = pd.concat(
    [train_eng.drop(columns='personality_cluster'), test_eng],
    axis=0,
    ignore_index=True,
)

# Columns fed into the scale -> PCA -> KMeans chain.
cluster_cols = [
    'focus_intensity',
    'consistency_score',
    'efficiency',
    'activity_score',
    'support_total',
    'focus_per_support',
]

# Standardise the selected columns before projecting them.
scaler = StandardScaler()
full_scaled = scaler.fit_transform(full_data[cluster_cols])

# Keep as many principal components as are needed for 95% explained variance.
pca = PCA(n_components=0.95, random_state=42)
full_pca = pca.fit_transform(full_scaled)

# Two KMeans labelings of the PCA projection: one with 5 clusters, one with 8.
kmeans_5 = KMeans(n_clusters=5, random_state=42, n_init=50)
kmeans_8 = KMeans(n_clusters=8, random_state=42, n_init=50)
for _label_col, _kmeans in (('cluster_pca_5', kmeans_5), ('cluster_pca_8', kmeans_8)):
    full_data[_label_col] = _kmeans.fit_predict(full_pca)

# Hand the labels back: the first len(train) rows belong to train, the rest to test.
_n_train_rows = len(train)
for _label_col in ('cluster_pca_5', 'cluster_pca_8'):
    _labels = full_data[_label_col].values
    train_eng[_label_col] = _labels[:_n_train_rows]
    test_eng[_label_col] = _labels[_n_train_rows:]

# ==========================================
# 4. PREPROCESSING
# ==========================================
target_col = 'personality_cluster'

# The label is kept separately; the feature matrices exclude both the label
# and the row identifier.
y = train_eng[target_col]
X = train_eng.drop(columns=[target_col, 'participant_id'])
X_test = test_eng.drop(columns=['participant_id'])

# Columns treated as categories; every other column of X is treated as numeric.
nominal_cols = ['cultural_background', 'cluster_pca_5', 'cluster_pca_8']
numeric_cols = X.columns[~X.columns.isin(nominal_cols)].tolist()

# Numeric branch: fill gaps with the column median, then apply a Yeo-Johnson
# power transform.
# NOTE(review): the original author considered this transform important for
# the boosting models' handling of outliers -- confirm it actually helps.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('power', PowerTransformer(method='yeo-johnson')),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen at fit time are ignored.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, nominal_cols),
])

print("Preprocessing Data...")
# Fit on the training features only, then reuse the fitted transform on test.
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test)

# Map the label values to integer codes; `le` is used later to map back.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# ==========================================
# 5. THE "TRIPLE-BOOST" ENSEMBLE
# ==========================================
print("Initializing Optimized Boosting Models...")

# Settings shared by all three gradient-boosting members.
_hgb_common = {'class_weight': 'balanced', 'random_state': 42}

# Model A, "The Tank": the strongest L2 penalty of the three (10.0).
hgb_tank = HistGradientBoostingClassifier(
    learning_rate=0.03,
    max_iter=1000,
    max_depth=6,
    l2_regularization=10.0,
    **_hgb_common,
)

# Model B, "The Sniper": the smallest learning rate, most iterations and
# deepest trees of the three.
hgb_sniper = HistGradientBoostingClassifier(
    learning_rate=0.01,
    max_iter=2000,
    max_depth=10,
    l2_regularization=5.0,
    **_hgb_common,
)

# Model C, "The Scout": the largest learning rate, fewest iterations and
# shallowest trees of the three.
hgb_scout = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_iter=800,
    max_depth=4,
    l2_regularization=5.0,
    **_hgb_common,
)

# Soft voting over the three members, weighted 2 : 3 : 1 (tank : sniper :
# scout), so the sniper model carries the most weight.
voting_clf = VotingClassifier(
    estimators=list(zip(
        ('tank', 'sniper', 'scout'),
        (hgb_tank, hgb_sniper, hgb_scout),
    )),
    voting='soft',
    weights=[2, 3, 1],
)

# ==========================================
# 6. TRAIN & PREDICT
# ==========================================
print("Training Boosting Ensemble...")
# Fit the voting ensemble on the preprocessed training matrix and integer labels.
voting_clf.fit(X_processed, y=y_encoded)

print("Generating Predictions...")
# Predict integer class codes for the test matrix ...
y_pred_encoded = voting_clf.predict(X_test_processed)
# ... and translate them back to the original label values.
y_pred_labels = le.inverse_transform(y=y_pred_encoded)

# ==========================================
# 7. SUBMISSION
# ==========================================
# Pair each test participant id with its predicted label (same row order).
submission = test[['participant_id']].assign(personality_cluster=y_pred_labels)

# NOTE: the file name and message mention XGBoost, but the ensemble trained in
# this script is built from scikit-learn HistGradientBoostingClassifier models.
filename = 'submission_optimized_xgboost.csv'
submission.to_csv(filename, index=False)
print(f"SUCCESS! Optimized XGBoost-style Submission Saved: {filename}")
print(submission.head(5))

Feature Engineering & Cleaning...
Generating Structure Features...
Preprocessing Data...
Initializing Optimized Boosting Models...
Training Boosting Ensemble...
Generating Predictions...
SUCCESS! Optimized XGBoost-style Submission Saved: submission_optimized_xgboost.csv
   participant_id personality_cluster
0            1005           Cluster_E
1             197           Cluster_C
2            2343           Cluster_E
3            1709           Cluster_B
4             436           Cluster_E
