In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PowerTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingClassifier

# ==========================================
# 1. LOAD DATA
# ==========================================
# Both CSVs are read from the current working directory; `train` carries
# the target column, `test` is the frame predictions are produced for.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# ==========================================
# 2. FEATURE ENGINEERING
# ==========================================
def create_features(df):
    """Return a copy of *df* with noise columns dropped and derived features added.

    The input frame is never mutated; all work happens on a copy.

    Added columns:
        activity_score: hobby_engagement_level + physical_activity_index
            + creative_expression_index.
        support_total: support_environment_score + external_guidance_usage
            + upbringing_influence.
        efficiency: consistency_score / (focus_intensity + 1.0).
        focus_per_support: focus_intensity / (support_environment_score + 1.0).

    Args:
        df: DataFrame containing the eight source columns named above.
            ``altruism_score`` and ``identity_code`` are removed when present.

    Returns:
        A new DataFrame with the engineered columns appended. Ratio values
        whose denominator is exactly zero are reported as NaN rather than
        +/-inf.

    Raises:
        KeyError: if any of the required source columns is missing.
    """
    df = df.copy()

    # 1. Drop Noise -- errors='ignore' keeps this usable on frames that do
    # not carry these columns.
    df = df.drop(columns=['altruism_score', 'identity_code'], errors='ignore')

    # 2. Activity Score
    df['activity_score'] = (df['hobby_engagement_level'] +
                            df['physical_activity_index'] +
                            df['creative_expression_index'])

    # 3. Support Score
    df['support_total'] = (df['support_environment_score'] +
                           df['external_guidance_usage'] +
                           df['upbringing_influence'])

    # 4. Efficiency Ratios
    # NOTE(review): the +1.0 offset presumably guards against a zero
    # denominator for non-negative scores -- confirm the value ranges.
    df['efficiency'] = df['consistency_score'] / (df['focus_intensity'] + 1.0)
    df['focus_per_support'] = df['focus_intensity'] / (df['support_environment_score'] + 1.0)

    # A source value of exactly -1.0 makes the denominator zero and pandas
    # yields +/-inf, which scikit-learn's imputers/transformers reject.
    # Mapping those to NaN lets downstream imputation treat them as missing.
    ratio_cols = ['efficiency', 'focus_per_support']
    df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], np.nan)

    return df

print("Feature Engineering...")
# Apply the identical feature recipe to both frames; create_features works
# on copies, so `train` and `test` themselves stay untouched.
train_eng, test_eng = map(create_features, (train, test))

# ==========================================
# 3. PCA-GUIDED CLUSTERING
# ==========================================
print("Generating Structure Features...")

# Stack train (without the target) on top of test so a single scaler, PCA
# and KMeans fit covers every row. With ignore_index=True the first
# len(train) positions are the train rows, the remainder the test rows.
n_train_rows = len(train)
full_data = pd.concat(
    [train_eng.drop(columns='personality_cluster'), test_eng],
    axis=0,
    ignore_index=True,
)

# Numeric columns the structure search is based on.
cluster_cols = [
    'focus_intensity',
    'consistency_score',
    'efficiency',
    'activity_score',
    'support_total',
    'focus_per_support',
]

# Standardise, then keep as many principal components as are needed to
# explain 95% of the variance.
scaler = StandardScaler()
full_scaled = scaler.fit_transform(full_data[cluster_cols])
pca = PCA(n_components=0.95, random_state=42)
full_pca = pca.fit_transform(full_scaled)

# Two KMeans views of the same PCA space: a 5-cluster one (labelled
# "target aligned" by the author) and a finer 8-cluster one ("sub-types").
kmeans_5 = KMeans(n_clusters=5, random_state=42, n_init=50)
kmeans_8 = KMeans(n_clusters=8, random_state=42, n_init=50)
full_data['cluster_pca_5'] = kmeans_5.fit_predict(full_pca)
full_data['cluster_pca_8'] = kmeans_8.fit_predict(full_pca)

# Split the labels back by position. Raw values (not Series) are assigned
# so the original indexes of train_eng / test_eng play no part.
for cluster_col in ('cluster_pca_5', 'cluster_pca_8'):
    train_eng[cluster_col] = full_data.iloc[:n_train_rows][cluster_col].values
for cluster_col in ('cluster_pca_5', 'cluster_pca_8'):
    test_eng[cluster_col] = full_data.iloc[n_train_rows:][cluster_col].values

# ==========================================
# 4. PREPROCESSING
# ==========================================
target_col = 'personality_cluster'

# The participant id is an identifier, not a feature, so it is removed from
# both design matrices; the target is removed from the training one only.
X = train_eng.drop(columns=[target_col, 'participant_id'])
y = train_eng[target_col]
X_test = test_eng.drop(columns=['participant_id'])

# Columns routed through one-hot encoding; every other column of X goes
# through the numeric pipeline.
nominal_cols = ['cultural_background', 'cluster_pca_5', 'cluster_pca_8']
numeric_cols = [name for name in X.columns if name not in nominal_cols]

# Numeric branch: median imputation followed by a Yeo-Johnson power transform.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('power', PowerTransformer(method='yeo-johnson')),
])

# Categorical branch: fill gaps with the mode, then dense one-hot encoding.
# handle_unknown='ignore' keeps transform() from failing on categories that
# were not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, nominal_cols),
])

print("Preprocessing Data...")
# Fit on the training features only, then reuse the fitted transformer on test.
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test)

# Encode the target labels as integers; `le` is kept to map predictions back.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# ==========================================
# 5. SINGLE OPTIMIZED GRADIENT BOOSTING MODEL
# ==========================================
print("Training Single Optimized XGBoost-style Model...")

# Note: despite the "XGBoost-style" wording, the estimator used here is
# scikit-learn's HistGradientBoostingClassifier.
hgb_params = {
    # Small step size, paired with a large iteration budget to compensate.
    'learning_rate': 0.01,
    'max_iter': 2000,
    # Tree-size limits: depth and leaf count per tree.
    'max_depth': 8,
    'max_leaf_nodes': 40,
    # L2 penalty chosen by the author to curb overfitting.
    'l2_regularization': 10.0,
    'class_weight': 'balanced',
    # Early stopping: hold out 10% of the training rows for validation and
    # stop once 50 consecutive iterations bring no improvement.
    'early_stopping': True,
    'validation_fraction': 0.1,
    'n_iter_no_change': 50,
    'random_state': 42,
}

model = HistGradientBoostingClassifier(**hgb_params)
model.fit(X_processed, y_encoded)

# ==========================================
# 6. SUBMISSION
# ==========================================
print("Generating Predictions...")

# Predict integer class codes, then map them back to the original labels
# with the encoder fitted on the training target.
y_pred_encoded = model.predict(X_test_processed)
y_pred_labels = le.inverse_transform(y_pred_encoded)

# Ids come from the raw `test` frame, because the feature matrix no longer
# carries the participant_id column.
submission_columns = {
    'participant_id': test['participant_id'],
    'personality_cluster': y_pred_labels,
}
submission = pd.DataFrame(submission_columns)

filename = 'submission_pure_xgboost_optimised.csv'
submission.to_csv(filename, index=False)
print(f"SUCCESS! Submission Saved: {filename}")

Feature Engineering...
Generating Structure Features...
Preprocessing Data...
Training Single Optimized XGBoost-style Model...
Generating Predictions...
SUCCESS! Submission Saved: submission_pure_xgboost_optimised.csv
