In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PowerTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile, f_classif
from catboost import CatBoostClassifier

# ==========================================
# 1. LOAD DATA
# ==========================================
# Input files are resolved relative to the current working directory.
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"

train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# ==========================================
# 2. FEATURE ENGINEERING
# ==========================================
def create_features(df):
    """Return a copy of ``df`` with engineered aggregate and ratio columns.

    Added columns:
        activity_score      -- hobby_engagement_level + physical_activity_index
                               + creative_expression_index
        support_total       -- support_environment_score
                               + external_guidance_usage + upbringing_influence
        efficiency          -- consistency_score / (focus_intensity + 1)
        focus_per_support   -- focus_intensity / (support_environment_score + 1)
        consistency_per_age -- consistency_score / age_group

    Args:
        df: DataFrame containing the source columns named above. It is not
            modified; all work happens on a copy.

    Returns:
        A new DataFrame with the original columns plus the five columns
        listed above. Ratio values that would be +/-inf (zero denominator)
        are returned as NaN.
    """
    df = df.copy()

    # 1. Activity Score
    df['activity_score'] = (df['hobby_engagement_level'] +
                            df['physical_activity_index'] +
                            df['creative_expression_index'])

    # 2. Support Score
    df['support_total'] = (df['support_environment_score'] +
                           df['external_guidance_usage'] +
                           df['upbringing_influence'])

    # 3. Efficiency Ratios (the +1.0 shifts the denominator away from zero
    #    for non-negative inputs)
    df['efficiency'] = df['consistency_score'] / (df['focus_intensity'] + 1.0)
    df['focus_per_support'] = df['focus_intensity'] / (df['support_environment_score'] + 1.0)

    # 4. Age Norms
    df['consistency_per_age'] = df['consistency_score'] / df['age_group']

    # A zero denominator (age_group == 0, or an input equal to -1 in the
    # shifted ratios) makes pandas emit +/-inf. scikit-learn estimators reject
    # infinite values, so turn them into NaN, which NaN-aware steps such as an
    # imputer can treat as ordinary missing data.
    ratio_cols = ['efficiency', 'focus_per_support', 'consistency_per_age']
    df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], np.nan)

    return df

print("Feature Engineering...")
# Apply the same feature recipe to both splits (train first, then test).
train_eng, test_eng = (create_features(frame) for frame in (train, test))

# ==========================================
# 3. PCA-GUIDED CLUSTERING
# ==========================================
print("Generating PCA-Cluster Features...")

# Stack train (minus the target) on top of test so the scaler, PCA and K-Means
# below are all fitted on one combined feature table.
train_features = train_eng.drop(columns='personality_cluster')
full_data = pd.concat([train_features, test_eng], axis=0, ignore_index=True)

cluster_cols = [
    'focus_intensity',
    'consistency_score',
    'efficiency',
    'activity_score',
    'support_total',
    'focus_per_support',
]

scaler = StandardScaler()
full_scaled = scaler.fit_transform(full_data[cluster_cols])

# A fractional n_components keeps enough components to explain 95% of variance.
pca = PCA(n_components=0.95, random_state=42)
full_pca = pca.fit_transform(full_scaled)

# Two clusterings of the PCA space at different granularities.
kmeans_5 = KMeans(n_clusters=5, random_state=42, n_init=50)
kmeans_8 = KMeans(n_clusters=8, random_state=42, n_init=50)

# Rows [0, n_train) of the stacked table are train, the remainder is test.
n_train = len(train)
for cluster_col, kmeans in (('cluster_pca_5', kmeans_5), ('cluster_pca_8', kmeans_8)):
    labels = kmeans.fit_predict(full_pca)
    full_data[cluster_col] = labels
    train_eng[cluster_col] = labels[:n_train]
    test_eng[cluster_col] = labels[n_train:]

# ==========================================
# 4. PREPROCESSING PIPELINE
# ==========================================
target_col = 'personality_cluster'

# Separate predictors from the target; the id column carries no signal.
X = train_eng.drop(columns=[target_col, 'participant_id'])
y = train_eng[target_col]
X_test = test_eng.drop(columns=['participant_id'])

# Integer-encode the target labels; `le` is kept to decode predictions later.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Columns treated as categories; every other column goes down the numeric path.
nominal_cols = ['cultural_background', 'cluster_pca_5', 'cluster_pca_8']
numeric_cols = X.columns[~X.columns.isin(nominal_cols)].tolist()

# Numeric path: median-impute, Yeo-Johnson transform to reduce skew, then keep
# the top 90% of features ranked by ANOVA F-score against the target.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('power', PowerTransformer(method='yeo-johnson')),
    ('selector', SelectPercentile(f_classif, percentile=90)),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical path: mode-impute, then dense one-hot encoding; categories not
# seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(categorical_steps)

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, nominal_cols),
])

print("Preprocessing Data...")
# Fit on train only (the selector needs y); test is transformed with the
# statistics learned from train.
X_processed = preprocessor.fit_transform(X, y)
X_test_processed = preprocessor.transform(X_test)

# ==========================================
# 5. CATBOOST ON PROCESSED DATA
# ==========================================
print("Training CatBoost on Processed Data...")

# Hyper-parameters gathered in one place so they are easy to read and tweak.
catboost_params = {
    'iterations': 2000,
    'learning_rate': 0.02,
    'depth': 7,                     # slightly deeper trees, since feature selection already trimmed noise
    'l2_leaf_reg': 3,               # lighter regularisation, since only the top 90% of features remain
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'random_seed': 42,
    'verbose': 200,                 # log every 200 iterations
    'allow_writing_files': False,
}
model = CatBoostClassifier(**catboost_params)

# Trains on the full processed training set; no validation split is held out.
model.fit(X_processed, y_encoded)

# ==========================================
# 6. SUBMISSION
# ==========================================
print("Generating Predictions...")

# Flatten the model output to 1-D before decoding it back to the original labels.
raw_predictions = model.predict(X_test_processed)
y_pred_encoded = raw_predictions.flatten()
y_pred_labels = le.inverse_transform(y_pred_encoded)

# Two-column submission: the test ids alongside the predicted cluster label.
submission = test[['participant_id']].assign(personality_cluster=y_pred_labels)

filename = 'submission_catboost_pipeline.csv'
submission.to_csv(filename, index=False)
print(f"Saved: {filename}")

Feature Engineering...
Generating PCA-Cluster Features...
Preprocessing Data...
Training CatBoost on Processed Data...
0:	learn: 0.6753790	total: 9.01ms	remaining: 18s
200:	learn: 0.7987454	total: 780ms	remaining: 6.98s
400:	learn: 0.8395191	total: 1.54s	remaining: 6.15s
600:	learn: 0.8672243	total: 2.32s	remaining: 5.39s
800:	learn: 0.8902248	total: 3.13s	remaining: 4.68s
1000:	learn: 0.9095661	total: 3.88s	remaining: 3.87s
1200:	learn: 0.9231573	total: 4.63s	remaining: 3.08s
1400:	learn: 0.9367486	total: 5.45s	remaining: 2.33s
1600:	learn: 0.9487716	total: 6.19s	remaining: 1.54s
1800:	learn: 0.9618400	total: 6.98s	remaining: 772ms
1999:	learn: 0.9702039	total: 7.73s	remaining: 0us
Generating Predictions...
Saved: submission_catboost_pipeline.csv
