In [None]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# ==========================================
# 1. LOAD DATA
# ==========================================
# Read the labelled and unlabelled splits from the working directory.
train, test = map(pd.read_csv, ("train.csv", "test.csv"))

# ==========================================
# 2. HYBRID FEATURE ENGINEERING
# ==========================================
def _safe_ratio(numerator, denominator):
    """Divide two Series, yielding NaN wherever the denominator is zero."""
    # mask() blanks the zero entries, so x / 0 becomes NaN instead of +/-inf.
    return numerator / denominator.mask(denominator == 0)


def create_features(df):
    """Return a copy of ``df`` with five derived feature columns appended.

    The input frame is left untouched. Columns added:

    * ``total_activity_score`` -- ``hobby_engagement_level`` +
      ``physical_activity_index`` + ``creative_expression_index``.
    * ``support_x_guidance`` -- ``support_environment_score`` *
      ``external_guidance_usage``.
    * ``efficiency`` -- ``consistency_score / (focus_intensity + 1.0)``.
    * ``support_total`` -- ``support_environment_score`` +
      ``external_guidance_usage`` + ``upbringing_influence``.
    * ``consistency_per_age`` -- ``consistency_score / age_group``.

    For the two ratio features a zero denominator produces NaN (i.e. the
    value is reported as missing) rather than an infinite value.

    Args:
        df: DataFrame containing the numeric source columns listed above.

    Returns:
        A new DataFrame holding the original columns plus the derived ones.

    Raises:
        KeyError: If any of the required source columns is absent.
    """
    df = df.copy()

    # 1. Total Activity Score
    df['total_activity_score'] = (df['hobby_engagement_level'] +
                                  df['physical_activity_index'] +
                                  df['creative_expression_index'])

    # 2. Support-Guidance Interaction
    df['support_x_guidance'] = df['support_environment_score'] * df['external_guidance_usage']

    # 3. Efficiency (Consistency vs Intensity); undefined when focus_intensity == -1
    df['efficiency'] = _safe_ratio(df['consistency_score'], df['focus_intensity'] + 1.0)

    # 4. Support Total
    df['support_total'] = (df['support_environment_score'] +
                           df['external_guidance_usage'] +
                           df['upbringing_influence'])

    # 5. Consistency relative to age group; undefined when age_group == 0
    df['consistency_per_age'] = _safe_ratio(df['consistency_score'], df['age_group'])

    return df

print("Engineering Features...")
# Apply the same feature recipe to both splits.
train_eng, test_eng = (create_features(frame) for frame in (train, test))

# ==========================================
# 3. GMM "SOFT CLUSTERING"
# ==========================================
print("Fitting Gaussian Mixture Models...")

# Stack train (without its label) on top of test so the transforms below
# are fitted on every available row.
full_data = pd.concat(
    [train_eng.drop(columns='personality_cluster'), test_eng],
    axis=0,
    ignore_index=True,
)

# Columns that span the "Personality Space" the mixtures are fitted in
gmm_cols = [
    'focus_intensity',
    'consistency_score',
    'efficiency',
    'total_activity_score',
    'support_total',
    'support_x_guidance',
]

# Yeo-Johnson power transform to make the inputs more Gaussian-like
pt = PowerTransformer(method='yeo-johnson')
full_scaled = pt.fit_transform(full_data[gmm_cols])

# PCA keeping enough components to explain 95% of the variance
pca = PCA(n_components=0.95, random_state=42)
full_pca = pca.fit_transform(full_scaled)

# Settings shared by both mixtures
gmm_kwargs = dict(covariance_type='full', random_state=42, n_init=5)

# Coarse mixture: 5 membership probabilities per row instead of one hard label
gmm_5 = GaussianMixture(n_components=5, **gmm_kwargs).fit(full_pca)
probs_5 = gmm_5.predict_proba(full_pca)
full_data = full_data.assign(
    **{f'gmm_5_prob_{i}': probs_5[:, i] for i in range(5)}
)

# Finer mixture: 10 membership probabilities per row
gmm_10 = GaussianMixture(n_components=10, **gmm_kwargs).fit(full_pca)
probs_10 = gmm_10.predict_proba(full_pca)
full_data = full_data.assign(
    **{f'gmm_10_prob_{i}': probs_10[:, i] for i in range(10)}
)

# Undo the stacking: the first len(train) rows are train, the rest are test
n_train = len(train)
train_processed = full_data.iloc[:n_train].copy()
test_processed = full_data.iloc[n_train:].copy()

# Re-attach the label that was dropped before stacking
train_processed['personality_cluster'] = train['personality_cluster']

# ==========================================
# 4. PREPARE DATA
# ==========================================
target_col = 'personality_cluster'
drop_cols = ['participant_id', target_col]

# Feature matrix / label vector for training, feature matrix for inference
y = train_processed[target_col]
X = train_processed.drop(columns=drop_cols)
X_test = test_processed.drop(columns=['participant_id'])

# Encode the class labels as integers; `le` maps predictions back later
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Positions of the categorical columns ('cultural_background', 'identity_code')
# within X, skipping any that are not present
categorical_cols = ['cultural_background', 'identity_code']
cat_indices = [
    X.columns.get_loc(col) for col in categorical_cols if col in X.columns
]

# ==========================================
# 5. TRAIN MODEL (Using GMM Features)
# ==========================================
print("Training GMM-Enhanced Classifier...")

# Histogram-based gradient boosting; the soft-cluster probabilities are
# ordinary numeric inputs alongside the other features.
hgb_params = {
    'learning_rate': 0.02,
    'max_iter': 1000,
    'max_depth': 8,
    'l2_regularization': 5.0,
    'categorical_features': cat_indices,
    'class_weight': 'balanced',
    'early_stopping': True,
    'random_state': 42,
}
model = HistGradientBoostingClassifier(**hgb_params)

# Stratified 10-fold cross-validation, scored with macro-averaged F1
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y_encoded)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

    # fit() returns the estimator itself, so predict can be chained
    val_preds = model.fit(X_train, y_train).predict(X_val)

    score = f1_score(y_val, val_preds, average='macro')
    scores.append(score)
    print(f"Fold {fold+1} F1: {score:.4f}")

print(f"\nAverage F1 Macro: {np.mean(scores):.4f}")

# ==========================================
# 6. SUBMISSION
# ==========================================
# Refit on every training row, then predict the test set
final_preds = model.fit(X, y_encoded).predict(X_test)
# Map the integer predictions back to the original class labels
final_labels = le.inverse_transform(final_preds)

# Two-column frame: id taken from the raw test file, predicted label beside it
submission = test[['participant_id']].assign(personality_cluster=final_labels)

filename = 'submission_gmm_soft_clusters.csv'
submission.to_csv(filename, index=False)
print(f"SUCCESS! Saved: {filename}")

Engineering Features...
Fitting Gaussian Mixture Models...
Training GMM-Enhanced Classifier...
Fold 1 F1: 0.5788
Fold 2 F1: 0.5738
Fold 3 F1: 0.5945
Fold 4 F1: 0.5834
Fold 5 F1: 0.5333
Fold 6 F1: 0.6543
Fold 7 F1: 0.5724
Fold 8 F1: 0.5562
Fold 9 F1: 0.4966
Fold 10 F1: 0.5643

Average F1 Macro: 0.5707
SUCCESS! Saved: submission_gmm_soft_clusters.csv
