In [None]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, f_classif, VarianceThreshold
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# ==========================================
# 1. LOAD DATA
# ==========================================
# Both CSVs are read from the current working directory; `train` carries the
# target column, `test` is the frame predictions are produced for.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# ==========================================
# 2. FEATURE ENGINEERING
# ==========================================
def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with six engineered columns appended.

    The input frame is not modified. Added columns, in order:

    * ``activity_score``      - hobby + physical + creative indices.
    * ``support_total``       - support environment + guidance + upbringing.
    * ``efficiency``          - consistency / (focus + 1).
    * ``focus_per_support``   - focus / (support environment + 1).
    * ``consistency_per_age`` - consistency / age group.
    * ``support_x_guidance``  - support environment * guidance usage.

    Args:
        df: Frame containing the raw columns referenced above.

    Returns:
        A new frame with the original columns plus the engineered ones.

    Raises:
        KeyError: If any of the required raw columns is absent.
    """
    df = df.copy()

    # 1. Activity Score
    df['activity_score'] = (
        df['hobby_engagement_level']
        + df['physical_activity_index']
        + df['creative_expression_index']
    )

    # 2. Support Score
    df['support_total'] = (
        df['support_environment_score']
        + df['external_guidance_usage']
        + df['upbringing_influence']
    )

    # 3. Efficiency Ratios (the +1.0 keeps the denominators away from zero)
    df['efficiency'] = df['consistency_score'] / (df['focus_intensity'] + 1.0)
    df['focus_per_support'] = (
        df['focus_intensity'] / (df['support_environment_score'] + 1.0)
    )

    # 4. Age Norms
    # Unlike the ratios above, this denominator has no offset, so a zero
    # age_group would yield +/-inf. Mask zeros to NaN so the result is treated
    # as a missing value instead of an infinity.
    age_group = df['age_group']
    df['consistency_per_age'] = (
        df['consistency_score'] / age_group.where(age_group != 0)
    )

    # 5. Support-Guidance Interaction
    df['support_x_guidance'] = (
        df['support_environment_score'] * df['external_guidance_usage']
    )

    return df

print("Feature Engineering...")
# Apply the identical feature recipe to the train and test frames.
train_eng, test_eng = (create_features(frame) for frame in (train, test))

# ==========================================
# 3. PCA-GUIDED CLUSTERING
# ==========================================
print("Generating PCA-Cluster Features...")

# Stack train (minus the target) on top of test so the scaler, PCA and k-means
# models below are fitted on every available row.
# NOTE(review): this is a transductive fit -- test rows shape the cluster
# features, and the clusters are computed before any cross-validation split.
# Confirm that this is intended.
n_train = len(train)
full_data = pd.concat(
    [train_eng.drop(columns='personality_cluster'), test_eng],
    axis=0,
    ignore_index=True,
)

# Columns that span the space in which clusters are searched for.
cluster_cols = [
    'focus_intensity',
    'consistency_score',
    'efficiency',
    'activity_score',
    'support_total',
    'focus_per_support',
    'support_x_guidance',
]

# Standardise, then keep as many principal components as needed to retain
# 95% of the variance.
scaler = StandardScaler()
full_scaled = scaler.fit_transform(full_data[cluster_cols])
pca = PCA(n_components=0.95, random_state=42)
full_pca = pca.fit_transform(full_scaled)

# Two k-means partitions of the PCA space: a coarse one (k=5) and a finer
# one (k=8).
kmeans_5 = KMeans(n_clusters=5, random_state=42, n_init=50)
kmeans_8 = KMeans(n_clusters=8, random_state=42, n_init=50)
for cluster_col, km_model in (
    ('cluster_pca_5', kmeans_5),
    ('cluster_pca_8', kmeans_8),
):
    full_data[cluster_col] = km_model.fit_predict(full_pca)

# Hand the labels back: the first n_train rows of full_data are the training
# rows, the remainder are the test rows (concat order above).
for cluster_col in ('cluster_pca_5', 'cluster_pca_8'):
    cluster_labels = full_data[cluster_col].to_numpy()
    train_eng[cluster_col] = cluster_labels[:n_train]
    test_eng[cluster_col] = cluster_labels[n_train:]

# ==========================================
# 4. PREPROCESSING PIPELINE
# ==========================================
target_col = 'personality_cluster'
y = train_eng[target_col]
X = train_eng.drop(columns=[target_col, 'participant_id'])
X_test = test_eng.drop(columns=['participant_id'])

# These columns are one-hot encoded; every other column of X is routed
# through the numeric branch.
nominal_cols = ['cultural_background', 'cluster_pca_5', 'cluster_pca_8']
numeric_cols = [col_name for col_name in X.columns if col_name not in nominal_cols]

# Numeric branch: median-impute, then a Yeo-Johnson power transform to pull
# each feature towards a Gaussian shape, which is what GaussianNB assumes.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('power', PowerTransformer(method='yeo-johnson')),
])

# Categorical branch: fill gaps with the mode, then dense one-hot encoding.
# Categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, nominal_cols),
])

# Integer-encode the target; `le` is kept so predictions can be mapped back
# to the original labels.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# ==========================================
# 5. OPTIMIZED BAYES PIPELINE
# ==========================================
bayes_steps = [
    ('preprocessor', preprocessor),
    # Drop zero-variance (constant) columns before scoring features.
    ('var_filter', VarianceThreshold(threshold=0.0)),
    # Keep only the top-scoring share of features by ANOVA F-value; the
    # share itself is tuned below.
    ('selector', SelectPercentile(f_classif)),
    ('gnb', GaussianNB()),
]
bayes_pipeline = Pipeline(steps=bayes_steps)

# Search space: 6 feature-subset sizes x 20 smoothing values spaced
# logarithmically from 1 down to 1e-9.
param_grid = {
    'selector__percentile': [20, 30, 40, 60, 80, 100],
    'gnb__var_smoothing': np.logspace(0, -9, num=20),
}

print("Training Optimized Bayes Method (GridSearch)...")
grid_search = GridSearchCV(
    estimator=bayes_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X, y_encoded)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV Score: {grid_search.best_score_}")

# ==========================================
# 6. SUBMISSION
# ==========================================
print("Generating Predictions...")
best_model = grid_search.best_estimator_

# Predict encoded classes, then map them back to the original label strings.
y_pred_encoded = best_model.predict(X_test)
y_pred_labels = le.inverse_transform(y_pred_encoded)

# Pair each test participant id with its predicted cluster label.
submission = test[['participant_id']].assign(personality_cluster=y_pred_labels)

filename = 'submission_bayes_optimized.csv'
submission.to_csv(filename, index=False)
print(f"SUCCESS! Submission Saved: {filename}")
print(submission.head())

Feature Engineering...
Generating PCA-Cluster Features...
Training Optimized Bayes Method (GridSearch)...
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Best Parameters: {'gnb__var_smoothing': np.float64(0.3359818286283782), 'selector__percentile': 60}
Best CV Score: 0.5105517040297632
Generating Predictions...
SUCCESS! Submission Saved: submission_bayes_optimized.csv
   participant_id personality_cluster
0            1005           Cluster_E
1             197           Cluster_C
2            2343           Cluster_E
3            1709           Cluster_B
4             436           Cluster_E
