In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1. Load Data
# Read the training split first, then the test split, from CSV files in the
# current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# 2. Apply Feature Engineering
def create_features(df):
    """Return a copy of *df* with two engineered columns added.

    Added columns:
      - ``total_activity_score``: element-wise sum of
        ``hobby_engagement_level``, ``physical_activity_index`` and
        ``creative_expression_index``.
      - ``support_x_guidance``: element-wise product of
        ``support_environment_score`` and ``external_guidance_usage``.

    The input frame is left unmodified. A missing source column raises
    ``KeyError``; missing values in a source column propagate to the
    derived column.
    """
    activity_total = (
        df['hobby_engagement_level']
        + df['physical_activity_index']
        + df['creative_expression_index']
    )
    support_guidance = (
        df['support_environment_score'] * df['external_guidance_usage']
    )
    # assign() works on a copy, so the caller's frame is never mutated.
    return df.assign(
        total_activity_score=activity_total,
        support_x_guidance=support_guidance,
    )

# Add the engineered columns to both splits.
train, test = create_features(train), create_features(test)

# 3. Prepare X and y
# 'personality_cluster' is the prediction target. 'participant_id' is an
# identifier, so it is removed from the model inputs of both splits.
y = train['personality_cluster']
X = train.drop(['participant_id', 'personality_cluster'], axis=1)
X_test = test.drop('participant_id', axis=1)

# 4. Define Preprocessing
# The two columns listed in cat_cols are treated as nominal and one-hot
# encoded; every other column of X is forwarded unchanged.
cat_cols = ['cultural_background', 'upbringing_influence']
num_cols = X.columns[~X.columns.isin(cat_cols)].tolist()

# ColumnTransformer:
# - 'cat': OneHotEncoder producing a dense array. handle_unknown='ignore'
#   keeps transform from failing on categories not seen during fit.
# - 'num': 'passthrough' leaves the remaining columns as they are.
preprocessor = ColumnTransformer(
    [
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols),
    ],
    verbose_feature_names_out=False,
)

# 5. Build Pipeline
# KNN is distance-based, so features are standardized before the classifier.
# NOTE(review): the scaler runs on the full ColumnTransformer output, so the
# one-hot columns are standardized too -- confirm this is intended.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

# 6. Grid Search for Optimization
# Tuned KNN hyper-parameters:
# - n_neighbors: odd k from 5 through 21, plus 25 and 30
# - weights: uniform vs distance-weighted voting
# - p: Minkowski power (1 = Manhattan / L1, 2 = Euclidean / L2)
param_grid = {
    'knn__n_neighbors': [*range(5, 22, 2), 25, 30],
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2],
}

# Shuffled, stratified 5-fold split with a fixed seed so the folds are
# reproducible and each fold keeps the class proportions of y.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over param_grid, scored by macro-averaged F1 and run on
# all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

# 7. Train
# Fit every candidate in the grid with cross-validation on the training data.
# NOTE(review): the recorded output of this cell lists nan among the
# per-candidate scores; the cause is not visible here -- investigate.
print("Training KNN Grid Search...")
grid.fit(X, y)

print(f"Best CV F1 Macro Score: {grid.best_score_:.4f}")
print(f"Best Parameters: {grid.best_params_}")

# 8. Predict and Create Submission
# grid.predict uses the best estimator found by the search.
preds = grid.predict(X_test)

# Pair each test participant_id with its predicted cluster.
submission = test[['participant_id']].assign(personality_cluster=preds)

# Save submission
submission.to_csv('submission_knn.csv', index=False)
print("Submission saved to 'submission_knn.csv'")
print(submission.head())

Training KNN Grid Search...
Fitting 5 folds for each of 44 candidates, totalling 220 fits


 0.31144835 0.31368438        nan 0.31946759 0.3104431  0.30486449
        nan 0.31822433 0.29888503 0.29359807        nan 0.3046277
 0.28580011 0.28125455        nan 0.30259437 0.29855377 0.27280482
        nan 0.29076062 0.27660678 0.26966165        nan 0.27175643
 0.27844405 0.26961577        nan 0.26561029 0.26114327 0.26136687
        nan 0.25414495 0.24707093 0.24920874        nan 0.24542583
 0.22993726 0.23052991]


Best CV F1 Macro Score: 0.3221
Best Parameters: {'knn__n_neighbors': 7, 'knn__p': 1, 'knn__weights': 'distance'}
Submission saved to 'submission_knn.csv'
   participant_id personality_cluster
0            1005           Cluster_E
1             197           Cluster_E
2            2343           Cluster_E
3            1709           Cluster_E
4             436           Cluster_E
