In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# 1. Load Data
# Both CSV files are read via relative paths (resolved against the working directory).
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# 2. Feature Engineering
def create_features(df):
    """Return a new frame equal to ``df`` plus two engineered columns.

    Added columns:
        total_activity_score: hobby_engagement_level + physical_activity_index
            + creative_expression_index.
        support_x_guidance: support_environment_score * external_guidance_usage.

    The input frame is left unmodified.
    """
    # Combined activity metric (summed left to right).
    activity_total = (
        df['hobby_engagement_level']
        + df['physical_activity_index']
        + df['creative_expression_index']
    )
    # Interaction term: support system * external help.
    support_interaction = (
        df['support_environment_score'] * df['external_guidance_usage']
    )
    # assign() works on a copy, so the caller's frame is not touched.
    return df.assign(
        total_activity_score=activity_total,
        support_x_guidance=support_interaction,
    )

# Apply the same feature engineering to both splits so their columns match.
train, test = (create_features(split) for split in (train, test))

# 3. Prepare X and y
target_col = 'personality_cluster'
y = train[target_col]
# The ID column is dropped from both feature matrices; the label column is
# additionally dropped from the training matrix.
X = train.drop(['participant_id', target_col], axis='columns')
X_test = test.drop(['participant_id'], axis='columns')

# 4. Pipeline
# 'multi_class' is deliberately not passed (it triggered a warning); sklearn
# selects the multiclass handling on its own.
pipeline = Pipeline(steps=[
    # Standardize features before the linear model.
    ('scaler', StandardScaler()),
    (
        'logreg',
        LogisticRegression(
            random_state=42,
            class_weight='balanced',  # author's note: crucial for Macro F1
            max_iter=3000,            # high iteration cap to ensure convergence
            solver='lbfgs',
        ),
    ),
])

# 5. Grid Search
# Candidate values for C on the 'logreg' step. In scikit-learn C is the
# *inverse* regularization strength: smaller C means stronger regularization.
param_grid = {'logreg__C': [0.01, 0.1, 1, 10]}

# Each candidate is scored with macro-averaged F1 over 5 shuffled,
# stratified folds (fixed seed for repeatable splits), using all CPU cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    verbose=1,
    n_jobs=-1,
)

print("Training Logistic Regression...")
grid_search.fit(X, y)

# Report the best mean cross-validated score and the C value that produced it.
print(f"\nBest Local Macro F1: {grid_search.best_score_:.5f}")
print(f"Best Params: {grid_search.best_params_}")

# 6. Prediction & Submission
# Predict with the fitted search object (with the default refit=True this uses
# the pipeline refit on all training data with the best parameters).
test_preds = grid_search.predict(X_test)

# Build a fresh DataFrame keyed by the IDs from test.csv, one row per
# test participant.
submission = pd.DataFrame(
    data={
        'participant_id': test['participant_id'],
        'personality_cluster': test_preds,
    }
)

# index=False keeps the pandas row index out of the CSV.
submission.to_csv(path_or_buf='submission_pure_logreg.csv', index=False)
print("\nSuccess! Submission file created with correct number of rows.")
print(submission.head())

Training Logistic Regression...
Fitting 5 folds for each of 4 candidates, totalling 20 fits

Best Local Macro F1: 0.47706
Best Params: {'logreg__C': 1}

Success! Submission file created with correct number of rows.
   participant_id personality_cluster
0            1005           Cluster_E
1             197           Cluster_C
2            2343           Cluster_D
3            1709           Cluster_B
4             436           Cluster_E
