In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PowerTransformer, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.linear_model import LogisticRegressionCV

# ==========================================
# 1. LOAD DATA
# ==========================================
# Read the training and test tables from CSV files in the working directory,
# in that order.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# ==========================================
# 2. FEATURE ENGINEERING
# ==========================================
def create_features(df):
    """Return a copy of ``df`` with six engineered feature columns appended.

    The columns are added in this order: ``activity_score``,
    ``support_total``, ``efficiency``, ``focus_per_support``,
    ``consistency_per_age`` and ``support_x_guidance``. The input frame is
    left untouched because all work happens on a copy.

    Ratio features convert a division by zero (which pandas reports as
    +/-inf) into NaN. scikit-learn's input validation rejects infinite
    values, whereas NaN can be filled by an imputer.

    Args:
        df: DataFrame containing the raw columns read below
            (``hobby_engagement_level``, ``physical_activity_index``,
            ``creative_expression_index``, ``support_environment_score``,
            ``external_guidance_usage``, ``upbringing_influence``,
            ``consistency_score``, ``focus_intensity``, ``age_group``).

    Returns:
        A new DataFrame holding the original columns plus the engineered ones.

    Raises:
        KeyError: If any of the raw columns listed above is missing.
    """
    df = df.copy()

    def _safe_ratio(numerator, denominator):
        # x / 0 evaluates to +/-inf in pandas; turn that into a missing value.
        return (numerator / denominator).replace([np.inf, -np.inf], np.nan)

    # 1. Activity Score: sum of the three activity-related columns.
    df['activity_score'] = (df['hobby_engagement_level'] +
                            df['physical_activity_index'] +
                            df['creative_expression_index'])

    # 2. Support Score: sum of the three support-related columns.
    df['support_total'] = (df['support_environment_score'] +
                           df['external_guidance_usage'] +
                           df['upbringing_influence'])

    # 3. Efficiency Ratios: the denominator is shifted by 1.0 so that a zero
    #    in the raw column does not divide by zero.
    df['efficiency'] = _safe_ratio(df['consistency_score'],
                                   df['focus_intensity'] + 1.0)
    df['focus_per_support'] = _safe_ratio(df['focus_intensity'],
                                          df['support_environment_score'] + 1.0)

    # 4. Age Norms: the denominator is the raw age_group, so a zero value
    #    would otherwise yield inf.
    df['consistency_per_age'] = _safe_ratio(df['consistency_score'],
                                            df['age_group'])

    # 5. Simple Interaction
    df['support_x_guidance'] = (df['support_environment_score'] *
                                df['external_guidance_usage'])

    return df

print("Feature Engineering...")
# Derive the same engineered columns for the train frame first, then the
# test frame.
train_eng, test_eng = map(create_features, (train, test))

# ==========================================
# 3. PCA-GUIDED CLUSTERING
# ==========================================
print("Generating PCA-Cluster Features...")

# Stack the train features (label column removed) on top of the test features
# so the scaler, PCA and k-means models below are fitted on both splits at once.
n_train = len(train)
full_data = pd.concat(
    [train_eng.drop(columns='personality_cluster'), test_eng],
    axis=0,
    ignore_index=True,
)

# Columns that drive the clustering.
cluster_cols = [
    'focus_intensity',
    'consistency_score',
    'efficiency',
    'activity_score',
    'support_total',
    'focus_per_support',
    'support_x_guidance',
]

# Standardize, then project onto the components explaining 95% of the variance.
scaler = StandardScaler()
full_scaled = scaler.fit_transform(full_data[cluster_cols])

pca = PCA(n_components=0.95, random_state=42)
full_pca = pca.fit_transform(full_scaled)

# Two k-means granularities over the same PCA projection.
kmeans_5 = KMeans(n_clusters=5, random_state=42, n_init=50)
kmeans_8 = KMeans(n_clusters=8, random_state=42, n_init=50)

for cluster_col, km_model in (('cluster_pca_5', kmeans_5),
                              ('cluster_pca_8', kmeans_8)):
    cluster_ids = km_model.fit_predict(full_pca)
    full_data[cluster_col] = cluster_ids
    # Rows [0, n_train) came from train; the remainder came from test.
    train_eng[cluster_col] = cluster_ids[:n_train]
    test_eng[cluster_col] = cluster_ids[n_train:]

# ==========================================
# 4. PREPROCESSING PIPELINE
# ==========================================
target_col = 'personality_cluster'

# Feature matrices: the identifier is dropped from both splits, and the label
# is additionally dropped from the training split.
y = train_eng[target_col]
X = train_eng.drop(columns=[target_col, 'participant_id'])
X_test = test_eng.drop(columns=['participant_id'])

# Columns to one-hot encode; every other column of X goes down the numeric branch.
nominal_cols = ['cultural_background', 'cluster_pca_5', 'cluster_pca_8']
numeric_cols = X.columns[~X.columns.isin(nominal_cols)].tolist()

# Numeric branch: median imputation -> Yeo-Johnson transform -> degree-2
# polynomial expansion -> standardization -> keep the top 80% of features
# ranked by the ANOVA F-score.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('power', PowerTransformer(method='yeo-johnson')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('selector', SelectPercentile(f_classif, percentile=80)),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, nominal_cols),
])

# Map the target labels to integer codes; `le` is kept to decode predictions.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# ==========================================
# 5. PURE LOGISTIC REGRESSION
# ==========================================
print("Initializing Fixed Logistic Regression...")

# Cross-validated logistic regression settings:
#   - 'lbfgs' solver with an L2 penalty
#   - max_iter raised to 10000 so the solver does not emit convergence warnings
#   - regularization strength chosen from 10 candidates over 5 folds,
#     scored by macro-averaged F1, with balanced class weights
logreg_settings = {
    'Cs': 10,
    'cv': 5,
    'penalty': 'l2',
    'solver': 'lbfgs',
    'scoring': 'f1_macro',
    'class_weight': 'balanced',
    'max_iter': 10000,
    'random_state': 42,
    'n_jobs': -1,
}
clf = LogisticRegressionCV(**logreg_settings)

# ==========================================
# 6. TRAIN & PREDICT
# ==========================================
print("Training Pure Logistic Model...")

# Chain preprocessing and the classifier so both are fitted in a single call.
pipeline_steps = [('preprocessor', preprocessor), ('classifier', clf)]
model_pipeline = Pipeline(steps=pipeline_steps)
model_pipeline.fit(X, y_encoded)

# Report the first entry of the fitted classifier's selected C values.
best_c = clf.C_[0]
print(f"Best Regularization Strength (C) Found: {best_c}")

print("Generating Predictions...")
# Predict integer class codes, then decode them back to the original labels.
y_pred_encoded = model_pipeline.predict(X_test)
y_pred_labels = le.inverse_transform(y_pred_encoded)

# ==========================================
# 7. SUBMISSION
# ==========================================
filename = 'submission_pure_logistic_fixed.csv'

# Two-column frame: the test identifiers alongside the decoded predictions.
submission = test[['participant_id']].assign(personality_cluster=y_pred_labels)

# Write without the index column, then echo a preview of the first rows.
submission.to_csv(filename, index=False)
print(f"SUCCESS! Pure Logistic Submission Saved: {filename}")
print(submission.head())

Feature Engineering...
Generating PCA-Cluster Features...
Initializing Fixed Logistic Regression...
Training Pure Logistic Model...
Best Regularization Strength (C) Found: 0.3593813663804626
Generating Predictions...
SUCCESS! Pure Logistic Submission Saved: submission_pure_logistic_fixed.csv
   participant_id personality_cluster
0            1005           Cluster_D
1             197           Cluster_C
2            2343           Cluster_E
3            1709           Cluster_A
4             436           Cluster_E
