In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import RobustScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter

# 1. LOAD DATA
# Read the two CSV files by relative path, in order: train.csv, then test.csv.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# 2. FEATURE ENGINEERING
def create_features(df):
    """Return a new DataFrame: ``df`` plus five engineered columns.

    The input frame is not modified. The columns appended, in order, are:

    * ``total_activity_score`` -- ``hobby_engagement_level`` +
      ``physical_activity_index`` + ``creative_expression_index``.
    * ``support_x_guidance`` -- ``support_environment_score`` *
      ``external_guidance_usage``.
    * ``focus_x_consistency`` -- ``focus_intensity`` * ``consistency_score``.
    * ``log_focus`` -- ``log(1 + focus_intensity)``.
    * ``log_consistency`` -- ``log(1 + consistency_score)``.

    A missing value in any operand propagates to the derived column.
    A ``KeyError`` is raised if one of the source columns is absent.
    """
    # Plain `+` (not DataFrame.sum) so a NaN in any indicator stays NaN.
    activity_total = (
        df['hobby_engagement_level']
        + df['physical_activity_index']
        + df['creative_expression_index']
    )

    # Non-zero only when both the support score and guidance usage are non-zero.
    support_guidance = df['support_environment_score'] * df['external_guidance_usage']

    focus = df['focus_intensity']
    consistency = df['consistency_score']

    # log1p compresses the range of the two continuous inputs so they are
    # less likely to dominate a Euclidean distance, and it is defined at 0.
    return df.assign(
        total_activity_score=activity_total,
        support_x_guidance=support_guidance,
        focus_x_consistency=focus * consistency,
        log_focus=np.log1p(focus),
        log_consistency=np.log1p(consistency),
    )

# Run both splits through the same feature function so they gain the same
# engineered columns.
train_fe = create_features(train_df)
test_fe = create_features(test_df)

# ==========================================
# 3. PREPARATION & PREPROCESSING
# ==========================================
# Target labels, kept apart from the predictors.
y_train = train_fe['personality_cluster']

# Predictors: the id column is removed from both splits, the target from train.
X_train = train_fe.drop(columns=['participant_id', 'personality_cluster'])
X_test = test_fe.drop(columns=['participant_id'])

# Transductive setup: stack train rows first, then test rows, so clustering
# sees the geometry of all points. Row order matters downstream, where the
# combined result is split back by len(X_train).
X_combined = pd.concat([X_train, X_test])

# Column roles
categorical_cols = ['cultural_background']  # one-hot encoded below

# Every remaining column is treated as numeric, except the two raw columns
# that already have log-transformed counterparts (left out to avoid feeding
# strongly correlated inputs to the distance metric).
excluded_cols = set(categorical_cols) | {'focus_intensity', 'consistency_score'}
numerical_cols = [col for col in X_combined.columns if col not in excluded_cols]

# Preprocessing
# RobustScaler is chosen over min/max or mean/std scaling because DBSCAN is
# sensitive to outliers and RobustScaler scales from percentiles.
numeric_step = ('num', RobustScaler(), numerical_cols)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    categorical_cols,
)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

# Fitted on train and test rows together; the output keeps the input row order.
X_combined_scaled = preprocessor.fit_transform(X_combined)

# ==========================================
# 4. RUN DBSCAN
# ==========================================
# Density-based clustering over the scaled train+test matrix.
dbscan_params = {
    'eps': 1.5,
    'min_samples': 5,
    'metric': 'euclidean',
    'n_jobs': -1,
}
db = DBSCAN(**dbscan_params)

# After fit(), labels_ holds one cluster id per input row; -1 marks noise.
cluster_labels = db.fit(X_combined_scaled).labels_

# ==========================================
# 5. MAP CLUSTERS TO LABELS
# ==========================================
# The combined matrix stacked train rows first, so the leading train_size
# cluster ids belong to training rows and the remainder to test rows.
train_size = len(X_train)
train_cluster_labels = cluster_labels[:train_size]
test_cluster_labels = cluster_labels[train_size:]

# Strategy:
# 1. For every cluster id seen among training rows, take the most frequent
#    true label of those rows.
# 2. Assign that label to all test points in the same cluster.
# 3. Test points that are noise (-1), or whose cluster contains no training
#    rows, are labelled by a KNN classifier fitted on the training rows.

y_train_arr = y_train.values

# Majority label per cluster. Noise (-1) is deliberately kept out of the map
# so noise points always take the KNN fallback. Every id iterated here comes
# from the training labels, so each cluster has at least one training row.
# On a tie, Counter.most_common returns the label encountered first.
cluster_map = {}
unique_clusters = set(train_cluster_labels)
for cid in unique_clusters:
    if cid == -1:
        continue
    true_labels_in_cluster = y_train_arr[train_cluster_labels == cid]
    cluster_map[cid] = Counter(true_labels_in_cluster).most_common(1)[0][0]

# Fallback classifier, fitted on the scaled TRAIN rows only.
knn_filler = KNeighborsClassifier(n_neighbors=5)
knn_filler.fit(X_combined_scaled[:train_size], y_train)

# Scaled test portion, row-aligned with test_cluster_labels.
X_test_scaled = X_combined_scaled[train_size:]

# Case A: the point sits in a mapped cluster -> take the cluster's label.
# Unmapped points get a None placeholder that is overwritten below.
final_preds = [cluster_map.get(cid) for cid in test_cluster_labels]

# Case B: noise or unmapped cluster -> KNN. All such rows are classified in
# one batched predict() call instead of one call per row; the guard skips
# the call entirely when there is nothing to classify, since predict() is
# not meant to be called on an empty batch.
fallback_idx = [
    i for i, cid in enumerate(test_cluster_labels) if cid not in cluster_map
]
if fallback_idx:
    fallback_labels = knn_filler.predict(X_test_scaled[fallback_idx])
    for i, label in zip(fallback_idx, fallback_labels):
        final_preds[i] = label

# ==========================================
# 6. SUBMISSION
# ==========================================
# One row per test participant: the id column copied from the raw test frame,
# plus the predicted label in the same row order.
submission = test_df[['participant_id']].copy()
submission['personality_cluster'] = final_preds

# Quick sanity output before writing the file.
print("Submission created with shape:", submission.shape)
print("Unique predictions:", submission['personality_cluster'].unique())

# index=False keeps the DataFrame index out of the CSV.
submission.to_csv('submission_dbscan_improved.csv', index=False)

Submission created with shape: (479, 2)
Unique predictions: ['Cluster_E' 'Cluster_C' 'Cluster_D' 'Cluster_B' 'Cluster_A']
