In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [3]:
def _min_max_normalize(series):
    """Rescale a numeric Series linearly onto the 0-1 range.

    A constant column has a zero range, so the plain min-max formula would
    evaluate 0/0 and turn every row (and every score built from it) into NaN.
    Such a column carries no information, so it is mapped to 0.0 instead.
    Missing values stay missing.
    """
    lowest = series.min()
    value_range = series.max() - lowest
    if value_range == 0:
        return (series - lowest).astype(float)
    return (series - lowest) / value_range


def create_social_energy_score_v1_adjusted(df):
    """
    Create Social Energy Score using adjusted MBTI theory weights

    Key insight: Voluntary social activities (like social events) are more
    indicative of true social energy preferences than passive behaviors

    Adjusted weights:
    - Drained after socializing: -0.35 (energy drain indicator)
    - Time spent alone: -0.20 (recovery needs)
    - Social event attendance: +0.25 (voluntary social participation - high weight)
    - Going outside: +0.05 (general activity)
    - Friends circle size: +0.03 (social network)
    - Post frequency: +0.01 (social sharing)
    - Absence of stage fear: +0.01 (confidence factor; added when Stage_fear != 'Yes')

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Time_spent_Alone, Social_event_attendance,
        Going_outside, Friends_circle_size, Post_frequency (numeric) and
        Drained_after_socializing, Stage_fear (compared against the string
        'Yes'; any other value, including a missing one, counts as "not Yes").
        If a 'Personality' column is present, a validation summary is printed.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one extra column, 'Social_Energy_Score'.
        The input frame is not modified.

    Notes
    -----
    Numeric features are min-max scaled using the minimum and maximum of the
    frame passed in, so scores from two separately processed frames are each
    relative to their own value ranges.
    """
    df_features = df.copy()

    print("Creating Social Energy Score v1 (Adjusted Weights)")
    print("="*55)
    print("Based on refined MBTI insights about voluntary vs passive social behaviors")

    # Normalize numerical features to 0-1 scale for fair weighting
    time_alone_norm = _min_max_normalize(df_features['Time_spent_Alone'])
    social_events_norm = _min_max_normalize(df_features['Social_event_attendance'])
    going_out_norm = _min_max_normalize(df_features['Going_outside'])
    friends_norm = _min_max_normalize(df_features['Friends_circle_size'])
    post_norm = _min_max_normalize(df_features['Post_frequency'])

    # Convert categorical to binary (1 only for an exact 'Yes')
    drained_binary = (df_features['Drained_after_socializing'] == 'Yes').astype(int)
    stage_fear_binary = (df_features['Stage_fear'] == 'Yes').astype(int)

    # Calculate Social Energy Score with adjusted weights
    df_features['Social_Energy_Score'] = (
        -0.35 * drained_binary +           # Energy drain (negative impact)
        -0.20 * time_alone_norm +          # Need for alone time (negative impact)
        +0.25 * social_events_norm +       # Voluntary social participation (high positive impact)
        +0.05 * going_out_norm +           # General activity level
        +0.03 * friends_norm +             # Social network size
        +0.01 * post_norm +                # Social sharing behavior
        +0.01 * (1 - stage_fear_binary)    # Confidence in social situations
    )

    # Display results
    score = df_features['Social_Energy_Score']
    print("\nSocial Energy Score created!")
    print(f"Score range: {score.min():.3f} to {score.max():.3f}")
    print(f"Mean score: {score.mean():.3f}")
    print(f"Standard deviation: {score.std():.3f}")

    print("\nWeight breakdown:")
    print("  Drained after socializing: -0.35 (most negative impact)")
    print("  Time spent alone: -0.20")
    print("  Social event attendance: +0.25 (most positive impact)")
    print("  Going outside: +0.05")
    print("  Friends circle size: +0.03")
    print("  Post frequency: +0.01")
    print("  Stage fear (reversed): +0.01")

    print("\nInterpretation:")
    print("  Higher scores → More extroverted energy patterns")
    print("  Lower scores → More introverted energy patterns")
    print("  Score around 0 → Mixed or balanced energy patterns")

    # If target variable exists, show performance
    if 'Personality' in df.columns:
        print("\nValidation against known personality types:")

        introvert_scores = score[df_features['Personality'] == 'Introvert']
        extrovert_scores = score[df_features['Personality'] == 'Extrovert']

        print(f"  Introvert mean score: {introvert_scores.mean():.3f}")
        print(f"  Extrovert mean score: {extrovert_scores.mean():.3f}")
        print(f"  Difference: {extrovert_scores.mean() - introvert_scores.mean():.3f}")

        # Labels are coded in sorted order (Extrovert -> 0, Introvert -> 1), so a
        # score that is higher for extroverts yields a NEGATIVE correlation here.
        # Only the magnitude is used for the separation verdict below.
        personality_encoded, _ = pd.factorize(df['Personality'], sort=True)
        correlation = np.corrcoef(score, personality_encoded)[0, 1]

        print(f"  Correlation with Personality: {correlation:.3f}")
        print(f"  {'✅ Good separation!' if abs(correlation) > 0.3 else '⚠️ Moderate separation' if abs(correlation) > 0.1 else '❌ Poor separation'}")

    return df_features


In [4]:
if __name__ == "__main__":
    # Read the already-imputed train and test splits from the working directory.
    train_df = pd.read_csv('train_imputed.csv')
    test_df = pd.read_csv('test_imputed.csv')

    # Score each split; the scoring function prints its own summary as it runs.
    print("Processing training data...")
    train_with_score = create_social_energy_score_v1_adjusted(train_df)

    print("\n" + "=" * 55)
    print("Processing test data...")
    test_with_score = create_social_energy_score_v1_adjusted(test_df)

    # Persist both enriched frames next to the inputs, without the index column.
    train_with_score.to_csv('train_with_social_energy.csv', index=False)
    test_with_score.to_csv('test_with_social_energy.csv', index=False)

    print("\n✅ Enhanced datasets saved!")
    print("  - train_with_social_energy.csv")
    print("  - test_with_social_energy.csv")

    # Preview the first ten scored training rows, led by the label when available.
    print("\nSample results:")
    display_cols = ['Social_Energy_Score']
    if 'Personality' in train_df.columns:
        display_cols.insert(0, 'Personality')

    sample_rows = train_with_score[display_cols].head(10)
    print(sample_rows.to_string(index=False))

Processing training data...
Creating Social Energy Score v1 (Adjusted Weights)
Based on refined MBTI insights about voluntary vs passive social behaviors

Social Energy Score created!
Score range: -0.550 to 0.345
Mean score: 0.040
Standard deviation: 0.274

Weight breakdown:
  Drained after socializing: -0.35 (most negative impact)
  Time spent alone: -0.20
  Social event attendance: +0.25 (most positive impact)
  Going outside: +0.05
  Friends circle size: +0.03
  Post frequency: +0.01
  Stage fear (reversed): +0.01

Interpretation:
  Higher scores → More extroverted energy patterns
  Lower scores → More introverted energy patterns
  Score around 0 → Mixed or balanced energy patterns

Validation against known personality types:
  Introvert mean score: -0.373
  Extrovert mean score: 0.186
  Difference: 0.559
  Correlation with Personality: -0.896
  ✅ Good separation!

Processing test data...
Creating Social Energy Score v1 (Adjusted Weights)
Based on refined MBTI insights about volunta