In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation/FutureWarnings from
# the libraries used below; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Plot appearance: matplotlib's "default" style sheet and seaborn's "husl" palette.
plt.style.use("default")
sns.set_palette("husl")

# Load the three CSV files; the relative paths are resolved against the
# kernel's current working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

In [5]:
# Report (rows, columns) for each loaded frame as a quick sanity check.
print(f"Train dataset shape: {train_df.shape}")
print(f"Test dataset shape: {test_df.shape}")
print(f"Sample submission shape: {sample_submission.shape}")

Train dataset shape: (18524, 9)
Test dataset shape: (6175, 8)
Sample submission shape: (6175, 2)


In [6]:
# Column dtypes and non-null counts for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB


In [17]:
# Preview the first 20 training rows.
# NOTE(review): the output saved under this cell lists only 10 rows (ids 0-9),
# so it appears to come from an earlier run - re-run the cell to refresh it.
train_df.head(20)

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert
5,5,2.0,No,8.0,5.0,No,,3.0,Extrovert
6,6,1.0,No,8.0,,No,,4.0,Extrovert
7,7,2.0,No,8.0,3.0,No,4.0,5.0,Extrovert
8,8,4.0,Yes,2.0,1.0,,0.0,2.0,Introvert
9,9,1.0,No,8.0,6.0,No,14.0,9.0,Extrovert


In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric
# training columns; `id` is included because it is stored as an integer.
train_df.describe()

Unnamed: 0,id,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency
count,18524.0,17334.0,17344.0,17058.0,17470.0,17260.0
mean,9261.5,3.137764,5.265106,4.044319,7.996737,4.982097
std,5347.562529,3.003786,2.753359,2.06258,4.223484,2.879139
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,4630.75,1.0,3.0,3.0,5.0,3.0
50%,9261.5,2.0,5.0,4.0,8.0,5.0
75%,13892.25,4.0,8.0,6.0,12.0,7.0
max,18523.0,11.0,10.0,7.0,15.0,10.0


In [14]:
# Missing value check: number of null entries in each training column.
train_df.isnull().sum()

id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64

In [16]:
# Same missing-value check for the test frame (number of nulls per column).
test_df.isnull().sum()

id                             0
Time_spent_Alone             425
Stage_fear                   598
Social_event_attendance      397
Going_outside                466
Drained_after_socializing    432
Friends_circle_size          350
Post_frequency               408
dtype: int64

In [19]:
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [27]:
import pandas as pd
import numpy as np

def fast_mbti_weighted_imputation(df):
    """
    Fast MBTI theory-based imputation of missing values.

    Core principle: focus on the two indicators treated as most important
    for the Extrovert/Introvert split and impute them from each other:
    - Drained_after_socializing: inferred from Time_spent_Alone
      (above the column median -> 'Yes', otherwise 'No').
    - Time_spent_Alone: filled with the median of the matching
      Drained_after_socializing group ('Yes' / 'No').
    - Other features: simple median (numeric) or mode (categorical) fill.

    The priorities often quoted for the two core features (0.55 and 0.30)
    are descriptive only; no numeric weighting is applied anywhere below.

    All statistics (medians, modes) are computed from the frame passed in,
    so calling this separately on train and test data imputes each with its
    own statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Every column handled here is optional; absent
        columns are skipped. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the handled columns imputed. Columns that are
        not handled here keep their missing values, which are reported in
        the printed validation summary.
    """
    drained_col = 'Drained_after_socializing'
    alone_col = 'Time_spent_Alone'

    print("Fast MBTI Theory-based Weighted Imputation")
    print("="*50)
    print("Core focus: Drained_after_socializing + Time_spent_Alone")

    df_imputed = df.copy()
    has_drained = drained_col in df_imputed.columns
    has_alone = alone_col in df_imputed.columns

    # Check missing values before imputation
    print("\nMissing values before imputation:")
    missing_before = df_imputed.isnull().sum()
    for col, count in missing_before.items():
        if count > 0:
            print(f"  {col}: {count}")

    # Step 1: Simple imputation for secondary features (fast)
    secondary_features = ['Social_event_attendance', 'Going_outside',
                          'Friends_circle_size', 'Post_frequency', 'Stage_fear']

    print("\nStep 1: Simple imputation for secondary features...")
    for col in secondary_features:
        if col not in df_imputed.columns or not df_imputed[col].isnull().any():
            continue

        # Assign the filled column back instead of calling
        # `fillna(inplace=True)` on a selected column: the latter is chained
        # assignment, which pandas deprecates and which leaves the frame
        # untouched when Copy-on-Write is active.
        if pd.api.types.is_numeric_dtype(df_imputed[col]):
            # Numerical: use median
            median_val = df_imputed[col].median()
            df_imputed[col] = df_imputed[col].fillna(median_val)
            print(f"  {col}: filled with median {median_val:.2f}")
        else:
            # Categorical (object or any other non-numeric dtype): use mode
            mode_val = df_imputed[col].mode()
            if len(mode_val) > 0:
                df_imputed[col] = df_imputed[col].fillna(mode_val.iloc[0])
                print(f"  {col}: filled with mode '{mode_val.iloc[0]}'")

    # Step 2: MBTI-focused imputation for core features
    print("\nStep 2: MBTI-focused imputation for core features...")

    # Handle Drained_after_socializing
    if has_drained:
        missing_drained = df_imputed[drained_col].isnull()
        missing_count = missing_drained.sum()

        if missing_count > 0:
            print(f"  Processing Drained_after_socializing ({missing_count} missing)...")

            # Mode of the observed answers, taken before any value is
            # imputed so the fallback does not depend on row order.
            observed_mode = df_imputed[drained_col].mode()

            if has_alone:
                # MBTI logic: people who spend more time alone than the
                # median are assumed to be drained after socializing.
                time_alone = df_imputed[alone_col]
                time_alone_median = time_alone.median()
                can_infer = (missing_drained & time_alone.notnull()).to_numpy()
                above_median = (time_alone > time_alone_median).to_numpy()
                df_imputed.loc[can_infer & above_median, drained_col] = 'Yes'
                df_imputed.loc[can_infer & ~above_median, drained_col] = 'No'

            # Rows where Time_spent_Alone is missing too (or absent) fall
            # back to the mode; if nothing was observed at all, use the mode
            # of the values inferred just above.
            if len(observed_mode) > 0:
                fallback_mode = observed_mode
            else:
                fallback_mode = df_imputed[drained_col].mode()
            still_missing = df_imputed[drained_col].isnull().to_numpy()
            if still_missing.any() and len(fallback_mode) > 0:
                df_imputed.loc[still_missing, drained_col] = fallback_mode.iloc[0]

            if has_alone:
                print("    Filled based on Time_spent_Alone correlation")
            else:
                print("    Filled with mode (Time_spent_Alone column not present)")

    # Handle Time_spent_Alone
    if has_alone:
        missing_time = df_imputed[alone_col].isnull()
        missing_count = missing_time.sum()

        if missing_count > 0:
            print(f"  Processing Time_spent_Alone ({missing_count} missing)...")

            time_alone = df_imputed[alone_col]
            overall_median = time_alone.median()
            needs_fill = missing_time.to_numpy()

            if has_drained:
                drained = df_imputed[drained_col]
                is_yes = (drained == 'Yes').to_numpy()
                is_no = (drained == 'No').to_numpy()
                is_known = drained.notnull().to_numpy()

                # Group medians for MBTI-based imputation
                drained_yes_median = time_alone[is_yes].median()
                drained_no_median = time_alone[is_no].median()

                # An empty group has a NaN median; use the overall median then.
                yes_fill = overall_median if pd.isnull(drained_yes_median) else drained_yes_median
                no_fill = overall_median if pd.isnull(drained_no_median) else drained_no_median

                # 'Yes' rows get the 'Yes' median, every other known answer
                # gets the 'No' median, unknown answers get the overall median.
                df_imputed.loc[needs_fill & is_yes, alone_col] = yes_fill
                df_imputed.loc[needs_fill & is_known & ~is_yes, alone_col] = no_fill
                df_imputed.loc[needs_fill & ~is_known, alone_col] = overall_median

                print("    Filled based on Drained_after_socializing groups")
                print(f"    - 'Yes' group median: {drained_yes_median:.2f}")
                print(f"    - 'No' group median: {drained_no_median:.2f}")
            else:
                df_imputed.loc[needs_fill, alone_col] = overall_median
                print(f"    Filled with overall median {overall_median:.2f} "
                      "(Drained_after_socializing column not present)")

    # Step 3: Final validation and summary
    print("\nStep 3: Validation...")
    missing_after = df_imputed.isnull().sum()
    remaining_missing = missing_after.sum()

    print("Missing values after imputation:")
    for col, count in missing_after.items():
        if count > 0:
            print(f"  {col}: {count}")

    if remaining_missing == 0:
        print("✅ All missing values successfully imputed!")
    else:
        print(f"⚠️ {remaining_missing} missing values remain")

    # MBTI Theory Validation (only possible when the label column exists)
    if 'Personality' in df_imputed.columns:
        print("\nMBTI Theory Validation:")

        introvert_data = df_imputed[df_imputed['Personality'] == 'Introvert']
        extrovert_data = df_imputed[df_imputed['Personality'] == 'Extrovert']

        if len(introvert_data) > 0 and len(extrovert_data) > 0:
            # Check Drained_after_socializing pattern
            if has_drained:
                intro_drained_pct = (introvert_data[drained_col] == 'Yes').mean()
                extro_drained_pct = (extrovert_data[drained_col] == 'Yes').mean()

                print("  Drained after socializing:")
                print(f"    Introverts: {intro_drained_pct:.1%}")
                print(f"    Extroverts: {extro_drained_pct:.1%}")
                print(f"    Difference: {intro_drained_pct - extro_drained_pct:+.1%} ({'✅ Good' if intro_drained_pct > extro_drained_pct else '⚠️ Check'})")

            # Check Time_spent_Alone pattern
            if has_alone:
                intro_time_mean = introvert_data[alone_col].mean()
                extro_time_mean = extrovert_data[alone_col].mean()

                print("  Time spent alone:")
                print(f"    Introverts: {intro_time_mean:.2f}")
                print(f"    Extroverts: {extro_time_mean:.2f}")
                print(f"    Difference: {intro_time_mean - extro_time_mean:+.2f} ({'✅ Good' if intro_time_mean > extro_time_mean else '⚠️ Check'})")

    return df_imputed

def process_datasets(train_path='train.csv', test_path='test.csv',
                     train_out='train_imputed.csv', test_out='test_imputed.csv'):
    """
    Complete pipeline to process both train and test datasets.

    Reads the two CSV files, runs ``fast_mbti_weighted_imputation`` on each
    frame independently, and writes the imputed frames back to CSV.

    Parameters
    ----------
    train_path, test_path : str
        Input CSV locations (defaults match the original hard-coded names).
    train_out, test_out : str
        Output CSV locations for the imputed frames.

    Returns
    -------
    tuple
        ``(train_imputed, test_imputed)``. ``(None, None)`` is returned when
        an input file cannot be found. If saving fails the imputed frames are
        still returned, and the failure is reported instead of a success
        banner.
    """
    print("MBTI-based Missing Value Imputation Pipeline")
    print("="*60)

    # Load datasets
    print("Loading datasets...")
    try:
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        print(f"✅ Train dataset: {train_df.shape}")
        print(f"✅ Test dataset: {test_df.shape}")
    except FileNotFoundError as e:
        print(f"❌ Error loading datasets: {e}")
        return None, None

    # Process training data
    print("\n" + "="*40)
    print("PROCESSING TRAINING DATA")
    print("="*40)
    train_imputed = fast_mbti_weighted_imputation(train_df)

    # Process test data
    print("\n" + "="*40)
    print("PROCESSING TEST DATA")
    print("="*40)
    test_imputed = fast_mbti_weighted_imputation(test_df)

    # Save results
    print("\n" + "="*40)
    print("SAVING RESULTS")
    print("="*40)

    saved = False
    try:
        train_imputed.to_csv(train_out, index=False)
        test_imputed.to_csv(test_out, index=False)
        saved = True
        print(f"✅ Saved {train_out}")
        print(f"✅ Saved {test_out}")
    except Exception as e:
        # Saving stays best-effort (the frames are still returned), but the
        # failure must not be followed by a success banner.
        print(f"❌ Error saving files: {e}")

    if saved:
        print("\n🎉 Imputation completed successfully!")
        print("Ready for model building with MBTI-theory enhanced data!")
    else:
        print("\n⚠️ Imputation completed, but the results were not saved.")

    return train_imputed, test_imputed

# Run the complete pipeline.
# In a notebook kernel `__name__` is "__main__", so this executes whenever the
# cell runs; the guard only matters if the code is exported and imported as a
# module. Both names are None when the input files cannot be found.
if __name__ == "__main__":
    train_imputed, test_imputed = process_datasets()

MBTI-based Missing Value Imputation Pipeline
Loading datasets...
✅ Train dataset: (18524, 9)
✅ Test dataset: (6175, 8)

PROCESSING TRAINING DATA
Fast MBTI Theory-based Weighted Imputation
Core focus: Drained_after_socializing + Time_spent_Alone

Missing values before imputation:
  Time_spent_Alone: 1190
  Stage_fear: 1893
  Social_event_attendance: 1180
  Going_outside: 1466
  Drained_after_socializing: 1149
  Friends_circle_size: 1054
  Post_frequency: 1264

Step 1: Simple imputation for secondary features...
  Social_event_attendance: filled with median 5.00
  Going_outside: filled with median 4.00
  Friends_circle_size: filled with median 8.00
  Post_frequency: filled with median 5.00
  Stage_fear: filled with mode 'No'

Step 2: MBTI-focused imputation for core features...
  Processing Drained_after_socializing (1149 missing)...
    Filled based on Time_spent_Alone correlation
  Processing Time_spent_Alone (1190 missing)...
    Filled based on Drained_after_socializing groups
    - 