In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import StratifiedKFold
import itertools

In [2]:
def preprocess_data(df, test_size=0.2, random_state=42, return_preprocessor=False, 
                     has_target=True, preprocessor=None, label_encoder=None):
    """
    Comprehensive preprocessing function for the Adult Census dataset.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Raw dataset
    test_size : float, default=0.2
        Proportion of dataset to include in test split (only used if has_target=True)
    random_state : int, default=42
        Random state for reproducibility
    return_preprocessor : bool, default=False
        Whether to return the fitted preprocessor object
    has_target : bool, default=True
        Whether the dataset contains the target variable
    preprocessor : sklearn object, default=None
        Pre-fitted preprocessor to use (for validation data)
    label_encoder : sklearn.preprocessing.LabelEncoder, default=None
        Pre-fitted label encoder (for validation data with target)
    
    Returns:
    --------
    If has_target=True and test_size > 0:
        X_train, X_test, y_train, y_test : torch.Tensors
    If has_target=True and test_size=0:
        X_processed, y_encoded : torch.Tensors
    If has_target=False:
        X_processed : torch.Tensor
    
    Optional returns if return_preprocessor=True:
        preprocessor, label_encoder
    """
    
    print(f"Original dataset shape: {df.shape}")
    print(f"First 5 rows before transformation:\n{df.head()}\n" + "="*50 + "\n")
    
    # Step 1: Fix column names
    df = df.copy()  # Avoid modifying original dataframe
    
    # Handle unnamed first column
    if df.columns[0] == '' or 'Unnamed' in str(df.columns[0]):
        df = df.rename(columns={df.columns[0]: 'id'})
    
    # Clean column names (remove spaces, special characters)
    df.columns = df.columns.str.strip().str.replace(' ', '_').str.replace('-', '_')
    
    # Step 2: Handle missing values representation
    missing_indicators = ['?', 'unknown', 'Unknown', '', ' ', 'n/a', 'N/A', 'na', 'NA']
    df = df.replace(missing_indicators, np.nan)
    
    print("Missing values by column:")
    missing_summary = df.isnull().sum()
    print(missing_summary[missing_summary > 0])
    print()
    
    # Step 3: Data type corrections
    # Ensure numeric columns are actually numeric
    numeric_columns = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
    for col in numeric_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Step 4: Feature Engineering (Basic)
    if 'capital_gain' in df.columns and 'capital_loss' in df.columns:
        df['capital_net'] = df['capital_gain'] - df['capital_loss']
        df['has_capital_gain'] = (df['capital_gain'] > 0).astype(int)
        df['has_capital_loss'] = (df['capital_loss'] > 0).astype(int)
    
    # Age groups
    if 'age' in df.columns:
        df['age_group'] = pd.cut(df['age'], 
                                bins=[0, 25, 35, 45, 55, 65, 100], 
                                labels=['18_25', '26_35', '36_45', '46_55', '56_65', '65_plus'])
    
    # Work hours categories
    if 'hours_per_week' in df.columns:
        df['work_hours_category'] = pd.cut(df['hours_per_week'],
                                          bins=[0, 20, 40, 60, 100],
                                          labels=['part_time', 'full_time', 'overtime', 'excessive'])
    
    # Step 5: Separate features and target
    if has_target:
        # Target column is 'income' or last column
        target_col = 'income' if 'income' in df.columns else df.columns[-1]
        X = df.drop(columns=[target_col, 'id'], errors='ignore')
        y = df[target_col]
        
        # Encode target
        if label_encoder is None:
            le = LabelEncoder()
            y_encoded = le.fit_transform(y)
            print(f"Target encoding: {dict(zip(le.classes_, le.transform(le.classes_)))}")
        else:
            y_encoded = label_encoder.transform(y)
            le = label_encoder
        
        print(f"Target distribution: {np.bincount(y_encoded) / len(y_encoded)}\n")
    else:
        X = df.drop(columns=['id'], errors='ignore')
        le = None
    
    # Step 6: Identify numeric and categorical columns
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    
    print(f"Identified numeric features ({len(numeric_features)}): {numeric_features}")
    print(f"Identified categorical features ({len(categorical_features)}): {categorical_features}\n")
    
    # Step 7: Create preprocessing pipelines
    if preprocessor is None:
        # Numeric pipeline: impute with median, then standardize
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        
        # Categorical pipeline: impute with most frequent, then one-hot encode
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])
        
        # Combine preprocessing
        preprocessor_obj = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ])
        
        # Fit and transform
        X_processed = preprocessor_obj.fit_transform(X)
    else:
        # Use provided preprocessor
        preprocessor_obj = preprocessor
        X_processed = preprocessor_obj.transform(X)
    
    # Step 8: Split data if target exists and test_size > 0
    if has_target and test_size > 0:
        X_train, X_test, y_train, y_test = train_test_split(
            X_processed, y_encoded, test_size=test_size, 
            random_state=random_state, stratify=y_encoded
        )
        
        print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")
        print(f"Train target distribution: {np.bincount(y_train) / len(y_train)}")
        print(f"Test target distribution: {np.bincount(y_test) / len(y_test)}\n")
        
        # Convert to PyTorch tensors
        X_train = torch.FloatTensor(X_train)
        X_test = torch.FloatTensor(X_test)
        y_train = torch.LongTensor(y_train)
        y_test = torch.LongTensor(y_test)
        
        print(f"Processed shapes - Train: {X_train.shape}, Test: {X_test.shape}")
        print(f"Tensor shapes - Train: {X_train.shape}, Test: {X_test.shape}")
        print(f"Tensor dtypes - X: {X_train.dtype}, y: {y_train.dtype}\n")
        
        print("=" * 50)
        print("PREPROCESSING COMPLETED SUCCESSFULLY!")
        print("=" * 50)
        
        if return_preprocessor:
            return X_train, X_test, y_train, y_test, preprocessor_obj, le
        return X_train, X_test, y_train, y_test
    
    elif has_target:
        # No split, return all data
        X_processed = torch.FloatTensor(X_processed)
        y_encoded = torch.LongTensor(y_encoded)
        
        if return_preprocessor:
            return X_processed, y_encoded, preprocessor_obj, le
        return X_processed, y_encoded
    
    else:
        # No target
        X_processed = torch.FloatTensor(X_processed)
        
        if return_preprocessor:
            return X_processed, preprocessor_obj
        return X_processed

In [3]:
class MLPTuner:
    """
    MLP Hyperparameter Tuner with Grid Search and Early Stopping
    """
    def __init__(self, X_train, X_test, y_train, y_test, use_cv=True, n_folds=5):
        """
        Parameters:
        -----------
        X_train, X_test, y_train, y_test : torch.Tensors
            Training and test data
        use_cv : bool, default=True
            Whether to use cross-validation for hyperparameter tuning
        n_folds : int, default=5
            Number of cross-validation folds
        """
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.input_size = X_train.shape[1]
        self.output_size = len(torch.unique(y_train))
        self.use_cv = use_cv
        self.n_folds = n_folds
        self.results = []
    
    def create_model(self, hidden_layers, dropout_rate):
        """Create MLP with specified architecture"""
        layers = []
        prev_size = self.input_size
        
        for hidden_size in hidden_layers:
            layers.append(nn.Linear(prev_size, hidden_size))
            layers.append(nn.ReLU())
            if dropout_rate > 0:
                layers.append(nn.Dropout(dropout_rate))
            prev_size = hidden_size
        
        layers.append(nn.Linear(prev_size, self.output_size))
        
        return nn.Sequential(*layers)
    
    def train_single_fold(self, model, train_loader, val_X, val_y, 
                         learning_rate, epochs, weight_decay,
                         patience=5, min_delta=0.0001):
        """
        Train model on a single fold with early stopping
        
        Parameters:
        -----------
        patience : int, default=5
            Number of epochs to wait for improvement before stopping
        min_delta : float, default=0.0001
            Minimum change in validation accuracy to qualify as improvement
        """
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
        
        # Early stopping variables
        best_val_acc = 0.0
        patience_counter = 0
        best_model_state = None
        
        # Training loop
        for epoch in range(epochs):
            model.train()
            for batch_X, batch_y in train_loader:
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
            # Evaluate on validation set
            model.eval()
            with torch.no_grad():
                val_outputs = model(val_X)
                val_preds = torch.argmax(val_outputs, dim=1)
                val_acc = (val_preds == val_y).float().mean().item()
                val_loss = criterion(val_outputs, val_y).item()
            
            # Early stopping check
            if val_acc > best_val_acc + min_delta:
                best_val_acc = val_acc
                patience_counter = 0
                # Save best model state
                best_model_state = model.state_dict().copy()
            else:
                patience_counter += 1
            
            # Stop if no improvement for 'patience' epochs
            if patience_counter >= patience:
                # Restore best model
                if best_model_state is not None:
                    model.load_state_dict(best_model_state)
                break
        
        return best_val_acc, val_loss
    
    def train_with_cv(self, hidden_layers, learning_rate, batch_size, 
                     dropout_rate, epochs, weight_decay,
                     patience=5, min_delta=0.0001):
        """
        Train model with K-Fold Cross-Validation and early stopping
        
        Returns:
        --------
        cv_scores : dict with mean and std of validation metrics
        """
        # Use StratifiedKFold to maintain class distribution
        skf = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=42)
        
        fold_accs = []
        fold_losses = []
        
        # Convert to numpy for sklearn compatibility
        y_train_np = self.y_train.cpu().numpy()
        
        for fold, (train_idx, val_idx) in enumerate(skf.split(self.X_train, y_train_np), 1):
            # Create model for this fold
            model = self.create_model(hidden_layers, dropout_rate)
            
            # Split data for this fold
            train_X = self.X_train[train_idx]
            train_y = self.y_train[train_idx]
            val_X = self.X_train[val_idx]
            val_y = self.y_train[val_idx]
            
            # Create data loader for training set
            train_dataset = TensorDataset(train_X, train_y)
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            
            # Train on this fold with early stopping
            val_acc, val_loss = self.train_single_fold(
                model, train_loader, val_X, val_y, 
                learning_rate, epochs, weight_decay,
                patience, min_delta
            )
            
            fold_accs.append(val_acc)
            fold_losses.append(val_loss)
        
        # Aggregate results across folds
        cv_scores = {
            'mean_val_acc': np.mean(fold_accs),
            'std_val_acc': np.std(fold_accs),
            'mean_val_loss': np.mean(fold_losses),
            'std_val_loss': np.std(fold_losses),
            'fold_accs': fold_accs
        }
        
        return cv_scores
    
    def train_model(self, model, learning_rate, batch_size, epochs, weight_decay=0,
                   patience=10, min_delta=0.0001):
        """
        Train a single model configuration with early stopping (no CV)
        
        Parameters:
        -----------
        patience : int, default=10
            Number of epochs to wait for improvement before stopping
        min_delta : float, default=0.0001
            Minimum change in test accuracy to qualify as improvement
        """
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
        
        # Create data loaders
        train_dataset = TensorDataset(self.X_train, self.y_train)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        
        # Early stopping variables
        best_test_acc = 0.0
        patience_counter = 0
        best_model_state = None
        
        # Training loop
        for epoch in range(epochs):
            model.train()
            for batch_X, batch_y in train_loader:
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
            # Evaluate on both train and test sets
            model.eval()
            with torch.no_grad():
                # Train accuracy
                train_outputs = model(self.X_train)
                train_preds = torch.argmax(train_outputs, dim=1)
                train_acc = (train_preds == self.y_train).float().mean().item()
                
                # Test accuracy
                test_outputs = model(self.X_test)
                test_preds = torch.argmax(test_outputs, dim=1)
                test_acc = (test_preds == self.y_test).float().mean().item()
            
            # Early stopping check
            if test_acc > best_test_acc + min_delta:
                best_test_acc = test_acc
                patience_counter = 0
                # Save best model state
                best_model_state = model.state_dict().copy()
            else:
                patience_counter += 1
            
            # Stop if no improvement for 'patience' epochs
            if patience_counter >= patience:
                # Restore best model
                if best_model_state is not None:
                    model.load_state_dict(best_model_state)
                break
        
        # Final evaluation
        model.eval()
        with torch.no_grad():
            train_outputs = model(self.X_train)
            train_preds = torch.argmax(train_outputs, dim=1)
            train_acc = (train_preds == self.y_train).float().mean().item()
            
            test_outputs = model(self.X_test)
            test_preds = torch.argmax(test_outputs, dim=1)
            test_acc = (test_preds == self.y_test).float().mean().item()
        
        return train_acc, test_acc
    
    def evaluate_configuration(self, params, epochs, patience=5, min_delta=0.0001):
        """
        Evaluate a configuration with or without CV
        
        Returns:
        --------
        result : dict with performance metrics
        """
        if self.use_cv:
            # Cross-validation
            cv_scores = self.train_with_cv(
                params['hidden_layers'],
                params['learning_rate'],
                params['batch_size'],
                params['dropout_rate'],
                epochs,
                params['weight_decay'],
                patience,
                min_delta
            )
            
            # Train final model on full training set to get test performance
            model = self.create_model(params['hidden_layers'], params['dropout_rate'])
            train_acc, test_acc = self.train_model(
                model, params['learning_rate'], params['batch_size'],
                epochs, params['weight_decay'], patience, min_delta
            )
            
            result = {
                **params,
                'cv_mean_val_acc': cv_scores['mean_val_acc'],
                'cv_std_val_acc': cv_scores['std_val_acc'],
                'cv_fold_accs': cv_scores['fold_accs'],
                'train_acc': train_acc,
                'test_acc': test_acc,
                'overfit_gap': train_acc - test_acc
            }
        else:
            # Simple train/test split
            model = self.create_model(params['hidden_layers'], params['dropout_rate'])
            train_acc, test_acc = self.train_model(
                model, params['learning_rate'], params['batch_size'],
                epochs, params['weight_decay'], patience, min_delta
            )
            
            result = {
                **params,
                'train_acc': train_acc,
                'test_acc': test_acc,
                'overfit_gap': train_acc - test_acc
            }
        
        return result
    
    def grid_search(self, param_grid, epochs=50, patience=5, min_delta=0.0001):
        """
        Perform grid search over hyperparameters with optional CV and early stopping
        
        Parameters:
        -----------
        param_grid : dict
            Dictionary with parameters names (str) as keys and lists of parameter
            settings to try as values.
            Example:
            {
                'hidden_layers': [[128], [128, 64], [256, 128, 64]],
                'learning_rate': [0.001, 0.01, 0.1],
                'batch_size': [32, 64, 128],
                'dropout_rate': [0.0, 0.2, 0.5],
                'weight_decay': [0, 1e-5, 1e-4]
            }
        epochs : int, default=50
            Maximum number of training epochs
        patience : int, default=5
            Number of epochs to wait for improvement before early stopping
        min_delta : float, default=0.0001
            Minimum change to qualify as improvement
        
        Returns:
        --------
        best_params : dict
            Best parameters found
        """
        cv_status = f"with {self.n_folds}-fold CV" if self.use_cv else "without CV"
        print(f"Starting Grid Search {cv_status} with early stopping (patience={patience})...")
        print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}\n")
        
        # Get all combinations
        keys = param_grid.keys()
        values = param_grid.values()
        
        for i, combination in enumerate(itertools.product(*values), 1):
            params = dict(zip(keys, combination))
            
            print(f"[{i}] Testing: {params}")
            
            # Evaluate configuration
            result = self.evaluate_configuration(params, epochs, patience, min_delta)
            self.results.append(result)
            
            # Print results
            if self.use_cv:
                print(f"   CV Val Acc: {result['cv_mean_val_acc']:.4f} ± {result['cv_std_val_acc']:.4f}")
                print(f"   Test Acc: {result['test_acc']:.4f}")
                print(f"   Fold Accs: {[f'{acc:.4f}' for acc in result['cv_fold_accs']]}\n")
            else:
                print(f"   Train Acc: {result['train_acc']:.4f}, Test Acc: {result['test_acc']:.4f}\n")
        
        return self.get_best_params()
    
    def get_best_params(self, metric='test_acc'):
        """Get best parameters based on specified metric"""
        if not self.results:
            return None
        
        # For CV, use CV validation accuracy
        if self.use_cv and 'cv_mean_val_acc' in self.results[0]:
            metric = 'cv_mean_val_acc'
        
        best = max(self.results, key=lambda x: x[metric])
        return best
    
    def print_results(self, top_n=5):
        """Print top N results"""
        if not self.results:
            print("No results to display. Run grid_search first.")
            return
        
        if self.use_cv and 'cv_mean_val_acc' in self.results[0]:
            sorted_results = sorted(self.results, key=lambda x: x['cv_mean_val_acc'], reverse=True)
            metric_name = 'CV Val Acc'
        else:
            sorted_results = sorted(self.results, key=lambda x: x['test_acc'], reverse=True)
            metric_name = 'Test Acc'
        
        print("\n" + "="*80)
        print(f"TOP {top_n} CONFIGURATIONS (by {metric_name})")
        print("="*80)
        
        for i, result in enumerate(sorted_results[:top_n], 1):
            print(f"\nRank {i}:")
            print(f"  Architecture: {result['hidden_layers']}")
            print(f"  Learning Rate: {result['learning_rate']}")
            print(f"  Batch Size: {result['batch_size']}")
            print(f"  Dropout: {result['dropout_rate']}")
            print(f"  Weight Decay: {result['weight_decay']}")
            
            if self.use_cv and 'cv_mean_val_acc' in result:
                print(f"  CV Val Acc: {result['cv_mean_val_acc']:.4f} ± {result['cv_std_val_acc']:.4f}")
                print(f"  Fold Accuracies: {[f'{acc:.4f}' for acc in result['cv_fold_accs']]}")
            
            print(f"  Test Accuracy: {result['test_acc']:.4f}")
            print(f"  Train Accuracy: {result['train_acc']:.4f}")
            print(f"  Overfit Gap: {result['overfit_gap']:.4f}")

In [5]:
# Load data 
df = pd.read_csv("/Users/lawsonlevin/ML/project1/data/project_adult.csv")

# Preprocess data
X_train, X_test, y_train, y_test, preprocessor, label_encoder = preprocess_data(df, return_preprocessor=True)

# ============================================================
# Grid Search with 5-Fold Cross-Validation and Early Stopping
# ============================================================
tuner_cv = MLPTuner(X_train, X_test, y_train, y_test, use_cv=True, n_folds=5)

# Define parameter grid
param_grid = {
    'hidden_layers': [[128], [128, 64], [256, 128]],
    'learning_rate': [0.001, 0.01],
    'batch_size': [32, 64],
    'dropout_rate': [0.0, 0.2],
    'weight_decay': [0, 1e-5]
}

# Run grid search with early stopping
best_params = tuner_cv.grid_search(
    param_grid, 
    epochs=50,       # Maximum epochs
    patience=3,      # Stop if no improvement for 3 epochs
    min_delta=0.0001 # Minimum improvement threshold
)

# Print top 5 configurations
tuner_cv.print_results(top_n=5)

# ============================================================
# Train Final Model with Best Parameters
# ============================================================
print("\n" + "="*80)
print("TRAINING FINAL MODEL WITH BEST PARAMETERS")
print("="*80)

final_model = tuner_cv.create_model(
    best_params['hidden_layers'], 
    best_params['dropout_rate']
)

train_acc, test_acc = tuner_cv.train_model(
    final_model,
    best_params['learning_rate'],
    best_params['batch_size'],
    epochs=100,      # Train longer for final model
    weight_decay=best_params['weight_decay'],
    patience=10,     # More patience for final model
    min_delta=0.0001
)

print(f"\nFinal Model Performance:")
print(f"Architecture: {best_params['hidden_layers']}")
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Overfit Gap: {train_acc - test_acc:.4f}")

# Save the model
torch.save(final_model.state_dict(), 'best_model.pth')
print("\nModel saved to 'best_model.pth'")

Original dataset shape: (26048, 16)
First 5 rows before transformation:
   Unnamed: 0  age         workclass  fnlwgt     education  education-num  \
0        5514   33         Local-gov  198183     Bachelors             13   
1       19777   36           Private   86459     Assoc-voc             11   
2       10781   58  Self-emp-not-inc  203039           9th              5   
3       32240   21           Private  180190     Assoc-voc             11   
4        9876   27           Private  279872  Some-college             10   

       marital-status       occupation   relationship   race     sex  \
0       Never-married   Prof-specialty  Not-in-family  White  Female   
1  Married-civ-spouse  Exec-managerial        Husband  White    Male   
2           Separated     Craft-repair  Not-in-family  White    Male   
3  Married-civ-spouse  Farming-fishing        Husband  White    Male   
4            Divorced    Other-service  Not-in-family  White    Male   

   capital-gain  capital-loss  h

In [7]:
# ============================================================
# LOAD MODEL AND MAKE PREDICTIONS
# ============================================================

# 1. Paste hyperparameters here:
HIDDEN_LAYERS = [128]      
DROPOUT_RATE = 0.0              
INPUT_SIZE = 116              
OUTPUT_SIZE = 2                 

# 2. Recreate model architecture
layers = []
prev_size = INPUT_SIZE
for hidden_size in HIDDEN_LAYERS:
    layers.append(nn.Linear(prev_size, hidden_size))
    layers.append(nn.ReLU())
    if DROPOUT_RATE > 0:
        layers.append(nn.Dropout(DROPOUT_RATE))
    prev_size = hidden_size
layers.append(nn.Linear(prev_size, OUTPUT_SIZE))

model = nn.Sequential(*layers)

# 3. Load weights
model.load_state_dict(torch.load('best_model.pth'))
model.eval()

print("✅ Model loaded!")

# 4. Load and preprocess new data
new_data = pd.read_csv("project1/data/project_validation_inputs.csv")

X_new = preprocess_data(new_data, has_target=False, preprocessor=preprocessor)

# 5. Make predictions
with torch.no_grad():
    outputs = model(X_new)
    probabilities = torch.softmax(outputs, dim=1)
    predicted_classes = torch.argmax(outputs, dim=1).numpy()

# 6. Create results
results = pd.DataFrame({
    'predicted_class': predicted_classes,
    'confidence': probabilities.max(dim=1)[0].numpy(),
    'prob_class_0': probabilities[:, 0].numpy(),
    'prob_class_1': probabilities[:, 1].numpy()
})

# Combine with original data
results = pd.concat([new_data.reset_index(drop=True), results], axis=1)

# Save
results.to_csv('predictions.csv', index=False)

print(f"✅ Made {len(predicted_classes)} predictions!")
print(f"   Saved to: predictions.csv")
print(f"\nFirst 5 predictions:")
print(results.head())

# ============================================================
# 7. CREATE SUBMISSION FILE (Required Format)
# ============================================================

# Decode predictions to labels
predicted_labels = label_encoder.inverse_transform(predicted_classes)

# Transform to required format: 1 if '>50K' else -1
transformed_predictions = [1 if x == '>50K' else -1 for x in predicted_labels]

# Create submission dataframe
submission = pd.DataFrame({
    'prediction': transformed_predictions
})

# Save with required filename
submission.to_csv('Group_24_MLP_PredictedOutputs.csv', index=False)

print(f"\n{'='*60}")
print("✅ SUBMISSION FILE CREATED!")
print(f"{'='*60}")
print(f"Filename: Group_24_MLP_PredictedOutputs.csv")
print(f"Total predictions: {len(transformed_predictions)}")
print(f"Predicted >50K (1): {transformed_predictions.count(1)}")
print(f"Predicted <=50K (-1): {transformed_predictions.count(-1)}")
print(f"\nFirst 10 predictions:")
print(submission.head(10))

RuntimeError: Error(s) in loading state_dict for Sequential:
	size mismatch for 0.weight: copying a param with shape torch.Size([128, 118]) from checkpoint, the shape in current model is torch.Size([128, 116]).