In [1]:
import numpy as np
import pandas as pd

# Set global random state for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Load dataset
file_path = "STTHK3013_pilot_performance_simulation_data.xlsx"
df = pd.read_excel(file_path, sheet_name='Sheet1')

In [2]:
# Handle missing values using median imputation
df.fillna(df.median(), inplace=True)

# Feature Engineering: Add Interaction and Polynomial Features
df['reaction_stress_interaction'] = df['time_reaction'] * df['environmental_stressors']
df['fatigue_mission_ratio'] = df['fatigue_level'] / (df['mission_complexity'] + 1)
df['heart_rate_squared'] = df['heart_rate'] ** 2

# Reclassify 'final_performance' into three categories
def classify_performance(value):
    if value in [0, 1]:
        return 0  # Basic
    elif value in [2, 3]:
        return 1  # Skilled
    else:
        return 2  # Expert

df['final_performance'] = df['final_performance'].apply(classify_performance)



In [4]:
# Use the cleaned data (data_cleaned) for Chi-Square test
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from scipy import stats

X = df.drop(columns=['final_performance'])  # Correct column name
y = df['final_performance']  # Correct column name

# Remove Outliers using Z-score
z_scores = np.abs(stats.zscore(X))
X_filtered = X[(z_scores < 3).all(axis=1)]
y_filtered = y[X_filtered.index]

# Scale the feature data to be non-negative (using MinMaxScaler)
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_filtered)

# Apply Chi-Square Test for feature selection
chi2_selector = SelectKBest(chi2, k='all')  # Select all features
X_chi2 = chi2_selector.fit_transform(X_scaled, y_filtered)

# Get the Chi-Square p-values for each feature
p_values = chi2_selector.pvalues_

# Create a DataFrame to sort the features by p-values
p_values_df = pd.DataFrame({
    'Feature': X.columns,
    'Chi-Square p-value': p_values
})

# Sort the features by p-value (ascending order)
p_values_df_sorted = p_values_df.sort_values(by='Chi-Square p-value', ascending=False)

# Print the sorted features by their Chi-Square p-value
print("\nFeatures sorted by Chi-Square p-values :\n")
print(p_values_df_sorted)

# Display the selected features based on Chi-Square test
selected_features = pd.Series(X.columns[chi2_selector.get_support()]).head(6)
print(f"\nSelected features based on Chi-Square test:\n {selected_features} ")


Features sorted by Chi-Square p-values :

                        Feature  Chi-Square p-value
7                  stress_level            0.997657
1                 sleep_quality            0.995977
10        fatigue_mission_ratio            0.991588
4       environmental_stressors            0.958933
6                 fatigue_level            0.946188
9   reaction_stress_interaction            0.928008
0                    heart_rate            0.923088
11           heart_rate_squared            0.922239
5               cognitive_level            0.917228
8                 time_reaction            0.899806
3              experience_level            0.842575
2            mission_complexity            0.837363

Selected features based on Chi-Square test:
 0                 heart_rate
1              sleep_quality
2         mission_complexity
3           experience_level
4    environmental_stressors
5            cognitive_level
dtype: object 


In [5]:
from imblearn.over_sampling import SMOTE
from collections import Counter
import pandas as pd

# Subset the dataset with the top 5 features and the numerical target
selected_features = [            
              'heart_rate',
              'sleep_quality',
              'mission_complexity',
              'experience_level',
              'environmental_stressors',
              'cognitive_level',
]
X = df[selected_features]
y = df['final_performance']  # Numerical target

# Display original class distribution
print(f"Original class distribution: {Counter(y)}")

# Calculate the desired increase in data size (20% increase)
original_size = len(X)
desired_size = int(original_size * 1.177)  # 20% increase

# Calculate the sampling strategy for SMOTE
majority_class = max(Counter(y).values())
desired_samples_per_class = int(desired_size / len(Counter(y)))
sampling_strategy = {cls: max(min(desired_samples_per_class, majority_class), Counter(y)[cls]) for cls in Counter(y).keys()}

# Apply SMOTE with the calculated sampling strategy
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Display class distribution after SMOTE
print(f"Class distribution after SMOTE: {Counter(y_resampled)}")

# Display original and resampled data sizes
print(f"Original data size: {len(X)}")
print(f"Resampled data size: {len(X_resampled)}")
print(f"Percentage increase in data size: {((len(X_resampled) - len(X)) / len(X)) * 100:.2f}%")


Original class distribution: Counter({2: 358, 1: 324, 0: 297})
Class distribution after SMOTE: Counter({1: 358, 2: 358, 0: 358})
Original data size: 979
Resampled data size: 1074
Percentage increase in data size: 9.70%


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 4: Train and Optimize Models with Stratified K-Fold CV
param_grids = {
    "Logistic Regression": {
        'C': [0.1, 1, 10],
        'solver': ['lbfgs', 'liblinear']
    },
    "Deep Neural Network (MLP)": {
        'hidden_layer_sizes': [(256, 128, 64), (512, 256, 128)],
        'activation': ['relu', 'tanh'],
        'learning_rate_init': [0.001, 0.005]
    },
    "XGBoost": {
        'n_estimators': [300, 400, 500],
        'max_depth': [6, 9, 12],
        'learning_rate': [0.01, 0.05, 0.1]
    }
}

models = {
    "Logistic Regression": LogisticRegression(random_state=RANDOM_STATE),
    "Deep Neural Network (MLP)": MLPClassifier(random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(random_state=RANDOM_STATE)
}

best_models = {}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

for name, param_grid in param_grids.items():
    print(f"Optimizing {name}...")
    grid_search = GridSearchCV(models[name], param_grid, cv=skf, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_resampled, y_resampled)
    
    best_models[name] = grid_search.best_estimator_
    print(f"Best Params for {name}: {grid_search.best_params_}")

# Split dataset for final testing
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=RANDOM_STATE, stratify=y_resampled)

# Standardize features using RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Train & Compare All Optimized Models
final_results = {}
for name, model in best_models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, digits=2)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    final_results[name] = {"accuracy": accuracy, "report": report, "confusion_matrix": conf_matrix}

    # Display results
    print(f"===== {name} =====")
    print("Accuracy:", accuracy)
    print("\nClassification Report:\n", report)
    print("\nConfusion Matrix:\n", conf_matrix, "\n")

# Identify the Best Performing Model
best_model_name = max(final_results, key=lambda x: final_results[x]['accuracy'])
best_model_params = best_models[best_model_name].get_params()

print(f"\nBest Model: {best_model_name} with Accuracy: {final_results[best_model_name]['accuracy']:.4f}")
print(f"Best Model Parameters: {best_model_params}")


  from pandas import MultiIndex, Int64Index


Optimizing Logistic Regression...
Best Params for Logistic Regression: {'C': 0.1, 'solver': 'liblinear'}
Optimizing Deep Neural Network (MLP)...
Best Params for Deep Neural Network (MLP): {'activation': 'tanh', 'hidden_layer_sizes': (512, 256, 128), 'learning_rate_init': 0.001}
Optimizing XGBoost...


  from pandas import MultiIndex, Int64Index


Best Params for XGBoost: {'learning_rate': 0.05, 'max_depth': 9, 'n_estimators': 500}
===== Logistic Regression =====
Accuracy: 0.37209302325581395

Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.38      0.39        71
           1       0.44      0.39      0.41        72
           2       0.30      0.35      0.32        72

    accuracy                           0.37       215
   macro avg       0.38      0.37      0.37       215
weighted avg       0.38      0.37      0.37       215


Confusion Matrix:
 [[27 12 32]
 [19 28 25]
 [23 24 25]] 





===== Deep Neural Network (MLP) =====
Accuracy: 0.4558139534883721

Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.52      0.51        71
           1       0.42      0.44      0.43        72
           2       0.45      0.40      0.42        72

    accuracy                           0.46       215
   macro avg       0.46      0.46      0.46       215
weighted avg       0.46      0.46      0.46       215


Confusion Matrix:
 [[37 18 16]
 [20 32 20]
 [16 27 29]] 

===== XGBoost =====
Accuracy: 0.3767441860465116

Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.52      0.48        71
           1       0.29      0.32      0.30        72
           2       0.39      0.29      0.33        72

    accuracy                           0.38       215
   macro avg       0.38      0.38      0.37       215
weighted avg       0.38      0.38      0.37       215


Confusion Ma