In [1]:
import numpy as np
import pandas as pd

# Set global random state for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Load dataset
file_path = "STTHK3013_pilot_performance_simulation_data.xlsx"
df = pd.read_excel(file_path, sheet_name='Sheet1')

In [2]:
# Handle missing values using median imputation
df.fillna(df.median(), inplace=True)

# Feature Engineering: Add Interaction and Polynomial Features
df['reaction_stress_interaction'] = df['time_reaction'] * df['environmental_stressors']
df['fatigue_mission_ratio'] = df['fatigue_level'] / (df['mission_complexity'] + 1)
df['heart_rate_squared'] = df['heart_rate'] ** 2

# Reclassify 'final_performance' into three categories
def classify_performance(value):
    if value in [0, 1]:
        return 0  # Basic
    elif value in [2, 3]:
        return 1  # Skilled
    else:
        return 2  # Expert

df['final_performance'] = df['final_performance'].apply(classify_performance)



In [4]:
# Use the cleaned data (data_cleaned) for Chi-Square test
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from scipy import stats

X = df.drop(columns=['final_performance'])  # Correct column name
y = df['final_performance']  # Correct column name

# Remove Outliers using Z-score
z_scores = np.abs(stats.zscore(X))
X_filtered = X[(z_scores < 3).all(axis=1)]
y_filtered = y[X_filtered.index]

# Scale the feature data to be non-negative (using MinMaxScaler)
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_filtered)

# Apply Chi-Square Test for feature selection
chi2_selector = SelectKBest(chi2, k='all')  # Select all features
X_chi2 = chi2_selector.fit_transform(X_scaled, y_filtered)

# Get the Chi-Square p-values for each feature
p_values = chi2_selector.pvalues_

# Create a DataFrame to sort the features by p-values
p_values_df = pd.DataFrame({
    'Feature': X.columns,
    'Chi-Square p-value': p_values
})

# Sort the features by p-value (ascending order)
p_values_df_sorted = p_values_df.sort_values(by='Chi-Square p-value', ascending=False)

# Print the sorted features by their Chi-Square p-value
print("\nFeatures sorted by Chi-Square p-values :\n")
print(p_values_df_sorted)

# Display the selected features based on Chi-Square test
selected_features = pd.Series(X.columns[chi2_selector.get_support()]).head(6)
print(f"\nSelected features based on Chi-Square test:\n {selected_features} ")


Features sorted by Chi-Square p-values :

                        Feature  Chi-Square p-value
7                  stress_level            0.997657
1                 sleep_quality            0.995977
10        fatigue_mission_ratio            0.991588
4       environmental_stressors            0.958933
6                 fatigue_level            0.946188
9   reaction_stress_interaction            0.928008
0                    heart_rate            0.923088
11           heart_rate_squared            0.922239
5               cognitive_level            0.917228
8                 time_reaction            0.899806
3              experience_level            0.842575
2            mission_complexity            0.837363

Selected features based on Chi-Square test:
 0                 heart_rate
1              sleep_quality
2         mission_complexity
3           experience_level
4    environmental_stressors
5            cognitive_level
dtype: object 


In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter
import pandas as pd

# Subset the dataset with the top 5 features and the numerical target
selected_features = [            
              'heart_rate',
              'sleep_quality',
              'mission_complexity',
              'experience_level',
              'environmental_stressors',
              'cognitive_level',
]
X = df[selected_features]
y = df['final_performance']  # Numerical target

# Display original class distribution
print(f"Original class distribution: {Counter(y)}")

# Calculate the desired increase in data size (20% increase)
original_size = len(X)
desired_size = int(original_size * 1.177)  # 20% increase

# Calculate the sampling strategy for SMOTE
majority_class = max(Counter(y).values())
desired_samples_per_class = int(desired_size / len(Counter(y)))
sampling_strategy = {cls: max(min(desired_samples_per_class, majority_class), Counter(y)[cls]) for cls in Counter(y).keys()}

# Apply SMOTE with the calculated sampling strategy
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Display class distribution after SMOTE
print(f"Class distribution after SMOTE: {Counter(y_resampled)}")

# Display original and resampled data sizes
print(f"Original data size: {len(X)}")
print(f"Resampled data size: {len(X_resampled)}")
print(f"Percentage increase in data size: {((len(X_resampled) - len(X)) / len(X)) * 100:.2f}%")


Original class distribution: Counter({2: 358, 1: 324, 0: 297})
Class distribution after SMOTE: Counter({1: 358, 2: 358, 0: 358})
Original data size: 979
Resampled data size: 1074
Percentage increase in data size: 9.70%


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Define DNN model creation function
def create_dnn_model(optimizer='adam', activation='relu', neurons=32):
    model = Sequential([
        Dense(neurons, input_dim=X_resampled.shape[1], activation=activation),  # First hidden layer
        Dense(neurons // 2, activation=activation),  # Second hidden layer
        Dense(len(np.unique(y_resampled)), activation='softmax')  # Output layer
    ])
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Wrap DNN model with KerasClassifier
dnn_model = KerasClassifier(build_fn=create_dnn_model, verbose=0)

# Define parameter grid for DNN model
dnn_param_grid = {
    'batch_size': [16, 32, 64],
    'epochs': [50, 100],
    'optimizer': ['adam', 'sgd'],
    'activation': ['relu', 'tanh'],
    'neurons': [16, 32, 64]
}

# Define K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize scalers and result container
scaler = StandardScaler()
dnn_accuracies = []

# Loop through each fold for DNN
for fold, (train_index, test_index) in enumerate(kf.split(X_resampled, y_resampled)):
    print(f"\nFold {fold + 1}")

    # Reset index of X_resampled to ensure indices match
    X_resampled_reset = X_resampled.reset_index(drop=True)

    # Split the data into training and testing sets
    X_train, X_test = X_resampled_reset.iloc[train_index], X_resampled_reset.iloc[test_index]
    y_train, y_test = y_resampled[train_index], y_resampled[test_index]
    
    # Scale the features
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the DNN model using GridSearchCV
    dnn_grid_search = GridSearchCV(estimator=dnn_model, param_grid=dnn_param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=1)
    dnn_grid_search.fit(X_train_scaled, y_train)

    # Get the best DNN model and evaluate it
    best_dnn_model = dnn_grid_search.best_estimator_
    y_pred_dnn = best_dnn_model.predict(X_test_scaled)
    dnn_accuracy = accuracy_score(y_test, y_pred_dnn)
    print(f"DNN Accuracy for Fold {fold + 1}: {dnn_accuracy:.2f}")
    dnn_accuracies.append(dnn_accuracy)

# Calculate and display the mean accuracy for DNN
mean_dnn_accuracy = np.mean(dnn_accuracies)
print(f"\nMean DNN Accuracy across all folds: {mean_dnn_accuracy:.2f}")


  dnn_model = KerasClassifier(build_fn=create_dnn_model, verbose=0)



Fold 1
Fitting 3 folds for each of 72 candidates, totalling 216 fits
DNN Accuracy for Fold 1: 0.35

Fold 2
Fitting 3 folds for each of 72 candidates, totalling 216 fits
DNN Accuracy for Fold 2: 0.40

Fold 3
Fitting 3 folds for each of 72 candidates, totalling 216 fits
DNN Accuracy for Fold 3: 0.36

Fold 4
Fitting 3 folds for each of 72 candidates, totalling 216 fits
DNN Accuracy for Fold 4: 0.38

Fold 5
Fitting 3 folds for each of 72 candidates, totalling 216 fits
DNN Accuracy for Fold 5: 0.37

Mean DNN Accuracy across all folds: 0.37


In [None]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# Initialize scalers and result container
scaler = StandardScaler()
xgb_accuracies = []

# Define the XGBoost model
xgb_model = XGBClassifier(random_state=RANDOM_STATE)

# Convert X_resampled to DataFrame to use reset_index
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)

# Loop through each fold for XGBoost
for fold, (train_index, test_index) in enumerate(kf.split(X_resampled_df, y_resampled)):
    print(f"\nFold {fold + 1}")

    # Reset index of X_resampled to ensure indices match
    X_resampled_reset = X_resampled_df.reset_index(drop=True)

    # Split the data into training and testing sets
    X_train, X_test = X_resampled_reset.iloc[train_index], X_resampled_reset.iloc[test_index]
    y_train, y_test = y_resampled[train_index], y_resampled[test_index]
    
    # Scale the features
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the XGBoost model
    xgb_model.fit(X_train_scaled, y_train)
    
    # Evaluate the model
    y_pred_xgb = xgb_model.predict(X_test_scaled)
    xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
    print(f"XGBoost Accuracy for Fold {fold + 1}: {xgb_accuracy:.2f}")
    xgb_accuracies.append(xgb_accuracy)

# Calculate and display the mean accuracy for XGBoost
mean_xgb_accuracy = np.mean(xgb_accuracies)
print(f"\nMean XGBoost Accuracy across all folds: {mean_xgb_accuracy:.2f}")


Fold 1




XGBoost Accuracy for Fold 1: 0.44

Fold 2




XGBoost Accuracy for Fold 2: 0.37

Fold 3




XGBoost Accuracy for Fold 3: 0.35

Fold 4




XGBoost Accuracy for Fold 4: 0.35

Fold 5




XGBoost Accuracy for Fold 5: 0.39

Mean XGBoost Accuracy across all folds: 0.38


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Define Logistic Regression model
log_reg_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=5000)

# Define parameter grid for Logistic Regression
log_reg_param_grid = {
    'C': [0.1, 1, 10],                # Regularization strength
    'solver': ['lbfgs', 'newton-cg'], # Solvers to try
}

# Define K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize scalers and result container
scaler = StandardScaler()
log_reg_accuracies = []

# Loop through each fold for Logistic Regression
for fold, (train_index, test_index) in enumerate(kf.split(X_resampled, y_resampled)):
    print(f"\nFold {fold + 1}")

    # Reset index of X_resampled to ensure indices match
    X_resampled_reset = X_resampled.reset_index(drop=True)

    # Split the data into training and testing sets
    X_train, X_test = X_resampled_reset.iloc[train_index], X_resampled_reset.iloc[test_index]
    y_train, y_test = y_resampled[train_index], y_resampled[test_index]
    
    # Scale the features
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the Logistic Regression model using GridSearchCV
    log_reg_grid_search = GridSearchCV(estimator=log_reg_model, param_grid=log_reg_param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=1)
    log_reg_grid_search.fit(X_train_scaled, y_train)

    # Get the best Logistic Regression model and evaluate it
    best_log_reg_model = log_reg_grid_search.best_estimator_
    y_pred_log_reg = best_log_reg_model.predict(X_test_scaled)
    log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)
    print(f"Logistic Regression Accuracy for Fold {fold + 1}: {log_reg_accuracy:.2f}")
    log_reg_accuracies.append(log_reg_accuracy)

# Calculate and display the mean accuracy for Logistic Regression
mean_log_reg_accuracy = np.mean(log_reg_accuracies)
print(f"\nMean Logistic Regression Accuracy across all folds: {mean_log_reg_accuracy:.2f}")



Fold 1
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Logistic Regression Accuracy for Fold 1: 0.33

Fold 2
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Logistic Regression Accuracy for Fold 2: 0.35

Fold 3
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Logistic Regression Accuracy for Fold 3: 0.34

Fold 4
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Logistic Regression Accuracy for Fold 4: 0.33

Fold 5
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Logistic Regression Accuracy for Fold 5: 0.34

Mean Logistic Regression Accuracy across all folds: 0.34
