ANOVA

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

RANDOM_STATE = 42
AUGMENTATION_FACTOR = 3

# --------------------------------------
# 1) Load & Basic Cleaning
# --------------------------------------
df = pd.read_csv("survey_data_cleaned.csv")
drop_cols = ['Timestamp', 'Date_Of_Birth', 'Age']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

target_col = "Career_Interest"
X = df.drop(columns=[target_col])
y = df[target_col].astype(str)

print("[INFO] Original shape:", df.shape)
print("[INFO] Original class counts:\n", y.value_counts(), "\n")

# --------------------------------------
# 2) Feature Type Detection
# --------------------------------------
numeric_continuous = ['Math','English','Bio','Chemistry','Physics','ICT',
                      'Business','Average_Score','IELTS_Score']

ordinal_features = ['Parents_Education_Encoded','Family_Lifestyle_Encoded']

binary_features = [
    'Building miniatures / models','DIY projects','Debate / Public Speaking',
    'Hackathons / App development projects','Programming / coding clubs',
    'Science clubs','Student Council / Leadership Roles',
    'Volunteering at hospitals, clinics, or NGOs','DIY / Project recognition',
    'Debate / Public Speaking awards','Hackathon / App competition prize',
    'Model building recognition','Other (please specify)',
    'Programming / Coding award','Science fair / Olympiad prize',
    'Student Council / Leadership role','Volunteering recognition'
]

categorical_object = [
    'Gender','Age_Group','Aptitude','if_HS_Student','Study_Method','Study_Habit',
    'English_Proficiency','Favorite_Subject','Personality_Trait','Address',
    'Study_Country','Chosen_University','Influence'
]

print("[INFO] Continuous numeric:", numeric_continuous)
print("[INFO] Ordinal:", ordinal_features)
print("[INFO] Binary:", binary_features)
print("[INFO] Categorical:", categorical_object, "\n")

# --------------------------------------
# 3) Encode categorical (object only)
# --------------------------------------
X_enc = X.copy()
encoders = {}
for col in categorical_object:
    le = LabelEncoder()
    X_enc[col] = le.fit_transform(X_enc[col].astype(str))
    encoders[col] = le

# (binary + ordinal already numeric — keep them as-is)

# Encode target
le_target = LabelEncoder()
y_enc = le_target.fit_transform(y)
X_enc[target_col] = y_enc

# --------------------------------------
# 4) Gaussian Noise ONLY on continuous features
# --------------------------------------
def augment_numeric_only(df_original, factor, target_col, noise_std=0.05):
    df_list = [df_original.copy()]
    for _ in range(factor-1):
        df_new = df_original.copy()
        for col in numeric_continuous:
            std = df_original[col].std()
            noise = np.random.normal(0, noise_std * std, size=len(df_original))
            df_new[col] = np.maximum(df_new[col] + noise, 0)
        df_list.append(df_new)
    return pd.concat(df_list, ignore_index=True)

df_augmented = augment_numeric_only(X_enc, AUGMENTATION_FACTOR, target_col)
print("[INFO] Augmented dataset size:", len(df_augmented), "\n")

X_aug = df_augmented.drop(columns=[target_col])
y_aug = df_augmented[target_col]

# --------------------------------------
# 5) Split (Train/Valid/Test)
# --------------------------------------
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X_aug, y_aug, test_size=0.2, stratify=y_aug, random_state=RANDOM_STATE
)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_temp, y_train_temp, test_size=0.25, stratify=y_train_temp, random_state=RANDOM_STATE
)

print("[INFO] Train/Valid/Test sizes:", len(X_train), len(X_valid), len(X_test))

# --------------------------------------
# 6) SMOTE on train only
# --------------------------------------
sm = SMOTE(random_state=RANDOM_STATE)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print("[INFO] After SMOTE:", pd.Series(y_train_sm).value_counts(), "\n")

# --------------------------------------
# 7) Scaling
# --------------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sm)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# --------------------------------------
# 8) ANOVA (Feature Selection)
# --------------------------------------
F_vals, p_vals = f_classif(X_train_scaled, y_train_sm)
anova_df = pd.DataFrame({'Feature': X_train.columns, 'F_value': F_vals, 'p_value': p_vals})

selected_features = anova_df[anova_df['p_value'] < 0.05]['Feature'].tolist()
if len(selected_features) == 0:
    selected_features = X_train.columns.tolist()

print("[INFO] ANOVA selected:", selected_features, "\n")
print("[INFO] Number selected:", len(selected_features), "\n")

X_train_sel = pd.DataFrame(X_train_scaled, columns=X_train.columns)[selected_features].values
X_valid_sel = pd.DataFrame(X_valid_scaled, columns=X_valid.columns)[selected_features].values
X_test_sel = pd.DataFrame(X_test_scaled, columns=X_test.columns)[selected_features].values

# --------------------------------------
# 9) Models
# --------------------------------------
models = {
    'RF': RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE),
    'XGB': XGBClassifier(eval_metric='mlogloss', random_state=RANDOM_STATE),
    'MLP': MLPClassifier(hidden_layer_sizes=(128,64), max_iter=300, random_state=RANDOM_STATE),
    'LR': LogisticRegression(max_iter=500),
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
    'MLR': LinearRegression()  # Multiclass regression baseline
}

trained = {}
for name, mdl in models.items():
    mdl.fit(X_train_sel, y_train_sm)
    trained[name] = mdl

ensemble = VotingClassifier(
    estimators=[('RF', trained['RF']), ('XGB', trained['XGB'])],
    voting='soft'
)
ensemble.fit(X_train_sel, y_train_sm)
trained['ENSEMBLE'] = ensemble

# --------------------------------------
# 10) Evaluation
# --------------------------------------
def predict_mdl(name, model, X):
    if name == 'MLR':
        y_hat = np.rint(model.predict(X)).astype(int)
        return np.clip(y_hat, y_enc.min(), y_enc.max())
    return model.predict(X)

print("\n--- ACCURACY (Train / Valid / Test) ---")
for name, mdl in trained.items():
    print(
        name,
        "| Train:", accuracy_score(y_train_sm, predict_mdl(name, mdl, X_train_sel)),
        "| Valid:", accuracy_score(y_valid, predict_mdl(name, mdl, X_valid_sel)),
        "| Test:", accuracy_score(y_test, predict_mdl(name, mdl, X_test_sel))
    )

print("\n--- TEST SET CLASSIFICATION REPORTS ---")
for name, mdl in trained.items():
    print("\n======", name, "======")
    y_pred = predict_mdl(name, mdl, X_test_sel)
    print(classification_report(y_test, y_pred, target_names=le_target.classes_))


[INFO] Original shape: (209, 42)
[INFO] Original class counts:
 Career_Interest
1    64
0    53
3    51
2    41
Name: count, dtype: int64 

[INFO] Continuous numeric: ['Math', 'English', 'Bio', 'Chemistry', 'Physics', 'ICT', 'Business', 'Average_Score', 'IELTS_Score']
[INFO] Ordinal: ['Parents_Education_Encoded', 'Family_Lifestyle_Encoded']
[INFO] Binary: ['Building miniatures / models', 'DIY projects', 'Debate / Public Speaking', 'Hackathons / App development projects', 'Programming / coding clubs', 'Science clubs', 'Student Council / Leadership Roles', 'Volunteering at hospitals, clinics, or NGOs', 'DIY / Project recognition', 'Debate / Public Speaking awards', 'Hackathon / App competition prize', 'Model building recognition', 'Other (please specify)', 'Programming / Coding award', 'Science fair / Olympiad prize', 'Student Council / Leadership role', 'Volunteering recognition']
[INFO] Categorical: ['Gender', 'Age_Group', 'Aptitude', 'if_HS_Student', 'Study_Method', 'Study_Habit', 'En

MI

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import mutual_info_classif # ADDED for MI
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

RANDOM_STATE = 42
AUGMENTATION_FACTOR = 3
NOISE_STD_DEV = 0.1
TOP_K_FEATURES = 30 # Defined a value for MI selection

# ----------------------------
# 1) Load & clean
# ----------------------------
df = pd.read_csv("survey_data_cleaned.csv")
drop_cols = ['Timestamp', 'Date_Of_Birth', 'Age']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

target_col = 'Career_Interest'
X = df.drop(columns=[target_col])
y = df[target_col].astype(str)

print("[INFO] Original shape:", df.shape)
print("[INFO] Original class counts:\n", y.value_counts(), "\n")

# ----------------------------
# Detect feature types automatically
# ----------------------------
all_features = X.columns.tolist()

# numeric dtype columns (may include encoded categoricals and binary dummies)
numeric_dtype_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# detect binary numeric columns (only 0/1)
binary_features = []
for c in numeric_dtype_cols:
    vals = X[c].dropna().unique()
    # consider values that are exactly integers 0/1 (allow float 0.0/1.0)
    if set(np.unique(vals)).issubset({0, 1}):
        binary_features.append(c)

# continuous numeric: numeric dtype and reasonably many unique values
continuous_numeric = []
ordinal_encoded = []  # numeric but small set (e.g., 0/1/2)
for c in numeric_dtype_cols:
    if c in binary_features:
        continue
    nun = X[c].nunique(dropna=True)
    if nun > 5:
        continuous_numeric.append(c)
    else:
        # small number of unique values (<=5) treated as ordinal/categorical encoded
        ordinal_encoded.append(c)

# object dtype columns: true categorical strings -> we will label encode them
object_categorical = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Remove any overlap
for c in list(object_categorical):
    if c in continuous_numeric: object_categorical.remove(c)
for c in list(ordinal_encoded):
    if c in object_categorical: ordinal_encoded.remove(c)

print("[INFO] Detected continuous numeric features:", continuous_numeric)
print("[INFO] Detected ordinal/categorical-as-int features:", ordinal_encoded)
print("[INFO] Detected binary features:", binary_features)
print("[INFO] Detected object categorical features (to encode):", object_categorical, "\n")

# ----------------------------
# 2) Encode categorical features (only object/category ones)
# ----------------------------
X_enc = X.copy()
encoders = {}

# Label-encode object categorical columns (Address, Favorite_Subject, etc.)
for col in object_categorical:
    le = LabelEncoder()
    # convert to str to handle NaNs uniformly
    X_enc[col] = le.fit_transform(X_enc[col].astype(str))
    encoders[col] = le

# Keep ordinal_encoded as-is (they're already numeric, e.g., 0/1/2)
# Keep binary_features as-is (0/1)

# For completeness convert any remaining non-numeric to numeric safe
for col in X_enc.columns:
    if X_enc[col].dtype == 'object' or X_enc[col].dtype.name == 'category':
        # fallback: try numeric conversion
        X_enc[col] = pd.to_numeric(X_enc[col].astype(str).str.replace(',', ''), errors='coerce').fillna(0)

le_target = LabelEncoder()
y_enc = le_target.fit_transform(y)

df_enc = X_enc.copy()
df_enc[target_col] = y_enc

# ----------------------------
# 3) Augmentation (Gaussian Noise) — ONLY on continuous numeric features
# ----------------------------
def create_augmented_data(df_original, factor, target_column, noise_std_dev=0.05):
    """
    Add Gaussian noise only to continuous numeric columns (continuous_numeric).
    Other columns (binary, ordinal, encoded categorical) are left unchanged.
    """
    df_list = [df_original.copy()]
    feature_cols = df_original.columns.drop(target_column).tolist()

    # identify which of feature_cols are continuous numeric in current df
    cont_cols = [c for c in feature_cols if c in continuous_numeric]

    for _ in range(factor - 1):
        df_new = df_original.copy()
        # Add noise only to continuous columns
        for col in cont_cols:
            std = df_new[col].std()
            # if std is NaN or zero, skip adding noise
            if pd.isna(std) or std == 0:
                continue
            noise = np.random.normal(0, noise_std_dev * std, size=len(df_new))
            df_new[col] = df_new[col] + noise
            # optional: keep sensible bounds for scores (0-100) if column looks like a score
            if col in ['Math', 'English', 'Bio', 'Chemistry', 'Physics', 'ICT', 'Business', 'Average_Score', 'IELTS_Score']:
                df_new[col] = df_new[col].clip(lower=0)
        # For non-continuous columns, keep values as-is (no rounding necessity)
        df_list.append(df_new)

    return pd.concat(df_list, ignore_index=True)

df_augmented = create_augmented_data(df_enc, AUGMENTATION_FACTOR, target_col, NOISE_STD_DEV)
print(f"[INFO] Data augmented (GN on continuous numeric only). New total size: {df_augmented.shape[0]} samples.")

X_augmented = df_augmented.drop(columns=[target_col])
y_augmented = df_augmented[target_col]

# ----------------------------
# 4) Train/Validation/Test split (60/20/20)
# ----------------------------
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X_augmented, y_augmented, test_size=0.2, stratify=y_augmented, random_state=RANDOM_STATE
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_temp, y_train_temp, test_size=0.25, stratify=y_train_temp, random_state=RANDOM_STATE
)

print(f"[INFO] Train Size: {len(X_train)} | Validation Size: {len(X_valid)} | Test Size: {len(X_test)}")
print("[INFO] Train class dist BEFORE SMOTE:\n", pd.Series(y_train).value_counts(), "\n")

# ----------------------------
# 5) SMOTE on training data ONLY
# ----------------------------
sm = SMOTE(random_state=RANDOM_STATE)
# SMOTE expects numeric array. All columns are numeric after label encoding; keep as-is.
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print("[INFO] Train class dist AFTER SMOTE:\n", pd.Series(y_train_sm).value_counts(), "\n")

# ----------------------------
# 6) Scaling — ONLY continuous numeric features
# ----------------------------
scaler = StandardScaler()

# Fit scaler on continuous numeric columns of X_train_sm
# If a continuous column is missing after resampling (unlikely), handle gracefully
cont_cols_present = [c for c in continuous_numeric if c in X_train_sm.columns]
if cont_cols_present:
    scaler.fit(X_train_sm[cont_cols_present])
    # transform train, valid, test continuous columns
    X_train_scaled_cont = pd.DataFrame(scaler.transform(X_train_sm[cont_cols_present]), columns=cont_cols_present, index=X_train_sm.index)
    X_valid_scaled_cont = pd.DataFrame(scaler.transform(X_valid[cont_cols_present]), columns=cont_cols_present, index=X_valid.index)
    X_test_scaled_cont = pd.DataFrame(scaler.transform(X_test[cont_cols_present]), columns=cont_cols_present, index=X_test.index)
else:
    # no continuous columns detected, create empty frames
    X_train_scaled_cont = pd.DataFrame(index=X_train_sm.index)
    X_valid_scaled_cont = pd.DataFrame(index=X_valid.index)
    X_test_scaled_cont = pd.DataFrame(index=X_test.index)

# Now build full scaled DataFrames by combining:
# - scaled continuous numeric
# - ordinal_encoded (leave as-is)
# - binary_features (leave as-is)
# - object categorical which were label-encoded earlier (present in X_train_sm)
other_cols = [c for c in X_train_sm.columns if c not in cont_cols_present]

# Convert index alignment and create DataFrames for other cols
X_train_other = X_train_sm[other_cols].reset_index(drop=True)
X_valid_other = X_valid.reset_index(drop=True)[other_cols]
X_test_other = X_test.reset_index(drop=True)[other_cols]

# Reset index for scaled continuous
X_train_scaled_cont = X_train_scaled_cont.reset_index(drop=True)
X_valid_scaled_cont = X_valid_scaled_cont.reset_index(drop=True)
X_test_scaled_cont = X_test_scaled_cont.reset_index(drop=True)

# Concatenate scaled continuous + other (order columns same as original X_train_sm)
X_train_scaled = pd.concat([X_train_scaled_cont, X_train_other], axis=1)[X_train_sm.columns.tolist()]
X_valid_scaled = pd.concat([X_valid_scaled_cont, X_valid_other], axis=1)[X_valid.columns.tolist()]
X_test_scaled = pd.concat([X_test_scaled_cont, X_test_other], axis=1)[X_test.columns.tolist()]

# ----------------------------
# 7) Mutual Information (MI) Feature Selection
# ----------------------------
print("\n>>> [7/9] MUTUAL INFORMATION FEATURE SELECTION...")
# Calculate MI scores on the scaled, SMOTEd training data (all features numeric now)
MI_scores = mutual_info_classif(X_train_scaled.values, y_train_sm, random_state=RANDOM_STATE)
mi_df = pd.DataFrame({'Feature': X_train_scaled.columns, 'MI_Score': MI_scores})

# Select top K features based on MI score
selected_features = mi_df.nlargest(TOP_K_FEATURES, 'MI_Score')['Feature'].tolist()

if len(selected_features) == 0:
    print("[WARNING] MI selected 0 features. Using all features.")
    selected_features = X_train_scaled.columns.tolist()

# Transform all datasets to selected features
X_train_df = pd.DataFrame(X_train_scaled.values, columns=X_train_scaled.columns)
X_valid_df = pd.DataFrame(X_valid_scaled.values, columns=X_valid.columns)
X_test_df = pd.DataFrame(X_test_scaled.values, columns=X_test.columns)

X_train_sel = X_train_df[selected_features].values
X_valid_sel = X_valid_df[selected_features].values
X_test_sel = X_test_df[selected_features].values

print(f"[INFO] MI selected {len(selected_features)} features: {selected_features}")
print("[INFO] Numbers of MI selected features:", len(selected_features), "\n")

# ----------------------------
# 8) Train models
# -------------------
models = {
    'RF': RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE),
    'XGB': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM_STATE),
    'MLP': MLPClassifier(hidden_layer_sizes=(128,64), max_iter=300, random_state=RANDOM_STATE),
    'LR': LogisticRegression(max_iter=500, random_state=RANDOM_STATE),
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
    'MLR': LinearRegression()
}

trained = {}
for name, mdl in models.items():
    print(f"[INFO] Training {name} ...")
    mdl.fit(X_train_sel, y_train_sm)
    trained[name] = mdl

ensemble = VotingClassifier(
    estimators=[('RF', trained['RF']), ('XGB', trained['XGB'])],
    voting='soft'
)
ensemble.fit(X_train_sel, y_train_sm)
trained['ENSEMBLE'] = ensemble

# ----------------------------
# 9) Predict & evaluate
# ----------------------------
def predict_model(name, model, X):
    if name == 'MLR':
        y_hat = np.rint(model.predict(X)).astype(int)
        y_hat = np.clip(y_hat, y_enc.min(), y_enc.max())
        return y_hat
    return model.predict(X)

print("\n" + "="*60)
print(">>> EVALUATION: TRAINING, VALIDATION & TEST SETS <<<")
print("="*60)

# 9.1 Accuracy Comparison
print("\n--- ACCURACY COMPARISON with MI(Training vs. Validation vs. Test) ---")
for name, mdl in trained.items():
    y_pred_train = predict_model(name, mdl, X_train_sel)
    acc_train = accuracy_score(y_train_sm, y_pred_train)

    y_pred_valid = predict_model(name, mdl, X_valid_sel)
    acc_valid = accuracy_score(y_valid, y_pred_valid)

    y_pred_test = predict_model(name, mdl, X_test_sel)
    acc_test = accuracy_score(y_test, y_pred_test)

    print(f"   {name} Training: {acc_train:.4f} | Validation: {acc_valid:.4f} | Test: {acc_test:.4f}")

# 9.2 Full Classification Report for Test Set
print("\n--- FINAL TEST SET RESULTS (Unseen Data) ---")
for name, mdl in trained.items():
    y_pred = predict_model(name, mdl, X_test_sel)
    print(f"\n===== {name} =====")
    try:
        print(classification_report(y_test, y_pred, target_names=le_target.classes_))
    except ValueError:
        print(f"Classification Report failed for {name} due to class mismatch. Showing only Accuracy.")
    print("Test Accuracy:", accuracy_score(y_test, y_pred))


[INFO] Original shape: (209, 42)
[INFO] Original class counts:
 Career_Interest
1    64
0    53
3    51
2    41
Name: count, dtype: int64 

[INFO] Detected continuous numeric features: ['Math', 'English', 'Bio', 'Chemistry', 'Physics', 'ICT', 'Business', 'Average_Score', 'IELTS_Score']
[INFO] Detected ordinal/categorical-as-int features: ['Family_Lifestyle_Encoded']
[INFO] Detected binary features: ['Parents_Education_Encoded', 'Building miniatures / models', 'DIY projects', 'Debate / Public Speaking', 'Hackathons / App development projects', 'Programming / coding clubs', 'Science clubs', 'Student Council / Leadership Roles', 'Volunteering at hospitals, clinics, or NGOs', 'DIY / Project recognition', 'Debate / Public Speaking awards', 'Hackathon / App competition prize', 'Model building recognition', 'Other (please specify)', 'Programming / Coding award', 'Science fair / Olympiad prize', 'Student Council / Leadership role', 'Volunteering recognition']
[INFO] Detected object categorical

CV

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Import both feature selection methods
from sklearn.feature_selection import f_classif, mutual_info_classif
# Import for K-Fold splitting
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

RANDOM_STATE = 42
AUGMENTATION_FACTOR = 3
NOISE_STD_DEV = 0.1
TOP_K_FEATURES = 30
N_SPLITS = 5 # K for K-Fold Cross-Validation

# ----------------------------
# 1) Load & clean
# ----------------------------
df = pd.read_csv("survey_data_cleaned.csv")
drop_cols = ['Timestamp', 'Date_Of_Birth', 'Age']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

target_col = 'Career_Interest'
X = df.drop(columns=[target_col])
y = df[target_col].astype(str)

# ----------------------------
# 2) Encode categorical features
# ----------------------------
X_enc = X.copy()
for col in X_enc.columns:
    if X_enc[col].dtype == 'object' or X_enc[col].dtype.name == 'category':
        le = LabelEncoder()
        X_enc[col] = le.fit_transform(X_enc[col].astype(str))

le_target = LabelEncoder()
y_enc = le_target.fit_transform(y)

df_enc = X_enc.copy()
df_enc[target_col] = y_enc

# ----------------------------
# 3) Augmentation (Gaussian Noise)
# ----------------------------
def create_augmented_data(df_original, factor, target_column, noise_std_dev=0.05):
    df_list = [df_original.copy()]
    feature_cols = df_original.columns.drop(target_column).tolist()

    for _ in range(factor - 1):
        df_new = df_original.copy()
        for col in feature_cols:
            noise = np.random.normal(0, noise_std_dev * df_new[col].std(), size=len(df_new))
            df_new[col] = np.maximum(df_new[col] + noise, 0)
        df_list.append(df_new)

    return pd.concat(df_list, ignore_index=True)

df_augmented = create_augmented_data(df_enc, AUGMENTATION_FACTOR, target_col, NOISE_STD_DEV)
print(f"[INFO] Data augmented (GN). New total size: {df_augmented.shape[0]} samples.")

X_augmented = df_augmented.drop(columns=[target_col])
y_augmented = df_augmented[target_col]

# ----------------------------
# 4) Train/Test split (80% for CV/Training, 20% for Final Test)
# ----------------------------
# X_train_temp is the pool for Cross-Validation
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X_augmented, y_augmented, test_size=0.2, stratify=y_augmented, random_state=RANDOM_STATE
)

print(f"[INFO] CV Pool Size: {len(X_train_temp)} | Final Test Size: {len(X_test)}")

# Convert X_train_temp to DataFrame to preserve column names
X_train_temp = pd.DataFrame(X_train_temp, columns=X_augmented.columns)

# ----------------------------
# 5) Define Models for CV (Simplified to RF, MLP, LogReg for speed)
# ----------------------------
cv_models = {
    'RF': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'MLP': MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=RANDOM_STATE),
    'LR': LogisticRegression(max_iter=500, random_state=RANDOM_STATE),
}

# ----------------------------------------------------
# 6) Pipeline 1: ANOVA + K-Fold Cross-Validation
# ----------------------------------------------------
def run_cv_pipeline_anova(X, y, models, n_splits=N_SPLITS, top_k=TOP_K_FEATURES):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    cv_results = {name: [] for name in models.keys()}

    print(f"\n--- Running ANOVA Pipeline ({n_splits} Folds) ---")
    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train_f, X_val_f = X.iloc[train_idx], X.iloc[val_idx]
        y_train_f, y_val_f = y.iloc[train_idx], y.iloc[val_idx]

        # 6.1) SMOTE on Training Fold ONLY (Prevents Data Leakage)
        sm = SMOTE(random_state=RANDOM_STATE)
        X_train_sm, y_train_sm = sm.fit_resample(X_train_f, y_train_f)

        # 6.2) Scaling
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_sm)
        X_val_scaled = scaler.transform(X_val_f)

        # 6.3) ANOVA Feature Selection
        F_vals, p_vals = f_classif(X_train_scaled, y_train_sm)
        anova_df = pd.DataFrame({'Feature': X.columns, 'p_value': p_vals})
        selected_features = anova_df[anova_df['p_value'] < 0.05]['Feature'].tolist()
        if len(selected_features) == 0:
            selected_features = X.columns.tolist()

        # 6.4) Apply Feature Selection
        X_train_sel = pd.DataFrame(X_train_scaled, columns=X.columns)[selected_features].values
        X_val_sel = pd.DataFrame(X_val_scaled, columns=X.columns)[selected_features].values

        # 6.5) Train and Evaluate Models
        for name, mdl in models.items():
            mdl.fit(X_train_sel, y_train_sm)
            y_pred = mdl.predict(X_val_sel)
            acc = accuracy_score(y_val_f, y_pred)
            cv_results[name].append(acc)

    return {name: np.mean(scores) for name, scores in cv_results.items()}, selected_features


# ----------------------------------------------------
# 7) Pipeline 2: Mutual Information + K-Fold Cross-Validation
# ----------------------------------------------------
def run_cv_pipeline_mi(X, y, models, n_splits=N_SPLITS, top_k=TOP_K_FEATURES):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    cv_results = {name: [] for name in models.keys()}

    print(f"\n--- Running MI Pipeline ({n_splits} Folds) ---")
    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train_f, X_val_f = X.iloc[train_idx], X.iloc[val_idx]
        y_train_f, y_val_f = y.iloc[train_idx], y.iloc[val_idx]

        # 7.1) SMOTE on Training Fold ONLY (Prevents Data Leakage)
        sm = SMOTE(random_state=RANDOM_STATE)
        X_train_sm, y_train_sm = sm.fit_resample(X_train_f, y_train_f)

        # 7.2) Scaling
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_sm)
        X_val_scaled = scaler.transform(X_val_f)

        # 7.3) Mutual Information Feature Selection
        MI_scores = mutual_info_classif(X_train_scaled, y_train_sm, random_state=RANDOM_STATE)
        mi_df = pd.DataFrame({'Feature': X.columns, 'MI_Score': MI_scores})
        selected_features = mi_df.nlargest(top_k, 'MI_Score')['Feature'].tolist()
        if len(selected_features) == 0:
            selected_features = X.columns.tolist()

        # 7.4) Apply Feature Selection
        X_train_sel = pd.DataFrame(X_train_scaled, columns=X.columns)[selected_features].values
        X_val_sel = pd.DataFrame(X_val_scaled, columns=X.columns)[selected_features].values

        # 7.5) Train and Evaluate Models
        for name, mdl in models.items():
            mdl.fit(X_train_sel, y_train_sm)
            y_pred = mdl.predict(X_val_sel)
            acc = accuracy_score(y_val_f, y_pred)
            cv_results[name].append(acc)

    return {name: np.mean(scores) for name, scores in cv_results.items()}, selected_features


# ----------------------------------------------------
# 8) EXECUTION AND FINAL COMPARISON
# ----------------------------------------------------
print("\n" + "="*60)
print("RUNNING CROSS-VALIDATION PIPELINES (Requires IMBLEARN and XGBOOST)")
print("="*60)

# Run ANOVA Pipeline
cv_anova_scores, anova_features = run_cv_pipeline_anova(X_train_temp, y_train_temp, cv_models.copy())
print("ANOVA CV Results (Mean Accuracy):", cv_anova_scores)

# Run MI Pipeline
cv_mi_scores, mi_features = run_cv_pipeline_mi(X_train_temp, y_train_temp, cv_models.copy())
print("MI CV Results (Mean Accuracy):", cv_mi_scores)

print("\n" + "="*60)
print("FINAL STEP: SELECT BEST MODEL AND TEST ON X_TEST")
print("="*60)

# The logic below demonstrates how to select the best model based on CV results.
# For demonstration, we select the model with the highest average CV score from the best pipeline (ANOVA).

# Determine which pipeline is best based on the highest average score (e.g., from MLP/RF)
best_anova_score = max(cv_anova_scores.values())
best_mi_score = max(cv_mi_scores.values())

if best_anova_score >= best_mi_score:
    final_features = anova_features
    final_pipeline = "ANOVA"
    best_model_name = max(cv_anova_scores, key=cv_anova_scores.get)
    print(f"[RESULT] Selected Pipeline: {final_pipeline} (Best Model: {best_model_name})")
else:
    final_features = mi_features
    final_pipeline = "MI"
    best_model_name = max(cv_mi_scores, key=cv_mi_scores.get)
    print(f"[RESULT] Selected Pipeline: {final_pipeline} (Best Model: {best_model_name})")

# Final Training and Testing on the held-out X_test set

# 1. SMOTE on the entire X_train_temp pool
sm = SMOTE(random_state=RANDOM_STATE)
X_train_final, y_train_final = sm.fit_resample(X_train_temp, y_train_temp)

# 2. Scaling (Fit on SMOTEd train, Transform test)
scaler = StandardScaler()
X_train_scaled_final = scaler.fit_transform(X_train_final)
X_test_scaled_final = scaler.transform(X_test)

# 3. Apply Final Feature Selection
X_train_sel_final = pd.DataFrame(X_train_scaled_final, columns=X_augmented.columns)[final_features].values
X_test_sel_final = pd.DataFrame(X_test_scaled_final, columns=X_augmented.columns)[final_features].values

# 4. Train the Best Model
final_model = cv_models[best_model_name]
final_model.fit(X_train_sel_final, y_train_final)

# 5. Final Test Evaluation
y_pred_test_final = final_model.predict(X_test_sel_final)
final_accuracy = accuracy_score(y_test, y_pred_test_final)

print(f"\nFINAL TEST ACCURACY for {best_model_name} (using {final_pipeline} features): {final_accuracy:.4f}")

[INFO] Data augmented (GN). New total size: 627 samples.
[INFO] CV Pool Size: 501 | Final Test Size: 126

RUNNING CROSS-VALIDATION PIPELINES (Requires IMBLEARN and XGBOOST)

--- Running ANOVA Pipeline (5 Folds) ---
ANOVA CV Results (Mean Accuracy): {'RF': np.float64(0.8862178217821782), 'MLP': np.float64(0.9520990099009902), 'LR': np.float64(0.7345346534653465)}

--- Running MI Pipeline (5 Folds) ---
MI CV Results (Mean Accuracy): {'RF': np.float64(0.9021980198019802), 'MLP': np.float64(0.9520792079207923), 'LR': np.float64(0.7044950495049505)}

FINAL STEP: SELECT BEST MODEL AND TEST ON X_TEST
[RESULT] Selected Pipeline: ANOVA (Best Model: MLP)

FINAL TEST ACCURACY for MLP (using ANOVA features): 0.9524


CM ALl models ANOVA + MI

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
# Note: XGBClassifier and VotingClassifier were removed from imports for simplicity but can be added back.

RANDOM_STATE = 42
AUGMENTATION_FACTOR = 3
NOISE_STD_DEV = 0.1
TOP_K_FEATURES = 30
N_SPLITS = 5 # K for K-Fold Cross-Validation

# ----------------------------
# 1) Load & clean
# ----------------------------
df = pd.read_csv("survey_data_cleaned.csv")
drop_cols = ['Timestamp', 'Date_Of_Birth', 'Age']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

target_col = 'Career_Interest'
X = df.drop(columns=[target_col])
y = df[target_col].astype(str)

# ----------------------------
# 2) Encode categorical features
# ----------------------------
X_enc = X.copy()
for col in X_enc.columns:
    if X_enc[col].dtype == 'object' or X_enc[col].dtype.name == 'category':
        le = LabelEncoder()
        X_enc[col] = le.fit_transform(X_enc[col].astype(str))

le_target = LabelEncoder()
y_enc = le_target.fit_transform(y)

df_enc = X_enc.copy()
df_enc[target_col] = y_enc

# ----------------------------
# 3) Augmentation (Gaussian Noise)
# ----------------------------
def create_augmented_data(df_original, factor, target_column, noise_std_dev=0.05):
    df_list = [df_original.copy()]
    feature_cols = df_original.columns.drop(target_column).tolist()

    for _ in range(factor - 1):
        df_new = df_original.copy()
        for col in feature_cols:
            noise = np.random.normal(0, noise_std_dev * df_new[col].std(), size=len(df_new))
            df_new[col] = np.maximum(df_new[col] + noise, 0)
        df_list.append(df_new)
    return pd.concat(df_list, ignore_index=True)

df_augmented = create_augmented_data(df_enc, AUGMENTATION_FACTOR, target_col, NOISE_STD_DEV)
print(f"[INFO] Data augmented (GN). New total size: {df_augmented.shape[0]} samples.")

X_augmented = df_augmented.drop(columns=[target_col])
y_augmented = df_augmented[target_col]

# ----------------------------
# 4) Train/Test split (80% for CV/Training, 20% for Final Test)
# ----------------------------
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X_augmented, y_augmented, test_size=0.2, stratify=y_augmented, random_state=RANDOM_STATE
)
X_train_temp = pd.DataFrame(X_train_temp, columns=X_augmented.columns)

# ----------------------------
# 5) Define Models for CV
# ----------------------------
cv_models = {
    'RF': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'MLP': MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=RANDOM_STATE),
    'LR': LogisticRegression(max_iter=500, random_state=RANDOM_STATE),
}

# ----------------------------------------------------
# 6) Pipeline 1: ANOVA + K-Fold Cross-Validation
# ----------------------------------------------------
def run_cv_pipeline_anova(X, y, models, n_splits=N_SPLITS):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    cv_results = {name: [] for name in models.keys()}

    print(f"\n--- Running ANOVA Pipeline ({n_splits} Folds) ---")
    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train_f, X_val_f = X.iloc[train_idx], X.iloc[val_idx]
        y_train_f, y_val_f = y.iloc[train_idx], y.iloc[val_idx]

        # SMOTE, Scaling, and Feature Selection applied INSIDE the fold
        sm = SMOTE(random_state=RANDOM_STATE)
        X_train_sm, y_train_sm = sm.fit_resample(X_train_f, y_train_f)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_sm)
        X_val_scaled = scaler.transform(X_val_f)

        F_vals, p_vals = f_classif(X_train_scaled, y_train_sm)
        anova_df = pd.DataFrame({'Feature': X.columns, 'p_value': p_vals})
        selected_features = anova_df[anova_df['p_value'] < 0.05]['Feature'].tolist()
        if not selected_features: selected_features = X.columns.tolist()

        X_train_sel = pd.DataFrame(X_train_scaled, columns=X.columns)[selected_features].values
        X_val_sel = pd.DataFrame(X_val_scaled, columns=X.columns)[selected_features].values

        for name, mdl in models.items():
            mdl.fit(X_train_sel, y_train_sm)
            y_pred = mdl.predict(X_val_sel)
            cv_results[name].append(accuracy_score(y_val_f, y_pred))

    return {name: np.mean(scores) for name, scores in cv_results.items()}, selected_features


# ----------------------------------------------------
# 7) Pipeline 2: Mutual Information + K-Fold Cross-Validation
# ----------------------------------------------------
def run_cv_pipeline_mi(X, y, models, n_splits=N_SPLITS, top_k=TOP_K_FEATURES):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    cv_results = {name: [] for name in models.keys()}

    print(f"\n--- Running MI Pipeline ({n_splits} Folds) ---")
    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train_f, X_val_f = X.iloc[train_idx], X.iloc[val_idx]
        y_train_f, y_val_f = y.iloc[train_idx], y.iloc[val_idx]

        # SMOTE, Scaling, and Feature Selection applied INSIDE the fold
        sm = SMOTE(random_state=RANDOM_STATE)
        X_train_sm, y_train_sm = sm.fit_resample(X_train_f, y_train_f)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_sm)
        X_val_scaled = scaler.transform(X_val_f)

        MI_scores = mutual_info_classif(X_train_scaled, y_train_sm, random_state=RANDOM_STATE)
        mi_df = pd.DataFrame({'Feature': X.columns, 'MI_Score': MI_scores})
        selected_features = mi_df.nlargest(top_k, 'MI_Score')['Feature'].tolist()
        if not selected_features: selected_features = X.columns.tolist()

        X_train_sel = pd.DataFrame(X_train_scaled, columns=X.columns)[selected_features].values
        X_val_sel = pd.DataFrame(X_val_scaled, columns=X.columns)[selected_features].values

        for name, mdl in models.items():
            mdl.fit(X_train_sel, y_train_sm)
            y_pred = mdl.predict(X_val_sel)
            cv_results[name].append(accuracy_score(y_val_f, y_pred))

    return {name: np.mean(scores) for name, scores in cv_results.items()}, selected_features


# ----------------------------------------------------
# 8) EXECUTION AND FINAL COMPARISON
# ----------------------------------------------------
print("\n" + "="*60)
print("RUNNING CROSS-VALIDATION PIPELINES (Requires IMBLEARN)")
print("="*60)

# Run ANOVA Pipeline
cv_anova_scores, anova_features = run_cv_pipeline_anova(X_train_temp, y_train_temp, cv_models.copy())
print("ANOVA CV Results (Mean Accuracy):", cv_anova_scores)

# Run MI Pipeline
cv_mi_scores, mi_features = run_cv_pipeline_mi(X_train_temp, y_train_temp, cv_models.copy())
print("MI CV Results (Mean Accuracy):", cv_mi_scores)

print("\n" + "="*60)
print("FINAL STEP: SELECT BEST MODEL AND TEST ON X_TEST")
print("="*60)

# Determine the best model and pipeline based on the highest average CV score
all_scores = {("ANOVA", k): v for k, v in cv_anova_scores.items()}
all_scores.update({("MI", k): v for k, v in cv_mi_scores.items()})

((final_pipeline, best_model_name), _) = max(all_scores.items(), key=lambda item: item[1])

final_features = anova_features if final_pipeline == "ANOVA" else mi_features

print(f"[RESULT] Selected Pipeline: {final_pipeline} (Best Model: {best_model_name}, Mean CV Acc: {all_scores[(final_pipeline, best_model_name)]:.4f})")

# Final Training and Testing on the held-out X_test set

# 1. SMOTE on the entire X_train_temp pool
sm = SMOTE(random_state=RANDOM_STATE)
X_train_final, y_train_final = sm.fit_resample(X_train_temp, y_train_temp)

# 2. Scaling (Fit on SMOTEd train, Transform test)
scaler = StandardScaler()
X_train_scaled_final = scaler.fit_transform(X_train_final)
X_test_scaled_final = scaler.transform(X_test)

# 3. Apply Final Feature Selection
X_train_sel_final = pd.DataFrame(X_train_scaled_final, columns=X_augmented.columns)[final_features].values
X_test_sel_final = pd.DataFrame(X_test_scaled_final, columns=X_augmented.columns)[final_features].values

# 4. Train the Best Model
final_model = cv_models[best_model_name]
final_model.fit(X_train_sel_final, y_train_final)

# 5. Final Test Evaluation
y_pred_test_final = final_model.predict(X_test_sel_final)
final_accuracy = accuracy_score(y_test, y_pred_test_final)

print(f"\nFINAL TEST ACCURACY for {best_model_name} (using {final_pipeline} features): {final_accuracy:.4f}")

[INFO] Data augmented (GN). New total size: 627 samples.

RUNNING CROSS-VALIDATION PIPELINES (Requires IMBLEARN)

--- Running ANOVA Pipeline (5 Folds) ---
ANOVA CV Results (Mean Accuracy): {'RF': np.float64(0.8941980198019802), 'MLP': np.float64(0.9541188118811881), 'LR': np.float64(0.7364356435643564)}

--- Running MI Pipeline (5 Folds) ---
MI CV Results (Mean Accuracy): {'RF': np.float64(0.9141386138613863), 'MLP': np.float64(0.95009900990099), 'LR': np.float64(0.7126138613861386)}

FINAL STEP: SELECT BEST MODEL AND TEST ON X_TEST
[RESULT] Selected Pipeline: ANOVA (Best Model: MLP, Mean CV Acc: 0.9541)

FINAL TEST ACCURACY for MLP (using ANOVA features): 0.9365


In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC # New Model
from sklearn.naive_bayes import GaussianNB # New Model
from xgboost import XGBClassifier # New Model (Requires installation)
from imblearn.over_sampling import SMOTE

RANDOM_STATE = 42
AUGMENTATION_FACTOR = 3
NOISE_STD_DEV = 0.1
TOP_K_FEATURES = 30
N_SPLITS = 5 # K for K-Fold Cross-Validation

# ----------------------------
# 1) Load & clean
# ----------------------------
df = pd.read_csv("survey_data_cleaned.csv")
drop_cols = ['Timestamp', 'Date_Of_Birth', 'Age']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

target_col = 'Career_Interest'
X = df.drop(columns=[target_col])
y = df[target_col].astype(str)

# ----------------------------
# 2) Encode categorical features
# ----------------------------
X_enc = X.copy()
for col in X_enc.columns:
    if X_enc[col].dtype == 'object' or X_enc[col].dtype.name == 'category':
        le = LabelEncoder()
        X_enc[col] = le.fit_transform(X_enc[col].astype(str))

le_target = LabelEncoder()
y_enc = le_target.fit_transform(y)

df_enc = X_enc.copy()
df_enc[target_col] = y_enc

# ----------------------------
# 3) Augmentation (Gaussian Noise)
# ----------------------------
def create_augmented_data(df_original, factor, target_column, noise_std_dev=0.05):
    df_list = [df_original.copy()]
    feature_cols = df_original.columns.drop(target_column).tolist()

    for _ in range(factor - 1):
        df_new = df_original.copy()
        for col in feature_cols:
            noise = np.random.normal(0, noise_std_dev * df_new[col].std(), size=len(df_new))
            df_new[col] = np.maximum(df_new[col] + noise, 0)
        df_list.append(df_new)
    return pd.concat(df_list, ignore_index=True)

df_augmented = create_augmented_data(df_enc, AUGMENTATION_FACTOR, target_col, NOISE_STD_DEV)
X_augmented = df_augmented.drop(columns=[target_col])
y_augmented = df_augmented[target_col]

# ----------------------------
# 4) Train/Test split
# ----------------------------
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X_augmented, y_augmented, test_size=0.2, stratify=y_augmented, random_state=RANDOM_STATE
)
X_train_temp = pd.DataFrame(X_train_temp, columns=X_augmented.columns)

# ----------------------------
# 5) Define Models for CV (Expanded)
# ----------------------------
cv_models = {
    'RF': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'XGB': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM_STATE), # NEW
    'MLP': MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=RANDOM_STATE),
    'LR': LogisticRegression(max_iter=500, random_state=RANDOM_STATE),
    'SVM': SVC(random_state=RANDOM_STATE), # NEW
    'GNB': GaussianNB() # NEW
}

# ----------------------------------------------------
# 6) Pipeline 1: ANOVA + K-Fold Cross-Validation
# ----------------------------------------------------
def run_cv_pipeline_anova(X, y, models, n_splits=N_SPLITS):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    cv_results = {name: [] for name in models.keys()}

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train_f, X_val_f = X.iloc[train_idx], X.iloc[val_idx]
        y_train_f, y_val_f = y.iloc[train_idx], y.iloc[val_idx]

        # SMOTE, Scaling, and Feature Selection applied INSIDE the fold
        sm = SMOTE(random_state=RANDOM_STATE)
        X_train_sm, y_train_sm = sm.fit_resample(X_train_f, y_train_f)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_sm)
        X_val_scaled = scaler.transform(X_val_f)

        F_vals, p_vals = f_classif(X_train_scaled, y_train_sm)
        anova_df = pd.DataFrame({'Feature': X.columns, 'p_value': p_vals})
        selected_features = anova_df[anova_df['p_value'] < 0.05]['Feature'].tolist()
        if not selected_features: selected_features = X.columns.tolist()

        X_train_sel = pd.DataFrame(X_train_scaled, columns=X.columns)[selected_features].values
        X_val_sel = pd.DataFrame(X_val_scaled, columns=X.columns)[selected_features].values

        for name, mdl in models.items():
            mdl.fit(X_train_sel, y_train_sm)
            y_pred = mdl.predict(X_val_sel)
            cv_results[name].append(accuracy_score(y_val_f, y_pred))

    return {name: np.mean(scores) for name, scores in cv_results.items()}, selected_features


# ----------------------------------------------------
# 7) Pipeline 2: Mutual Information + K-Fold Cross-Validation
# ----------------------------------------------------
def run_cv_pipeline_mi(X, y, models, n_splits=N_SPLITS, top_k=TOP_K_FEATURES):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    cv_results = {name: [] for name in models.keys()}

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train_f, X_val_f = X.iloc[train_idx], X.iloc[val_idx]
        y_train_f, y_val_f = y.iloc[train_idx], y.iloc[val_idx]

        # SMOTE, Scaling, and Feature Selection applied INSIDE the fold
        sm = SMOTE(random_state=RANDOM_STATE)
        X_train_sm, y_train_sm = sm.fit_resample(X_train_f, y_train_f)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_sm)
        X_val_scaled = scaler.transform(X_val_f)

        MI_scores = mutual_info_classif(X_train_scaled, y_train_sm, random_state=RANDOM_STATE)
        mi_df = pd.DataFrame({'Feature': X.columns, 'MI_Score': MI_scores})
        selected_features = mi_df.nlargest(top_k, 'MI_Score')['Feature'].tolist()
        if not selected_features: selected_features = X.columns.tolist()

        X_train_sel = pd.DataFrame(X_train_scaled, columns=X.columns)[selected_features].values
        X_val_sel = pd.DataFrame(X_val_scaled, columns=X.columns)[selected_features].values

        for name, mdl in models.items():
            mdl.fit(X_train_sel, y_train_sm)
            y_pred = mdl.predict(X_val_sel)
            cv_results[name].append(accuracy_score(y_val_f, y_pred))

    return {name: np.mean(scores) for name, scores in cv_results.items()}, selected_features


# ----------------------------------------------------
# 8) EXECUTION AND FINAL COMPARISON
# ----------------------------------------------------
print("\n" + "="*60)
print("RUNNING CROSS-VALIDATION PIPELINES (Requires IMBLEARN and XGBOOST)")
print("="*60)

# Run ANOVA Pipeline
cv_anova_scores, anova_features = run_cv_pipeline_anova(X_train_temp, y_train_temp, cv_models.copy())
print("ANOVA CV Results (Mean Accuracy):", cv_anova_scores)

# Run MI Pipeline
cv_mi_scores, mi_features = run_cv_pipeline_mi(X_train_temp, y_train_temp, cv_models.copy())
print("MI CV Results (Mean Accuracy):", cv_mi_scores)

print("\n" + "="*60)
print("FINAL STEP: SELECT BEST MODEL AND TEST ON X_TEST")
print("="*60)

# Determine the best model and pipeline based on the highest average CV score
all_scores = {("ANOVA", k): v for k, v in cv_anova_scores.items()}
all_scores.update({("MI", k): v for k, v in cv_mi_scores.items()})

((final_pipeline, best_model_name), _) = max(all_scores.items(), key=lambda item: item[1])

final_features = anova_features if final_pipeline == "ANOVA" else mi_features

print(f"[RESULT] Selected Pipeline: {final_pipeline} (Best Model: {best_model_name}, Mean CV Acc: {all_scores[(final_pipeline, best_model_name)]:.4f})")

# Final Training and Testing on the held-out X_test set

# 1. SMOTE on the entire X_train_temp pool
sm = SMOTE(random_state=RANDOM_STATE)
X_train_final, y_train_final = sm.fit_resample(X_train_temp, y_train_temp)

# 2. Scaling (Fit on SMOTEd train, Transform test)
scaler = StandardScaler()
X_train_scaled_final = scaler.fit_transform(X_train_final)
X_test_scaled_final = scaler.transform(X_test)

# 3. Apply Final Feature Selection
X_train_sel_final = pd.DataFrame(X_train_scaled_final, columns=X_augmented.columns)[final_features].values
X_test_sel_final = pd.DataFrame(X_test_scaled_final, columns=X_augmented.columns)[final_features].values

# 4. Train the Best Model
final_model = cv_models[best_model_name]
final_model.fit(X_train_sel_final, y_train_final)

# 5. Final Test Evaluation
y_pred_test_final = final_model.predict(X_test_sel_final)
final_accuracy = accuracy_score(y_test, y_pred_test_final)

print(f"\nFINAL TEST ACCURACY for {best_model_name} (using {final_pipeline} features): {final_accuracy:.4f}")


RUNNING CROSS-VALIDATION PIPELINES (Requires IMBLEARN and XGBOOST)
ANOVA CV Results (Mean Accuracy): {'RF': np.float64(0.8681782178217821), 'XGB': np.float64(0.8482178217821783), 'MLP': np.float64(0.9541188118811881), 'LR': np.float64(0.7285544554455445), 'SVM': np.float64(0.8961980198019802), 'GNB': np.float64(0.5427920792079207)}
MI CV Results (Mean Accuracy): {'RF': np.float64(0.8981386138613863), 'XGB': np.float64(0.8701980198019802), 'MLP': np.float64(0.9540990099009902), 'LR': np.float64(0.7045940594059406), 'SVM': np.float64(0.8981980198019801), 'GNB': np.float64(0.560950495049505)}

FINAL STEP: SELECT BEST MODEL AND TEST ON X_TEST
[RESULT] Selected Pipeline: ANOVA (Best Model: MLP, Mean CV Acc: 0.9541)

FINAL TEST ACCURACY for MLP (using ANOVA features): 0.9524


Open

open with mi

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import mutual_info_classif # ADDED for MI
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

RANDOM_STATE = 42
AUGMENTATION_FACTOR = 2
NOISE_STD_DEV = 1
TOP_K_FEATURES = 8 # Defined a value for MI selection

# ----------------------------
# 1) Load & clean
# ----------------------------
df = pd.read_csv("Student Engagement Level-Multiclass.csv")
drop_cols = ['Student ID']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

target_col = 'Engagement Level'
X = df.drop(columns=[target_col])
y = df[target_col].astype(str)

print("[INFO] Original shape:", df.shape)
print("[INFO] Original class counts:\n", y.value_counts(), "\n")

# ----------------------------
# 2) Encode categorical features
# ----------------------------
X_enc = X.copy()
encoders = {}
for col in X_enc.columns:
    if X_enc[col].dtype == 'object' or X_enc[col].dtype.name == 'category':
        le = LabelEncoder()
        X_enc[col] = le.fit_transform(X_enc[col].astype(str))
        encoders[col] = le

le_target = LabelEncoder()
y_enc = le_target.fit_transform(y)

df_enc = X_enc.copy()
df_enc[target_col] = y_enc

# ----------------------------
# 3) Augmentation (Gaussian Noise)
# ----------------------------
def create_augmented_data(df_original, factor, target_column, noise_std_dev=0.05):
    df_list = [df_original.copy()]
    feature_cols = df_original.columns.drop(target_column).tolist()

    for _ in range(factor - 1):
        df_new = df_original.copy()
        for col in feature_cols:
            noise = np.random.normal(0, noise_std_dev * df_new[col].std(), size=len(df_new))
            df_new[col] = np.maximum(df_new[col] + noise, 0)
        df_list.append(df_new)

    return pd.concat(df_list, ignore_index=True)

df_augmented = create_augmented_data(df_enc, AUGMENTATION_FACTOR, target_col, NOISE_STD_DEV)
print(f"[INFO] Data augmented (GN). New total size: {df_augmented.shape[0]} samples.")

X_augmented = df_augmented.drop(columns=[target_col])
y_augmented = df_augmented[target_col]

# ----------------------------
# 4) Train/Validation/Test split (60/20/20)
# ----------------------------
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X_augmented, y_augmented, test_size=0.2, stratify=y_augmented, random_state=RANDOM_STATE
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_temp, y_train_temp, test_size=0.25, stratify=y_train_temp, random_state=RANDOM_STATE
)

print(f"[INFO] Train Size: {len(X_train)} | Validation Size: {len(X_valid)} | Test Size: {len(X_test)}")
print("[INFO] Train class dist BEFORE SMOTE:\n", pd.Series(y_train).value_counts(), "\n")

# ----------------------------
# 5) SMOTE on training data ONLY
# ----------------------------
sm = SMOTE(random_state=RANDOM_STATE)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print("[INFO] Train class dist AFTER SMOTE:\n", pd.Series(y_train_sm).value_counts(), "\n")

# ----------------------------
# 6) Scaling
# ----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sm)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# ----------------------------
# 7) Mutual Information (MI) Feature Selection
# ----------------------------
print("\n>>> [7/9] MUTUAL INFORMATION FEATURE SELECTION...")
# Calculate MI scores on the scaled, SMOTEd training data
MI_scores = mutual_info_classif(X_train_scaled, y_train_sm, random_state=RANDOM_STATE)
mi_df = pd.DataFrame({'Feature': X_train.columns, 'MI_Score': MI_scores})

# Select top K features based on MI score
selected_features = mi_df.nlargest(TOP_K_FEATURES, 'MI_Score')['Feature'].tolist()

if len(selected_features) == 0:
    print("[WARNING] MI selected 0 features. Using all features.")
    selected_features = X_train.columns.tolist()

# Transform all datasets to selected features
X_train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_valid_df = pd.DataFrame(X_valid_scaled, columns=X_valid.columns)
X_test_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

X_train_sel = X_train_df[selected_features].values
X_valid_sel = X_valid_df[selected_features].values
X_test_sel = X_test_df[selected_features].values

print(f"[INFO] MI selected {len(selected_features)} features: {selected_features}")
print("[INFO] Numbers of MI selected features:", len(selected_features), "\n")

# ----------------------------
# 8) Train models
# -------------------
models = {
    'RF': RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE),
    'XGB': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM_STATE),
    'MLP': MLPClassifier(hidden_layer_sizes=(128,64), max_iter=300, random_state=RANDOM_STATE),
    'LR': LogisticRegression(max_iter=500, random_state=RANDOM_STATE),
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
    'MLR': LinearRegression()
}

trained = {}
for name, mdl in models.items():
    print(f"[INFO] Training {name} ...")
    mdl.fit(X_train_sel, y_train_sm)
    trained[name] = mdl

ensemble = VotingClassifier(
    estimators=[('RF', trained['RF']), ('XGB', trained['XGB'])],
    voting='soft'
)
ensemble.fit(X_train_sel, y_train_sm)
trained['ENSEMBLE'] = ensemble

# ----------------------------
# 9) Predict & evaluate
# ----------------------------
def predict_model(name, model, X):
    if name == 'MLR':
        y_hat = np.rint(model.predict(X)).astype(int)
        y_hat = np.clip(y_hat, y_enc.min(), y_enc.max())
        return y_hat
    return model.predict(X)

print("\n" + "="*60)
print(">>> EVALUATION: TRAINING, VALIDATION & TEST SETS <<<")
print("="*60)

# 9.1 Accuracy Comparison
print("\n--- ACCURACY COMPARISON with MI(Training vs. Validation vs. Test) ---")
for name, mdl in trained.items():
    y_pred_train = predict_model(name, mdl, X_train_sel)
    acc_train = accuracy_score(y_train_sm, y_pred_train)

    y_pred_valid = predict_model(name, mdl, X_valid_sel)
    acc_valid = accuracy_score(y_valid, y_pred_valid)

    y_pred_test = predict_model(name, mdl, X_test_sel)
    acc_test = accuracy_score(y_test, y_pred_test)

    print(f"   {name} Training: {acc_train:.4f} | Validation: {acc_valid:.4f} | Test: {acc_test:.4f}")

# 9.2 Full Classification Report for Test Set
print("\n--- FINAL TEST SET RESULTS (Unseen Data) ---")
for name, mdl in trained.items():
    y_pred = predict_model(name, mdl, X_test_sel)
    print(f"\n===== {name} =====")
    try:
        print(classification_report(y_test, y_pred, target_names=le_target.classes_))
    except ValueError:
        print(f"Classification Report failed for {name} due to class mismatch. Showing only Accuracy.")
    print("Test Accuracy:", accuracy_score(y_test, y_pred))

[INFO] Original shape: (486, 13)
[INFO] Original class counts:
 Engagement Level
H    258
M    214
L     14
Name: count, dtype: int64 

[INFO] Data augmented (GN). New total size: 972 samples.
[INFO] Train Size: 582 | Validation Size: 195 | Test Size: 195
[INFO] Train class dist BEFORE SMOTE:
 Engagement Level
0    309
2    256
1     17
Name: count, dtype: int64 

[INFO] Train class dist AFTER SMOTE:
 Engagement Level
2    309
0    309
1    309
Name: count, dtype: int64 


>>> [7/9] MUTUAL INFORMATION FEATURE SELECTION...
[INFO] MI selected 8 features: ['Assignment 1 duration to submit (in hours)', 'Average time to submit assignment (in hours)', 'Assignment 3 duration to submit (in hours)', 'Assignment 2 duration to submit (in hours)', 'Assignment 1 lateness indicator', 'Assignment 3 lateness indicator', '# Logins', '# Content Reads']
[INFO] Numbers of MI selected features: 8 

[INFO] Training RF ...
[INFO] Training XGB ...
[INFO] Training MLP ...
[INFO] Training LR ...
[INFO] Training

open with anova

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
# NOTE: XGBClassifier import may fail if not installed
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
# NOTE: SMOTE import will fail if 'imblearn' is not installed
from imblearn.over_sampling import SMOTE

RANDOM_STATE = 42
AUGMENTATION_FACTOR = 2
NOISE_STD_DEV = 1

# ----------------------------
# 1) Load & clean
# ----------------------------
df = pd.read_csv("Student Engagement Level-Multiclass.csv")
drop_cols = ['Student ID']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

target_col = 'Engagement Level'
X = df.drop(columns=[target_col])
y = df[target_col].astype(str)

print("[INFO] Original shape:", df.shape)
print("[INFO] Original class counts:\n", y.value_counts(), "\n")

# ----------------------------
# 2) Encode categorical features
# ----------------------------
X_enc = X.copy()
encoders = {}
for col in X_enc.columns:
    if X_enc[col].dtype == 'object' or X_enc[col].dtype.name == 'category':
        le = LabelEncoder()
        X_enc[col] = le.fit_transform(X_enc[col].astype(str))
        encoders[col] = le

le_target = LabelEncoder()
y_enc = le_target.fit_transform(y)

df_enc = X_enc.copy()
df_enc[target_col] = y_enc

# ----------------------------
# 3) Augmentation (Gaussian Noise)
# ----------------------------
def create_augmented_data(df_original, factor, target_column, noise_std_dev=0.05):
    df_list = [df_original.copy()]
    feature_cols = df_original.columns.drop(target_column).tolist()

    for _ in range(factor - 1):
        df_new = df_original.copy()
        for col in feature_cols:
            noise = np.random.normal(0, noise_std_dev * df_new[col].std(), size=len(df_new))
            df_new[col] = np.maximum(df_new[col] + noise, 0)
        df_list.append(df_new)

    return pd.concat(df_list, ignore_index=True)

df_augmented = create_augmented_data(df_enc, AUGMENTATION_FACTOR, target_col, NOISE_STD_DEV)
print(f"[INFO] Data augmented (GN). New total size: {df_augmented.shape[0]} samples.")

X_augmented = df_augmented.drop(columns=[target_col])
y_augmented = df_augmented[target_col]

# ----------------------------
# 4) Train/Validation/Test split (60/20/20)
# ----------------------------
# 1. Split into Train_Temp (80%) and Test (20%)
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X_augmented, y_augmented, test_size=0.2, stratify=y_augmented, random_state=RANDOM_STATE
)
# 2. Split Train_Temp (80%) into Train (75% of 80% = 60%) and Validation (25% of 80% = 20%)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_temp, y_train_temp, test_size=0.25, stratify=y_train_temp, random_state=RANDOM_STATE
)

print(f"[INFO] Train Size: {len(X_train)} | Validation Size: {len(X_valid)} | Test Size: {len(X_test)}")
print("[INFO] Train class dist BEFORE SMOTE:\n", pd.Series(y_train).value_counts(), "\n")

# ----------------------------
# 5) SMOTE on training data ONLY
# ----------------------------
sm = SMOTE(random_state=RANDOM_STATE)
# SMOTE is applied only to the 60% training set
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print("[INFO] Train class dist AFTER SMOTE:\n", pd.Series(y_train_sm).value_counts(), "\n")

# ----------------------------
# 6) Scaling
# ----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sm)
X_valid_scaled = scaler.transform(X_valid) # Transform Validation set
X_test_scaled = scaler.transform(X_test)

# ----------------------------
# 7) ANOVA
# ----------------------------
F_vals, p_vals = f_classif(X_train_scaled, y_train_sm)
anova_df = pd.DataFrame({'Feature': X_train.columns, 'F_value': F_vals, 'p_value': p_vals})
selected_features = anova_df[anova_df['p_value'] < 0.05]['Feature'].tolist()
if len(selected_features) == 0:
    selected_features = X_train.columns.tolist()

# Apply feature selection to all three sets
X_train_sel = pd.DataFrame(X_train_scaled, columns=X_train.columns)[selected_features].values
X_valid_sel = pd.DataFrame(X_valid_scaled, columns=X_valid.columns)[selected_features].values
X_test_sel = pd.DataFrame(X_test_scaled, columns=X_test.columns)[selected_features].values
print("[INFO] ANOVA selected features:", selected_features, "\n")
print("[INFO] Numbers of ANOVA selected features:", len(selected_features), "\n")
# ----------------------------
# 8) Train models
# -------------------
models = {
    'RF': RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE),
    'XGB': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM_STATE),
    'MLP': MLPClassifier(hidden_layer_sizes=(128,64), max_iter=300, random_state=RANDOM_STATE),
    'LR': LogisticRegression(max_iter=500, random_state=RANDOM_STATE),
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
    'MLR': LinearRegression()
}

trained = {}
for name, mdl in models.items():
    print(f"[INFO] Training {name} ...")
    mdl.fit(X_train_sel, y_train_sm)
    trained[name] = mdl

ensemble = VotingClassifier(
    estimators=[('RF', trained['RF']), ('XGB', trained['XGB'])],
    voting='soft'
)
ensemble.fit(X_train_sel, y_train_sm)
trained['ENSEMBLE'] = ensemble

# ----------------------------
# 9) Predict & evaluate
# ----------------------------
def predict_model(name, model, X):
    if name == 'MLR':
        # LinearRegression for multiclass is an approximation, rounding results
        y_hat = np.rint(model.predict(X)).astype(int)
        # Clip to the range of target classes
        y_hat = np.clip(y_hat, y_enc.min(), y_enc.max())
        return y_hat
    return model.predict(X)

print("\n" + "="*60)
print(">>> EVALUATION: TRAINING, VALIDATION & TEST SETS <<<")
print("="*60)

# 9.1 Accuracy Comparison
print("\n--- ACCURACY COMPARISON with ANOVA(Training vs. Validation vs. Test) ---")
for name, mdl in trained.items():
    # Training Prediction (on augmented/SMOTEd data)
    y_pred_train = predict_model(name, mdl, X_train_sel)
    acc_train = accuracy_score(y_train_sm, y_pred_train)

    # Validation Prediction (on unseen, non-SMOTEd data)
    y_pred_valid = predict_model(name, mdl, X_valid_sel)
    acc_valid = accuracy_score(y_valid, y_pred_valid)

    # Test Prediction (on unseen, non-SMOTEd data)
    y_pred_test = predict_model(name, mdl, X_test_sel)
    acc_test = accuracy_score(y_test, y_pred_test)

    print(f"   {name} Training: {acc_train:.4f} | Validation: {acc_valid:.4f} | Test: {acc_test:.4f}")

# 9.2 Full Classification Report for Test Set
print("\n--- FINAL TEST SET RESULTS (Unseen Data) ---")
for name, mdl in trained.items():
    y_pred = predict_model(name, mdl, X_test_sel)
    print(f"\n===== {name} =====")
    # Note: If LinearRegression (MMR) is used, classes might not match perfectly
    try:
        print(classification_report(y_test, y_pred, target_names=le_target.classes_))
    except ValueError:
        print(f"Classification Report failed for {name} due to class mismatch. Showing only Accuracy.")
    print("Test Accuracy:", accuracy_score(y_test, y_pred))

[INFO] Original shape: (486, 13)
[INFO] Original class counts:
 Engagement Level
H    258
M    214
L     14
Name: count, dtype: int64 

[INFO] Data augmented (GN). New total size: 972 samples.
[INFO] Train Size: 582 | Validation Size: 195 | Test Size: 195
[INFO] Train class dist BEFORE SMOTE:
 Engagement Level
0    309
2    256
1     17
Name: count, dtype: int64 

[INFO] Train class dist AFTER SMOTE:
 Engagement Level
2    309
0    309
1    309
Name: count, dtype: int64 

[INFO] ANOVA selected features: ['# Logins', '# Content Reads', '# Forum Reads', '# Forum Posts', '# Quiz Reviews before submission', 'Assignment 1 lateness indicator', 'Assignment 2 lateness indicator', 'Assignment 3 lateness indicator', 'Assignment 1 duration to submit (in hours)', 'Assignment 2 duration to submit (in hours)', 'Assignment 3 duration to submit (in hours)', 'Average time to submit assignment (in hours)'] 

[INFO] Numbers of ANOVA selected features: 12 

[INFO] Training RF ...
[INFO] Training XGB ...
[