In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
import copy

# =====================================================
# 🔹 Load & Clean Dataset
# =====================================================
# Read the survey CSV and discard the columns named in drop_cols;
# any of them that is absent from the file is silently skipped.
drop_cols = ['Timestamp', 'Date_Of_Birth', 'Age']
df = pd.read_csv("survey_data_cleaned.csv").drop(columns=drop_cols, errors="ignore")
print("[INFO] Loaded dataset:", df.shape)

# =====================================================
# 🔹 Split Target / Features
# =====================================================
target = "Career_Interest"
# The target column becomes y_raw; every other column stays in X_raw.
y_raw = df[target]
X_raw = df.drop(columns=target)

# Show how many rows each class has before any resampling.
print("[INFO] Original Class Distribution:")
print(y_raw.value_counts())

# =====================================================
# 🔹 Encode Categorical Columns
# =====================================================
# Work on copies so X_raw / y_raw keep the values as loaded.
X = X_raw.copy()
y = y_raw.copy()

# Each object-dtype column is cast to str and replaced by integer codes.
# The fitted encoder is kept under the column name so the mapping can be
# looked up later.
label_encoders = {}
for col in X.columns:
    if X[col].dtype != "object":
        continue  # non-object columns pass through unchanged
    le = LabelEncoder()
    label_encoders[col] = le
    X[col] = le.fit_transform(X[col].astype(str))

# =====================================================
# 🔹 Train-Test Split (80% Train / 20% Test)
# =====================================================
# The split is done BEFORE feature selection. Running the ANOVA F-test on
# the full dataset would let the held-out 20% influence which features are
# kept (data leakage) and make the test metrics optimistic.
X_train_all, X_test_all, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# =====================================================
# 🔹 ANOVA Feature Selection (fitted on training rows only)
# =====================================================
F_vals, p_vals = f_classif(X_train_all, y_train)
anova_df = pd.DataFrame({"Feature": X.columns, "F_value": F_vals, "p_value": p_vals})
# NaN p-values (e.g. from constant columns) compare False and are dropped.
selected_features = anova_df.loc[anova_df["p_value"] < 0.05, "Feature"].tolist()
if not selected_features:
    # Fail here with a clear message instead of deep inside the scaler.
    raise ValueError(
        "ANOVA selected no features at p < 0.05; cannot train on an empty feature set."
    )

# Apply the training-derived column choice to every view of the data.
X_sel = X[selected_features]
X_train = X_train_all[selected_features]
X_test = X_test_all[selected_features]

print("[INFO] Selected Features via ANOVA (<0.05):")
print(selected_features)

print("[INFO] Training Class Distribution:")
print(y_train.value_counts())
print("[INFO] Testing Class Distribution:")
print(y_test.value_counts())

# =====================================================
# 🔹 Scaling
# =====================================================
# The scaler statistics come from the training rows only; the test rows are
# transformed with those same statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# =====================================================
# 🔹 SMOTE / Bootstrap Expansion (~1000 samples)
# =====================================================
# Request the same number of rows for every class present in y_train so the
# resampled training set totals roughly 1000 rows.
train_classes = np.unique(y_train)
n_classes = len(train_classes)
n_target_per_class = 1000 // n_classes  # ~250 per class if 4 classes
per_class_targets = dict.fromkeys(train_classes, n_target_per_class)

sm = SMOTE(sampling_strategy=per_class_targets, random_state=42)
X_res, y_res = sm.fit_resample(X_train_scaled, y_train)

print("[INFO] After SMOTE/Bootstrap Class Distribution:")
print(pd.Series(y_res).value_counts())

# =====================================================
# 🔹 5-Fold Cross-Validation on Bootstrapped Training Set
# =====================================================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
metrics_rf, metrics_xgb, metrics_ens = [], [], []

# Seeded generator for the per-fold bootstrap draws. Every other stochastic
# step in this script pins random_state=42; an unseeded np.random.choice here
# made the reported metrics change from run to run.
boot_rng = np.random.default_rng(42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_res, y_res), 1):
    print(f"[INFO] Fold {fold} Training...")

    # train_idx / val_idx are positions. X_res is indexed positionally, so
    # y_res is indexed with .iloc as well, instead of relying on the Series
    # index happening to equal 0..n-1.
    X_fold_train, X_fold_val = X_res[train_idx], X_res[val_idx]
    y_fold_train, y_fold_val = y_res.iloc[train_idx], y_res.iloc[val_idx]
    # NOTE(review): X_fold_val / y_fold_val are never scored below; each
    # fold's models are evaluated on the held-out test split instead.
    # Confirm this is the intended protocol.

    # Internal Bootstrapping: resample the fold's training rows with
    # replacement, keeping the same size.
    n_fold_train = len(X_fold_train)
    boot_idx = boot_rng.choice(n_fold_train, size=n_fold_train, replace=True)
    X_fold_train_boot = X_fold_train[boot_idx]
    y_fold_train_boot = y_fold_train.iloc[boot_idx]

    # Train Models
    rf = RandomForestClassifier(n_estimators=300, random_state=42)
    xgb = XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.8,
        objective="multi:softprob",
        eval_metric="mlogloss",
        random_state=42
    )
    ens = VotingClassifier(
        estimators=[("rf", rf), ("xgb", xgb)],
        voting="soft"
    )

    # rf and xgb are fitted on their own so they can be scored individually.
    # NOTE(review): VotingClassifier presumably fits its own clones of both
    # estimators, so each base model is trained twice per fold; confirm
    # before optimising.
    rf.fit(X_fold_train_boot, y_fold_train_boot)
    xgb.fit(X_fold_train_boot, y_fold_train_boot)
    ens.fit(X_fold_train_boot, y_fold_train_boot)

    # Evaluate on Original 20% Test Set
    # Each row appended is [precision, recall, f1, accuracy], macro-averaged
    # where applicable.
    for model, metric_list in zip([rf, xgb, ens], [metrics_rf, metrics_xgb, metrics_ens]):
        y_pred = model.predict(X_test_scaled)
        metric_list.append([
            precision_score(y_test, y_pred, average='macro'),
            recall_score(y_test, y_pred, average='macro'),
            f1_score(y_test, y_pred, average='macro'),
            accuracy_score(y_test, y_pred)
        ])

# =====================================================
# 🔹 Average Metrics
# =====================================================
def avg_metrics(metrics_list):
    """Return the column-wise mean of a list of metric rows.

    Args:
        metrics_list: Sequence of equal-length numeric rows, one row per
            evaluation run.

    Returns:
        A NumPy array holding the mean of each column across all rows.
    """
    stacked = np.asarray(metrics_list)
    return stacked.mean(axis=0)

# Collapse each model's per-fold rows into one averaged row.
avg_rf, avg_xgb, avg_ens = (
    avg_metrics(per_fold_rows)
    for per_fold_rows in (metrics_rf, metrics_xgb, metrics_ens)
)

print("\n============ AVERAGE METRICS ACROSS 5-FOLDS ============")
# Each template takes the four averaged values in the order they were
# recorded: precision, recall, F1, accuracy.
report_lines = (
    ("RF   - Precision: {:.3f}, Recall: {:.3f}, F1: {:.3f}, Accuracy: {:.3f}", avg_rf),
    ("XGB  - Precision: {:.3f}, Recall: {:.3f}, F1: {:.3f}, Accuracy: {:.3f}", avg_xgb),
    ("Ensemble - Precision: {:.3f}, Recall: {:.3f}, F1: {:.3f}, Accuracy: {:.3f}", avg_ens),
)
for template, averages in report_lines:
    print(template.format(*averages))


[INFO] Loaded dataset: (209, 42)
[INFO] Original Class Distribution:
Career_Interest
1    64
0    53
3    51
2    41
Name: count, dtype: int64
[INFO] Selected Features via ANOVA (<0.05):
['Gender', 'Math', 'Bio', 'Chemistry', 'Physics', 'Business', 'Average_Score', 'Study_Method', 'English_Proficiency', 'Favorite_Subject', 'Building miniatures / models', 'Hackathons / App development projects', 'Programming / coding clubs', 'Volunteering at hospitals, clinics, or NGOs', 'Model building recognition', 'Programming / Coding award']
[INFO] Training Class Distribution:
Career_Interest
1    51
0    42
3    41
2    33
Name: count, dtype: int64
[INFO] Testing Class Distribution:
Career_Interest
1    13
0    11
3    10
2     8
Name: count, dtype: int64
[INFO] After SMOTE/Bootstrap Class Distribution:
Career_Interest
2    250
1    250
0    250
3    250
Name: count, dtype: int64
[INFO] Fold 1 Training...
[INFO] Fold 2 Training...
[INFO] Fold 3 Training...
[INFO] Fold 4 Training...
[INFO] Fold 5 T