In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import f_classif
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import copy

# ----------------------------
# 1️⃣ Load & Clean Dataset
# ----------------------------
df = pd.read_csv("survey_data_cleaned.csv")
drop_cols = ['Timestamp', 'Date_Of_Birth', 'Age']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# ----------------------------
# 2️⃣ Split Target and Features
# ----------------------------
X = df.drop(columns=['Career_Interest'])
y = df['Career_Interest']

le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col].astype(str))
y_encoded = le.fit_transform(y)

# ----------------------------
# 3️⃣ ANOVA Feature Selection
# ----------------------------
F_values, p_values = f_classif(X, y_encoded)
anova_df = pd.DataFrame({'Feature': X.columns, 'F_value': F_values, 'p_value': p_values})
selected_features = anova_df[anova_df['p_value'] < 0.05]['Feature'].tolist()
X_selected = X[selected_features]

# ----------------------------
# 4️⃣ Train-Test Split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# ----------------------------
# 5️⃣ Scaling
# ----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ----------------------------
# 6️⃣ SMOTE + 5-Fold CV with internal bootstrapping
# ----------------------------
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

metrics_rf, metrics_xgb, metrics_ens = [], [], []
best_models_rf, best_models_xgb, best_models_ens = [], [], []

orig_dist = y.value_counts() / len(y)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled, y_train)):
    print(f"[INFO] Fold {fold+1}")
    
    X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]
    
    # Internal bootstrapping
    df_tr = pd.DataFrame(X_tr, columns=selected_features)
    df_tr['Career_Interest'] = y_tr
    df_list = []
    max_count = df_tr['Career_Interest'].value_counts().max()
    for cls in df_tr['Career_Interest'].unique():
        cls_df = df_tr[df_tr['Career_Interest'] == cls]
        cls_resampled = resample(cls_df, replace=True, n_samples=max_count, random_state=fold)
        df_list.append(cls_resampled)
    df_bootstrap = pd.concat(df_list)
    
    # SMOTE
    smote = SMOTE(random_state=fold)
    X_res, y_res = smote.fit_resample(df_bootstrap[selected_features], df_bootstrap['Career_Interest'])
    
    # ---------------- Train Models ----------------
    rf = RandomForestClassifier(n_estimators=100, bootstrap=True, random_state=42)
    rf.fit(X_res, y_res)
    
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    xgb.fit(X_res, y_res)
    
    ensemble = VotingClassifier(estimators=[('RF', rf), ('XGB', xgb)], voting='soft')
    ensemble.fit(X_res, y_res)
    
    # -------------- Prediction with adjusted probabilities --------------
    boot_dist = pd.Series(y_res).value_counts() / len(y_res)
    
    def predict_adjusted_proba(model, X_val):
        raw_probs = model.predict_proba(X_val)
        classes = model.classes_
        adjusted_probs = []
        for prob_vector in raw_probs:
            adj = prob_vector * (orig_dist.loc[classes].values / boot_dist.loc[classes].values)
            adj /= adj.sum()
            adjusted_probs.append(adj)
        return np.array(adjusted_probs)
    
    y_pred_rf = np.argmax(predict_adjusted_proba(rf, X_val), axis=1)
    y_pred_xgb = np.argmax(predict_adjusted_proba(xgb, X_val), axis=1)
    y_pred_ens = np.argmax(predict_adjusted_proba(ensemble, X_val), axis=1)
    
    # ----------------- Save metrics -----------------
    def compute_metrics(y_true, y_pred):
        return {
            'precision': precision_score(y_true, y_pred, average='macro'),
            'recall': recall_score(y_true, y_pred, average='macro'),
            'f1': f1_score(y_true, y_pred, average='macro'),
            'accuracy': accuracy_score(y_true, y_pred)
        }
    
    metrics_rf.append(compute_metrics(y_val, y_pred_rf))
    metrics_xgb.append(compute_metrics(y_val, y_pred_xgb))
    metrics_ens.append(compute_metrics(y_val, y_pred_ens))
    
    best_models_rf.append(copy.deepcopy(rf))
    best_models_xgb.append(copy.deepcopy(xgb))
    best_models_ens.append(copy.deepcopy(ensemble))

# ---------------- Average metrics ----------------
def average_metrics(metrics_list):
    df = pd.DataFrame(metrics_list)
    return df.mean().to_dict()

print("[INFO] Average CV Metrics:")
print("RF:", average_metrics(metrics_rf))
print("XGB:", average_metrics(metrics_xgb))
print("ENSEMBLE:", average_metrics(metrics_ens))


[INFO] Fold 1


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[INFO] Fold 2


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[INFO] Fold 3


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[INFO] Fold 4


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[INFO] Fold 5


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[INFO] Average CV Metrics:
RF: {'precision': 0.5306990231990232, 'recall': 0.4939195526695526, 'f1': 0.47762012987012986, 'accuracy': 0.5023172905525847}
XGB: {'precision': 0.47563034188034187, 'recall': 0.4898340548340549, 'f1': 0.47490327645462926, 'accuracy': 0.49696969696969695}
ENSEMBLE: {'precision': 0.5013888888888889, 'recall': 0.5060840548340548, 'f1': 0.4915979658307024, 'accuracy': 0.5146167557932264}


In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import f_classif
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import copy

# ----------------------------
# 1️⃣ Load & Clean Dataset
# ----------------------------
df = pd.read_csv("survey_data_cleaned.csv")
drop_cols = ['Timestamp', 'Date_Of_Birth', 'Age']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# ----------------------------
# 2️⃣ Split Target and Features
# ----------------------------
X = df.drop(columns=['Career_Interest'])
y = df['Career_Interest']

# Encode categorical columns
le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col].astype(str))
y_encoded = le.fit_transform(y)

# ----------------------------
# 3️⃣ ANOVA Feature Selection
# ----------------------------
F_values, p_values = f_classif(X, y_encoded)
anova_df = pd.DataFrame({'Feature': X.columns, 'F_value': F_values, 'p_value': p_values})
selected_features = anova_df[anova_df['p_value'] < 0.05]['Feature'].tolist()
X_selected = X[selected_features]

# ----------------------------
# 4️⃣ Train-Test Split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# ----------------------------
# 5️⃣ Scaling
# ----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ----------------------------
# 6️⃣ FW-SMOTE + 5-Fold CV with internal bootstrapping
# ----------------------------
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
metrics_rf, metrics_xgb, metrics_ens = [], [], []
best_models_rf, best_models_xgb, best_models_ens = [], [], []

orig_dist = y.value_counts() / len(y)

# Feature weights for FW-SMOTE
importance_dict = dict(zip(selected_features, F_values / F_values.sum()))
feature_weights = np.array([importance_dict[f] for f in selected_features])

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled, y_train)):
    print(f"[INFO] Fold {fold+1}")
    
    X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]
    
    # Internal bootstrapping
    df_tr = pd.DataFrame(X_tr, columns=selected_features)
    df_tr['Career_Interest'] = y_tr
    df_list = []
    max_count = df_tr['Career_Interest'].value_counts().max()
    for cls in df_tr['Career_Interest'].unique():
        cls_df = df_tr[df_tr['Career_Interest'] == cls]
        cls_resampled = resample(cls_df, replace=True, n_samples=max_count, random_state=fold)
        df_list.append(cls_resampled)
    df_bootstrap = pd.concat(df_list)
    
    # FW-SMOTE
    X_tr_weighted = df_bootstrap[selected_features].values * feature_weights
    smote = SMOTE(random_state=fold)
    X_res, y_res = smote.fit_resample(X_tr_weighted, df_bootstrap['Career_Interest'])
    X_res = X_res / feature_weights  # restore scale
    
    # ---------------- Train Models ----------------
    rf = RandomForestClassifier(n_estimators=100, bootstrap=True, random_state=42)
    rf.fit(X_res, y_res)
    
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    xgb.fit(X_res, y_res)
    
    ensemble = VotingClassifier(estimators=[('RF', rf), ('XGB', xgb)], voting='soft')
    ensemble.fit(X_res, y_res)
    
    # -------------- Prediction with Adjusted Probabilities --------------
    boot_dist = pd.Series(y_res).value_counts() / len(y_res)
    
    def predict_adjusted_proba(model, X_val):
        raw_probs = model.predict_proba(X_val)
        classes = model.classes_
        adjusted_probs = []
        for prob_vector in raw_probs:
            adj = prob_vector * (orig_dist.loc[classes].values / boot_dist.loc[classes].values)
            adj /= adj.sum()
            adjusted_probs.append(adj)
        return np.array(adjusted_probs)
    
    y_pred_rf = np.argmax(predict_adjusted_proba(rf, X_val), axis=1)
    y_pred_xgb = np.argmax(predict_adjusted_proba(xgb, X_val), axis=1)
    y_pred_ens = np.argmax(predict_adjusted_proba(ensemble, X_val), axis=1)
    
    # ----------------- Save metrics -----------------
    def compute_metrics(y_true, y_pred):
        return {
            'precision': precision_score(y_true, y_pred, average='macro'),
            'recall': recall_score(y_true, y_pred, average='macro'),
            'f1': f1_score(y_true, y_pred, average='macro'),
            'accuracy': accuracy_score(y_true, y_pred)
        }
    
    metrics_rf.append(compute_metrics(y_val, y_pred_rf))
    metrics_xgb.append(compute_metrics(y_val, y_pred_xgb))
    metrics_ens.append(compute_metrics(y_val, y_pred_ens))
    
    best_models_rf.append(copy.deepcopy(rf))
    best_models_xgb.append(copy.deepcopy(xgb))
    best_models_ens.append(copy.deepcopy(ensemble))

# ---------------- Average metrics ----------------
def average_metrics(metrics_list):
    df = pd.DataFrame(metrics_list)
    return df.mean().to_dict()

print("[INFO] Average CV Metrics (FW-SMOTE):")
print("RF:", average_metrics(metrics_rf))
print("XGB:", average_metrics(metrics_xgb))
print("ENSEMBLE:", average_metrics(metrics_ens))


[INFO] Fold 1


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[INFO] Fold 2


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[INFO] Fold 3


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[INFO] Fold 4


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[INFO] Fold 5


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[INFO] Average CV Metrics (FW-SMOTE):
RF: {'precision': 0.5306990231990232, 'recall': 0.4939195526695526, 'f1': 0.47762012987012986, 'accuracy': 0.5023172905525847}
XGB: {'precision': 0.47563034188034187, 'recall': 0.4898340548340549, 'f1': 0.47490327645462926, 'accuracy': 0.49696969696969695}
ENSEMBLE: {'precision': 0.5013888888888889, 'recall': 0.5060840548340548, 'f1': 0.4915979658307024, 'accuracy': 0.5146167557932264}
