In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# ----------------------------
# 1️⃣ Load & Clean Dataset
# ----------------------------
df = pd.read_csv("survey_data_cleaned.csv")
drop_cols = ['Timestamp', 'Date_Of_Birth', 'Age']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# ----------------------------
# 2️⃣ Split Target and Features
# ----------------------------
X = df.drop(columns=['Career_Interest'])
y = df['Career_Interest']

# Encode categorical columns including target
le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col].astype(str))
y_encoded = le.fit_transform(y)

# ----------------------------
# 3️⃣ ANOVA Feature Selection
# ----------------------------
F_values, p_values = f_classif(X, y_encoded)
anova_df = pd.DataFrame({'Feature': X.columns, 'F_value': F_values, 'p_value': p_values})
selected_features = anova_df[anova_df['p_value'] < 0.05]['Feature'].tolist()
X_selected = X[selected_features]

# ----------------------------
# 4️⃣ Train-Test Split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# ----------------------------
# 5️⃣ Scaling
# ----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ----------------------------
# 6️⃣ SMOTE on Training Data
# ----------------------------
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# ----------------------------
# 7️⃣ Train Models (internal bootstrapping)
# ----------------------------
rf = RandomForestClassifier(n_estimators=100, bootstrap=True, random_state=42)
rf.fit(X_train_res, y_train_res)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb.fit(X_train_res, y_train_res)

ensemble = VotingClassifier(estimators=[('RF', rf), ('XGB', xgb)], voting='soft')
ensemble.fit(X_train_res, y_train_res)

# ----------------------------
# 8️⃣ Prediction with Adjusted Probabilities
# ----------------------------
orig_dist = y.value_counts() / len(y)
boot_dist = pd.Series(y_train_res).value_counts() / len(y_train_res)

def predict_adjusted_proba(model, scaler, X_test_df):
    X_scaled = scaler.transform(X_test_df)
    raw_probs = model.predict_proba(X_scaled)
    classes = model.classes_
    
    adjusted_probs = []
    for prob_vector in raw_probs:
        adj = prob_vector * (orig_dist.loc[classes].values / boot_dist.loc[classes].values)
        adj /= adj.sum()
        adjusted_probs.append(adj)
    return np.array(adjusted_probs)

y_pred_rf_adj = np.argmax(predict_adjusted_proba(rf, scaler, X_test), axis=1)
y_pred_xgb_adj = np.argmax(predict_adjusted_proba(xgb, scaler, X_test), axis=1)
y_pred_ens_adj = np.argmax(predict_adjusted_proba(ensemble, scaler, X_test), axis=1)

# ----------------------------
# 9️⃣ Evaluation
# ----------------------------
def print_report(y_true, y_pred, model_name):
    print(f"============ {model_name} REPORT ============")
    print(classification_report(y_true, y_pred))
    print("Accuracy:", accuracy_score(y_true, y_pred), "\n")

print_report(y_test, y_pred_rf_adj, "RANDOM FOREST")
print_report(y_test, y_pred_xgb_adj, "XGBOOST")
print_report(y_test, y_pred_ens_adj, "ENSEMBLE")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.62      0.73      0.67        11
           1       0.60      0.69      0.64        13
           2       0.57      0.50      0.53         8
           3       0.71      0.50      0.59        10

    accuracy                           0.62        42
   macro avg       0.63      0.60      0.61        42
weighted avg       0.63      0.62      0.62        42

Accuracy: 0.6190476190476191 

              precision    recall  f1-score   support

           0       0.62      0.73      0.67        11
           1       0.70      0.54      0.61        13
           2       0.33      0.50      0.40         8
           3       0.71      0.50      0.59        10

    accuracy                           0.57        42
   macro avg       0.59      0.57      0.57        42
weighted avg       0.61      0.57      0.58        42

Accuracy: 0.5714285714285714 

              precision    recall  f1-score   support

           0 

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# ----------------------------
# 1️⃣ Load & Clean Dataset
# ----------------------------
df = pd.read_csv("survey_data_cleaned.csv")
drop_cols = ['Timestamp', 'Date_Of_Birth', 'Age']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# ----------------------------
# 2️⃣ Split Target and Features
# ----------------------------
X = df.drop(columns=['Career_Interest'])
y = df['Career_Interest']

# Encode categorical columns including target
le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col].astype(str))
y_encoded = le.fit_transform(y)

# ----------------------------
# 3️⃣ ANOVA Feature Selection
# ----------------------------
F_values, p_values = f_classif(X, y_encoded)
anova_df = pd.DataFrame({'Feature': X.columns, 'F_value': F_values, 'p_value': p_values})
selected_features = anova_df[anova_df['p_value'] < 0.05]['Feature'].tolist()
X_selected = X[selected_features]

# ----------------------------
# 4️⃣ Train-Test Split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# ----------------------------
# 5️⃣ Scaling
# ----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ----------------------------
# 6️⃣ Feature-Weighted SMOTE on Training Data
# ----------------------------
# Compute ANOVA-based feature weights
importance_dict = dict(zip(selected_features, F_values / F_values.sum()))
feature_weights = np.array([importance_dict[f] for f in selected_features])

X_train_weighted = X_train_scaled * feature_weights
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_weighted, y_train)
X_train_res = X_train_res / feature_weights  # restore scale

# ----------------------------
# 7️⃣ Train Models (internal bootstrapping)
# ----------------------------
rf = RandomForestClassifier(n_estimators=100, bootstrap=True, random_state=42)
rf.fit(X_train_res, y_train_res)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb.fit(X_train_res, y_train_res)

ensemble = VotingClassifier(estimators=[('RF', rf), ('XGB', xgb)], voting='soft')
ensemble.fit(X_train_res, y_train_res)

# ----------------------------
# 8️⃣ Prediction with Adjusted Probabilities
# ----------------------------
orig_dist = y.value_counts() / len(y)
boot_dist = pd.Series(y_train_res).value_counts() / len(y_train_res)

def predict_adjusted_proba(model, scaler, X_test_df):
    X_scaled = scaler.transform(X_test_df)
    raw_probs = model.predict_proba(X_scaled)
    classes = model.classes_
    
    adjusted_probs = []
    for prob_vector in raw_probs:
        adj = prob_vector * (orig_dist.loc[classes].values / boot_dist.loc[classes].values)
        adj /= adj.sum()
        adjusted_probs.append(adj)
    return np.array(adjusted_probs)

y_pred_rf_adj = np.argmax(predict_adjusted_proba(rf, scaler, X_test), axis=1)
y_pred_xgb_adj = np.argmax(predict_adjusted_proba(xgb, scaler, X_test), axis=1)
y_pred_ens_adj = np.argmax(predict_adjusted_proba(ensemble, scaler, X_test), axis=1)

# ----------------------------
# 9️⃣ Evaluation
# ----------------------------
def print_report(y_true, y_pred, model_name):
    print(f"============ {model_name} REPORT ============")
    print(classification_report(y_true, y_pred))
    print("Accuracy:", accuracy_score(y_true, y_pred), "\n")

print_report(y_test, y_pred_rf_adj, "RANDOM FOREST")
print_report(y_test, y_pred_xgb_adj, "XGBOOST")
print_report(y_test, y_pred_ens_adj, "ENSEMBLE")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.67      0.55      0.60        11
           1       0.53      0.69      0.60        13
           2       0.50      0.62      0.56         8
           3       0.67      0.40      0.50        10

    accuracy                           0.57        42
   macro avg       0.59      0.57      0.56        42
weighted avg       0.59      0.57      0.57        42

Accuracy: 0.5714285714285714 

              precision    recall  f1-score   support

           0       0.62      0.73      0.67        11
           1       0.69      0.69      0.69        13
           2       0.36      0.50      0.42         8
           3       0.80      0.40      0.53        10

    accuracy                           0.60        42
   macro avg       0.62      0.58      0.58        42
weighted avg       0.64      0.60      0.60        42

Accuracy: 0.5952380952380952 

              precision    recall  f1-score   support

           0 