In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

# ----------------------------
# 1️⃣ Load & Clean Dataset
# ----------------------------
df = pd.read_csv("survey_data_cleaned.csv")
drop_cols = ['Timestamp', 'Date_Of_Birth', 'Age']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# ----------------------------
# 2️⃣ Split Target and Features
# ----------------------------
X = df.drop(columns=['Career_Interest'])
y = df['Career_Interest']

print("[INFO] Original Class Distribution:")
print(y.value_counts(), "\n")

# ----------------------------
# 3️⃣ Encode Categorical Columns (including target)
# ----------------------------
le = LabelEncoder()
X_encoded = X.copy()
for col in X_encoded.columns:
    if X_encoded[col].dtype == 'object':
        X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
y_encoded = le.fit_transform(y)

# ----------------------------
# 4️⃣ Epoch-Based Bootstrapping + SMOTE (Controlled)
# ----------------------------
epochs = 5
expansion_factor = 5  # target 5× largest class
X_epoch = X_encoded.copy()
y_epoch = pd.Series(y_encoded.copy())

orig_class_counts = y_epoch.value_counts()

for epoch in range(epochs):
    df_train = X_epoch.copy()
    df_train['Career_Interest'] = y_epoch.values
    df_list = []
    
    target_count = orig_class_counts.max() * expansion_factor  # fixed target based on original
    for cls in df_train['Career_Interest'].unique():
        cls_df = df_train[df_train['Career_Interest'] == cls]
        cls_resampled = resample(cls_df, replace=True, n_samples=target_count, random_state=42 + epoch)
        df_list.append(cls_resampled)
    
    df_bootstrap = pd.concat(df_list)
    
    # SMOTE after bootstrap
    smote = SMOTE(random_state=42 + epoch)
    X_epoch, y_epoch = smote.fit_resample(df_bootstrap.drop(columns=['Career_Interest']),
                                          df_bootstrap['Career_Interest'])

print("[INFO] After Epoch-Based Bootstrapping + SMOTE:")
print(pd.Series(y_epoch).value_counts(), "\n")

# ----------------------------
# 5️⃣ ANOVA Feature Selection
# ----------------------------
F_values, p_values = f_classif(X_epoch, y_epoch)
anova_df = pd.DataFrame({'Feature': X_epoch.columns, 'F_value': F_values, 'p_value': p_values})
selected_features = anova_df[anova_df['p_value'] < 0.05]['Feature'].tolist()
X_selected = X_epoch[selected_features]

print("[INFO] Selected Features via ANOVA (<0.05):")
print(selected_features, "\n")
print("[INFO] Class Distribution After ANOVA:")
print(pd.Series(y_epoch).value_counts(), "\n")

# ----------------------------
# 6️⃣ Train-Test Split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y_epoch, test_size=0.2, stratify=y_epoch, random_state=42
)
print("[INFO] Training Class Distribution:")
print(pd.Series(y_train).value_counts(), "\n")
print("[INFO] Testing Class Distribution:")
print(pd.Series(y_test).value_counts(), "\n")

# ----------------------------
# 7️⃣ Scaling
# ----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ----------------------------
# 8️⃣ SMOTE on Training Data (optional here since already applied in epoch)
# ----------------------------
# Already augmented, can skip extra SMOTE, or repeat lightly if desired.

# ----------------------------
# 9️⃣ Train Models
# ----------------------------
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb.fit(X_train_scaled, y_train)

ensemble = VotingClassifier(estimators=[('RF', rf), ('XGB', xgb)], voting='soft')
ensemble.fit(X_train_scaled, y_train)

# ----------------------------
# 10️⃣ Prediction with Adjusted Probabilities
# ----------------------------
# Adjust based on original distribution
orig_dist = y.value_counts() / len(y)
boot_dist = pd.Series(y_epoch).value_counts() / len(y_epoch)

def predict_adjusted_proba(model, scaler, X_test_df):
    X_scaled = scaler.transform(X_test_df)
    raw_probs = model.predict_proba(X_scaled)
    classes = model.classes_
    
    adjusted_probs = []
    for prob_vector in raw_probs:
        adj = prob_vector * (orig_dist.loc[classes].values / boot_dist.loc[classes].values)
        adj /= adj.sum()  # normalize
        adjusted_probs.append(adj)
    return np.array(adjusted_probs)

y_pred_rf_adj = np.argmax(predict_adjusted_proba(rf, scaler, X_test), axis=1)
y_pred_xgb_adj = np.argmax(predict_adjusted_proba(xgb, scaler, X_test), axis=1)
y_pred_ens_adj = np.argmax(predict_adjusted_proba(ensemble, scaler, X_test), axis=1)

# ----------------------------
# 11️⃣ Evaluation
# ----------------------------
def print_report(y_true, y_pred, model_name):
    print(f"============ {model_name} REPORT ============")
    print(classification_report(y_true, y_pred))
    print("Accuracy:", accuracy_score(y_true, y_pred), "\n")

print_report(y_test, y_pred_rf_adj, "RANDOM FOREST")
print_report(y_test, y_pred_xgb_adj, "XGBOOST")
print_report(y_test, y_pred_ens_adj, "ENSEMBLE")


[INFO] Original Class Distribution:
Career_Interest
1    64
0    53
3    51
2    41
Name: count, dtype: int64 

[INFO] After Epoch-Based Bootstrapping + SMOTE:
Career_Interest
0    320
1    320
2    320
3    320
Name: count, dtype: int64 

[INFO] Selected Features via ANOVA (<0.05):
['Gender', 'Age_Group', 'Aptitude', 'Parents_Education_Encoded', 'Family_Lifestyle_Encoded', 'Math', 'English', 'Bio', 'Chemistry', 'Physics', 'ICT', 'Business', 'Average_Score', 'if_HS_Student', 'Study_Method', 'Study_Habit', 'English_Proficiency', 'IELTS_Score', 'Favorite_Subject', 'Personality_Trait', 'Address', 'Study_Country', 'Chosen_University', 'Building miniatures / models', 'DIY projects', 'Debate / Public Speaking', 'Hackathons / App development projects', 'Programming / coding clubs', 'Science clubs', 'Student Council / Leadership Roles', 'Volunteering at hospitals, clinics, or NGOs', 'DIY / Project recognition', 'Debate / Public Speaking awards', 'Hackathon / App competition prize', 'Model buil

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       1.00      0.98      0.99        64
           1       1.00      1.00      1.00        64
           2       1.00      1.00      1.00        64
           3       0.98      1.00      0.99        64

    accuracy                           1.00       256
   macro avg       1.00      1.00      1.00       256
weighted avg       1.00      1.00      1.00       256

Accuracy: 0.99609375 

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        64
           1       0.97      1.00      0.98        64
           2       1.00      1.00      1.00        64
           3       1.00      0.98      0.99        64

    accuracy                           0.99       256
   macro avg       0.99      0.99      0.99       256
weighted avg       0.99      0.99      0.99       256

Accuracy: 0.9921875 

              precision    recall  f1-score   support

           0       1.00      0

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

# ----------------------------
# 1️⃣ Load & Clean Dataset
# ----------------------------
df = pd.read_csv("survey_data_cleaned.csv")
drop_cols = ['Timestamp', 'Date_Of_Birth', 'Age']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# ----------------------------
# 2️⃣ Split Target and Features
# ----------------------------
X = df.drop(columns=['Career_Interest'])
y = df['Career_Interest']

# Encode categorical columns
le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col].astype(str))
y_encoded = le.fit_transform(y)

# ----------------------------
# 3️⃣ Epoch-Based Bootstrapping + Feature-Weighted SMOTE
# ----------------------------
epochs = 5
expansion_factor = 5
X_epoch = X.copy()
y_epoch = pd.Series(y_encoded.copy())

orig_class_counts = y_epoch.value_counts()

# Compute ANOVA-based feature weights
F_values, _ = f_classif(X_epoch, y_epoch)
importance_dict = dict(zip(X_epoch.columns, F_values / F_values.sum()))
feature_weights = np.array([importance_dict[f] for f in X_epoch.columns])

for epoch in range(epochs):
    df_train = pd.DataFrame(X_epoch, columns=X.columns)
    df_train['Career_Interest'] = y_epoch.values
    df_list = []
    
    target_count = orig_class_counts.max() * expansion_factor
    
    for cls in df_train['Career_Interest'].unique():
        cls_df = df_train[df_train['Career_Interest'] == cls]
        cls_resampled = resample(cls_df, replace=True, n_samples=target_count, random_state=42 + epoch)
        df_list.append(cls_resampled)
    
    df_bootstrap = pd.concat(df_list)
    
    # Feature-weighted scaling before SMOTE
    X_weighted = df_bootstrap.drop(columns=['Career_Interest']).values * feature_weights
    smote = SMOTE(random_state=42 + epoch)
    X_res, y_res = smote.fit_resample(X_weighted, df_bootstrap['Career_Interest'])
    
    # Restore scale
    X_epoch = X_res / feature_weights
    y_epoch = y_res

print("[INFO] After Epoch-Based Bootstrapping + FW-SMOTE:")
print(pd.Series(y_epoch).value_counts(), "\n")

# ----------------------------
# 4️⃣ ANOVA Feature Selection
# ----------------------------
F_values, p_values = f_classif(pd.DataFrame(X_epoch, columns=X.columns), y_epoch)
anova_df = pd.DataFrame({'Feature': X.columns, 'F_value': F_values, 'p_value': p_values})
selected_features = anova_df[anova_df['p_value'] < 0.05]['Feature'].tolist()
X_selected = pd.DataFrame(X_epoch, columns=X.columns)[selected_features]

# ----------------------------
# 5️⃣ Train-Test Split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y_epoch, test_size=0.2, stratify=y_epoch, random_state=42
)

# ----------------------------
# 6️⃣ Scaling
# ----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ----------------------------
# 7️⃣ Train Models
# ----------------------------
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb.fit(X_train_scaled, y_train)

ensemble = VotingClassifier(estimators=[('RF', rf), ('XGB', xgb)], voting='soft')
ensemble.fit(X_train_scaled, y_train)

# ----------------------------
# 8️⃣ Prediction with Adjusted Probabilities
# ----------------------------
orig_dist = y.value_counts() / len(y)
boot_dist = pd.Series(y_epoch).value_counts() / len(y_epoch)

def predict_adjusted_proba(model, scaler, X_test_df):
    X_scaled = scaler.transform(X_test_df)
    raw_probs = model.predict_proba(X_scaled)
    classes = model.classes_
    
    adjusted_probs = []
    for prob_vector in raw_probs:
        adj = prob_vector * (orig_dist.loc[classes].values / boot_dist.loc[classes].values)
        adj /= adj.sum()
        adjusted_probs.append(adj)
    return np.array(adjusted_probs)

y_pred_rf_adj = np.argmax(predict_adjusted_proba(rf, scaler, X_test), axis=1)
y_pred_xgb_adj = np.argmax(predict_adjusted_proba(xgb, scaler, X_test), axis=1)
y_pred_ens_adj = np.argmax(predict_adjusted_proba(ensemble, scaler, X_test), axis=1)

# ----------------------------
# 9️⃣ Evaluation
# ----------------------------
def print_report(y_true, y_pred, model_name):
    print(f"============ {model_name} REPORT ============")
    print(classification_report(y_true, y_pred))
    print("Accuracy:", accuracy_score(y_true, y_pred), "\n")

print_report(y_test, y_pred_rf_adj, "RANDOM FOREST")
print_report(y_test, y_pred_xgb_adj, "XGBOOST")
print_report(y_test, y_pred_ens_adj, "ENSEMBLE")


[INFO] After Epoch-Based Bootstrapping + FW-SMOTE:
Career_Interest
0    320
1    320
2    320
3    320
Name: count, dtype: int64 



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       1.00      0.98      0.99        64
           1       1.00      1.00      1.00        64
           2       1.00      1.00      1.00        64
           3       0.98      1.00      0.99        64

    accuracy                           1.00       256
   macro avg       1.00      1.00      1.00       256
weighted avg       1.00      1.00      1.00       256

Accuracy: 0.99609375 

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        64
           1       0.97      1.00      0.98        64
           2       1.00      1.00      1.00        64
           3       1.00      0.98      0.99        64

    accuracy                           0.99       256
   macro avg       0.99      0.99      0.99       256
weighted avg       0.99      0.99      0.99       256

Accuracy: 0.9921875 

              precision    recall  f1-score   support

           0       1.00      0