In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import f_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from imblearn.over_sampling import ADASYN

RANDOM_STATE = 42

# ----------------------------
# 1) Load data and drop unnecessary columns
# ----------------------------
# Column names used below: the label to predict and the fields that are
# removed before modelling whenever the CSV contains them.
target_col = 'Career_Interest'
drop_cols = ['Timestamp', 'Date_Of_Birth', 'Age']

# errors='ignore' skips any listed column that is absent from the file.
df = pd.read_csv("survey_data_cleaned.csv").drop(columns=drop_cols, errors='ignore')

# The target is cast to str so every label is handled as a category name.
y = df[target_col].astype(str)
X = df.drop(columns=[target_col])

print("[INFO] Original shape:", df.shape)
print("[INFO] Original class counts:\n", y.value_counts(), "\n")

# ----------------------------
# 2) Encode categorical features
# ----------------------------
# Work on a copy so the raw feature frame `X` stays untouched.
X_enc = X.copy()

# Columns held as Python objects (typically text) or pandas categoricals;
# every other column is left as it is.
categorical_cols = [
    col for col in X_enc.columns
    if X_enc[col].dtype == 'object' or X_enc[col].dtype.name == 'category'
]

# One fitted LabelEncoder per categorical column, kept in `encoders` so the
# integer codes can be mapped back to the original values later.
encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in encoders.items():
    X_enc[col] = encoder.fit_transform(X_enc[col].astype(str))

# Target labels -> integer class codes (le_target.classes_ holds the names).
le_target = LabelEncoder()
y_enc = le_target.fit_transform(y)

# ----------------------------
# 3) Train/test split (row indices only)
# ----------------------------
# The split is done BEFORE any statistic is estimated. Fitting the scaler and
# the ANOVA F-test on all rows (as was done previously) lets the held-out rows
# and their labels influence preprocessing and feature choice, which makes the
# test scores optimistic. Splitting row indices keeps the stratified 80/20
# partition and the seed unchanged.
from sklearn.model_selection import train_test_split
row_ids = np.arange(len(y_enc))
train_rows, test_rows = train_test_split(
    row_ids, test_size=0.2, stratify=y_enc, random_state=RANDOM_STATE
)
y_train, y_test = y_enc[train_rows], y_enc[test_rows]

# ----------------------------
# 4) Scaling + ANOVA feature selection (fitted on training rows only)
# ----------------------------
scaler = StandardScaler()
scaler.fit(X_enc.iloc[train_rows])      # mean/std come from training rows only
X_scaled = scaler.transform(X_enc)      # the same transform is applied to all rows

# One-way ANOVA F-test of each feature against the class label, computed on
# the training rows only.
F_vals, p_vals = f_classif(X_scaled[train_rows], y_train)
anova_df = pd.DataFrame({'Feature': X_enc.columns, 'F_value': F_vals, 'p_value': p_vals})

# Keep features significant at the 5% level; fall back to every feature when
# none passes, so the models always receive at least one column.
selected_features = anova_df.loc[anova_df['p_value'] < 0.05, 'Feature'].tolist()
if not selected_features:
    selected_features = X_enc.columns.tolist()

X_sel = pd.DataFrame(X_scaled, columns=X_enc.columns)[selected_features].values
print("[INFO] ANOVA selected features:", selected_features, "\n")

X_train, X_test = X_sel[train_rows], X_sel[test_rows]

print("[INFO] Train shape:", X_train.shape, " Test shape:", X_test.shape)
print("[INFO] Train class dist:\n", pd.Series(y_train).value_counts())
print("[INFO] Test class dist:\n", pd.Series(y_test).value_counts(), "\n")

# ----------------------------
# 5) Adaptive synthetic sampling (ADASYN)
# ----------------------------
# For an over-sampler, a dict `sampling_strategy` maps each class label to the
# TARGET number of rows for that class after resampling (not the number of
# synthetic rows to add). imbalanced-learn rejects a target below the class's
# current size, so each target is floored at the observed count: classes under
# the cap are grown towards it, larger classes are left as they are.
# ADASYN only approximates the target, so the resulting counts can differ
# slightly from the requested value.
unique_classes, class_counts = np.unique(y_train, return_counts=True)
max_samples_per_class = 200  # adjust as needed
sampling_strategy = {
    cls: max(max_samples_per_class, int(count))
    for cls, count in zip(unique_classes, class_counts)
}

# Only the training split is resampled; the test split keeps its real rows.
adasyn = ADASYN(sampling_strategy=sampling_strategy, random_state=RANDOM_STATE)
X_train_res, y_train_res = adasyn.fit_resample(X_train, y_train)
print("[INFO] After ADASYN, train class distribution:\n", pd.Series(y_train_res).value_counts(), "\n")

# ----------------------------
# 6) Define models
# ----------------------------
# Candidate estimators keyed by the short name used in logs and reports.
# Insertion order is the order in which they are cross-validated and scored.
# 'MMR' is a plain LinearRegression: it outputs continuous values, so the
# code that consumes it rounds its predictions to integer class codes.
models = dict(
    RF=RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE),
    XGB=XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=RANDOM_STATE,
    ),
    MLP=MLPClassifier(
        hidden_layer_sizes=(64, 32),
        max_iter=300,
        random_state=RANDOM_STATE,
    ),
    LogReg=LogisticRegression(max_iter=500, random_state=RANDOM_STATE),
    SVM=SVC(probability=True, random_state=RANDOM_STATE),
    MMR=LinearRegression(),
)

# ----------------------------
# 7) 5-Fold Cross-validation with bootstrapping
# ----------------------------
def cross_val_metrics(model, X, y, n_splits=5):
    """Run stratified k-fold CV, training each fold on a bootstrap resample.

    For every fold a fresh clone of ``model`` is fitted on a bootstrap sample
    (drawn with replacement, same size) of that fold's training part and
    scored on the fold's validation part.

    Args:
        model: Unfitted scikit-learn compatible estimator. It is never fitted
            itself; only clones of it are.
        X: Feature matrix supporting integer-array row indexing
            (e.g. a NumPy array).
        y: Integer class codes, one per row of ``X``.
        n_splits: Number of stratified folds.

    Returns:
        tuple: ``(best_model, metrics)`` where ``best_model`` is the fitted
        clone from the fold with the highest validation accuracy (it has seen
        only that fold's bootstrapped training part) and ``metrics`` maps
        ``'precision'``, ``'recall'``, ``'f1'`` (all macro-averaged) and
        ``'accuracy'`` to their mean over the folds.
    """
    # Imported locally so the function does not rely on a file-level import.
    from sklearn.base import clone

    y = np.asarray(y)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    # Dedicated seeded generator: the bootstrap draws are reproducible and
    # identical for every model, so model comparisons are like-for-like.
    rng = np.random.RandomState(RANDOM_STATE)
    rounds_to_class = isinstance(model, LinearRegression)

    precisions, recalls, f1s, accuracies = [], [], [], []
    best_model = None
    best_acc = -np.inf  # any real accuracy (even 0.0) beats this

    for train_idx, val_idx in skf.split(X, y):
        X_tr, X_val = X[train_idx], X[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        # Bootstrapping: resample the training part with replacement.
        boot = rng.choice(len(X_tr), len(X_tr), replace=True)

        # A new clone per fold, so the model kept as "best" is really the one
        # trained in that fold and is not overwritten by later folds.
        fold_model = clone(model)
        fold_model.fit(X_tr[boot], y_tr[boot])

        y_pred = fold_model.predict(X_val)
        if rounds_to_class:
            # The regressor emits continuous values: round to the nearest
            # class code and clamp to the observed label range, so stray
            # values do not appear as extra classes in the macro averages.
            y_pred = np.clip(np.rint(y_pred), y.min(), y.max()).astype(int)

        precisions.append(precision_score(y_val, y_pred, average='macro', zero_division=0))
        recalls.append(recall_score(y_val, y_pred, average='macro', zero_division=0))
        f1s.append(f1_score(y_val, y_pred, average='macro', zero_division=0))
        acc = accuracy_score(y_val, y_pred)
        accuracies.append(acc)

        if acc > best_acc:
            best_acc = acc
            best_model = fold_model

    metrics = {
        'precision': np.mean(precisions),
        'recall': np.mean(recalls),
        'f1': np.mean(f1s),
        'accuracy': np.mean(accuracies)
    }
    return best_model, metrics

# name -> model returned by cross-validation / name -> mean fold metrics
trained_models, cv_metrics = {}, {}

for name in models:
    print(f"[INFO] Cross-validating {name} ...")
    # cross_val_metrics returns (model, metrics dict); store each under `name`.
    trained_models[name], cv_metrics[name] = cross_val_metrics(
        models[name], X_train_res, y_train_res
    )

# ----------------------------
# 8) Ensemble of RF + XGB
# ----------------------------
# Combine the random forest and XGBoost models with soft voting, then train
# the combination on the resampled training data. fit() returns the estimator
# itself, so the fitted ensemble is registered next to the other models.
ensemble = VotingClassifier(
    voting='soft',
    estimators=[(key, trained_models[key]) for key in ('RF', 'XGB')],
)
trained_models['ENSEMBLE'] = ensemble.fit(X_train_res, y_train_res)

# ----------------------------
# 9) Prediction and evaluation
# ----------------------------
def predict_model(name, model, X):
    """Return class predictions of ``model`` for the rows of ``X``.

    Args:
        name: Key of the model. ``'MMR'`` marks the linear-regression model,
            whose continuous output must be converted to class codes.
        model: Fitted estimator exposing ``predict``.
        X: Feature rows to predict.

    Returns:
        For ``'MMR'``: predictions rounded to the nearest integer and clipped
        to ``[0, number_of_classes - 1]`` (class count taken from the global
        ``y_enc``). For every other name: ``model.predict(X)`` unchanged.
    """
    raw = model.predict(X)
    if name != 'MMR':
        return raw

    # Highest valid class code, derived from the encoded target.
    top_code = len(np.unique(y_enc)) - 1
    return np.clip(np.rint(raw).astype(int), 0, top_code)

# Macro-averaged scorers, listed in the order they are reported.
macro_scorers = (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
)

# Score every trained model (including the ensemble) on the held-out test set.
for name, mdl in trained_models.items():
    y_pred = predict_model(name, mdl, X_test)
    print(f"\n===== {name} =====")
    for label, scorer in macro_scorers:
        print(label, scorer(y_test, y_pred, average='macro', zero_division=0))
    print("Accuracy:", accuracy_score(y_test, y_pred))


[INFO] Original shape: (209, 42)
[INFO] Original class counts:
 Career_Interest
1    64
0    53
3    51
2    41
Name: count, dtype: int64 

[INFO] ANOVA selected features: ['Gender', 'Math', 'Bio', 'Chemistry', 'Physics', 'Business', 'Average_Score', 'Study_Method', 'English_Proficiency', 'Favorite_Subject', 'Building miniatures / models', 'Hackathons / App development projects', 'Programming / coding clubs', 'Volunteering at hospitals, clinics, or NGOs', 'Model building recognition', 'Programming / Coding award'] 

[INFO] Train shape: (167, 16)  Test shape: (42, 16)
[INFO] Train class dist:
 1    51
0    42
3    41
2    33
Name: count, dtype: int64
[INFO] Test class dist:
 1    13
0    11
3    10
2     8
Name: count, dtype: int64 

[INFO] After ADASYN, train class distribution:
 2    207
0    202
1    198
3    196
Name: count, dtype: int64 

[INFO] Cross-validating RF ...
[INFO] Cross-validating XGB ...
[INFO] Cross-validating MLP ...
