In [10]:
#v1
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

RANDOM_STATE = 42
AUGMENTATION_FACTOR = 3
NOISE_STD_DEV = 0.3

# ----------------------------
# 1) Load & clean
# ----------------------------
df = pd.read_csv("survey_data_cleaned.csv")
drop_cols = ['Timestamp', 'Date_Of_Birth', 'Age']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

target_col = 'Career_Interest'
X = df.drop(columns=[target_col])
y = df[target_col].astype(str)

print("[INFO] Original shape:", df.shape)
print("[INFO] Original class counts:\n", y.value_counts(), "\n")

# ----------------------------
# 2) Encode categorical features
# ----------------------------
X_enc = X.copy()
encoders = {}
for col in X_enc.columns:
    if X_enc[col].dtype == 'object' or X_enc[col].dtype.name == 'category':
        le = LabelEncoder()
        X_enc[col] = le.fit_transform(X_enc[col].astype(str))
        encoders[col] = le

le_target = LabelEncoder()
y_enc = le_target.fit_transform(y)

# Combine for augmentation
df_enc = X_enc.copy()
df_enc[target_col] = y_enc

# ----------------------------
# 3) Augmentation function
# ----------------------------
def create_augmented_data(df_original, factor, target_column, noise_std_dev=0.05):
    df_list = [df_original.copy()]
    feature_cols = df_original.columns.drop(target_column).tolist()

    for _ in range(factor - 1):
        df_new = df_original.copy()
        for col in feature_cols:
            noise = np.random.normal(0, noise_std_dev * df_new[col].std(), size=len(df_new))
            df_new[col] = df_new[col] + noise
            df_new[col] = np.maximum(df_new[col], 0)
        df_list.append(df_new)

    return pd.concat(df_list, ignore_index=True)

df_augmented = create_augmented_data(df_enc, AUGMENTATION_FACTOR, target_col, NOISE_STD_DEV)
print(f"[INFO] Data augmented. New total size: {df_augmented.shape[0]} samples.")

X_augmented = df_augmented.drop(columns=[target_col])
y_augmented = df_augmented[target_col]

# ----------------------------
# 4) Train/test split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_augmented, y_augmented, test_size=0.2, stratify=y_augmented, random_state=RANDOM_STATE
)

print("[INFO] Train shape:", X_train.shape, " Test shape:", X_test.shape)
print("[INFO] Train class dist:\n", pd.Series(y_train).value_counts())
print("[INFO] Test class dist:\n", pd.Series(y_test).value_counts(), "\n")

# ----------------------------
# 5) Scaling
# ----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ----------------------------
# 6) ANOVA feature selection
# ----------------------------
F_vals, p_vals = f_classif(X_train_scaled, y_train)
anova_df = pd.DataFrame({'Feature': X_train.columns, 'F_value': F_vals, 'p_value': p_vals})
selected_features = anova_df[anova_df['p_value'] < 0.05]['Feature'].tolist()
if len(selected_features) == 0:
    selected_features = X_train.columns.tolist()

X_train_sel = pd.DataFrame(X_train_scaled, columns=X_train.columns)[selected_features].values
X_test_sel = pd.DataFrame(X_test_scaled, columns=X_test.columns)[selected_features].values
print("[INFO] ANOVA selected features:", selected_features, "\n")

# ----------------------------
# 7) Train models
# ----------------------------
models = {
    'RF': RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE),
    'XGB': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM_STATE),
    'MLP': MLPClassifier(hidden_layer_sizes=(128,64), max_iter=300, random_state=RANDOM_STATE),
    'LogReg': LogisticRegression(max_iter=500, random_state=RANDOM_STATE),
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
    'MMR': LinearRegression()
}

trained = {}
for name, mdl in models.items():
    print(f"[INFO] Training {name} ...")
    mdl.fit(X_train_sel, y_train)
    trained[name] = mdl

# Ensemble of RF + XGB
ensemble = VotingClassifier(
    estimators=[('RF', trained['RF']), ('XGB', trained['XGB'])],
    voting='soft'
)
ensemble.fit(X_train_sel, y_train)
trained['ENSEMBLE'] = ensemble

# ----------------------------
# 8) Predict & evaluate
# ----------------------------
def predict_model(name, model, X):
    if name == 'MMR':
        y_hat = np.rint(model.predict(X)).astype(int)
        y_hat = np.clip(y_hat, y_enc.min(), y_enc.max())
        return y_hat
    else:
        return model.predict(X)

for name, mdl in trained.items():
    y_pred = predict_model(name, mdl, X_test_sel)
    print(f"\n===== {name} =====")
    print(classification_report(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))


[INFO] Original shape: (209, 42)
[INFO] Original class counts:
 Career_Interest
1    64
0    53
3    51
2    41
Name: count, dtype: int64 

[INFO] Data augmented. New total size: 627 samples.
[INFO] Train shape: (501, 41)  Test shape: (126, 41)
[INFO] Train class dist:
 Career_Interest
1    154
0    127
3    122
2     98
Name: count, dtype: int64
[INFO] Test class dist:
 Career_Interest
1    38
0    32
3    31
2    25
Name: count, dtype: int64 

[INFO] ANOVA selected features: ['Gender', 'Age_Group', 'Aptitude', 'Family_Lifestyle_Encoded', 'Math', 'English', 'Bio', 'Chemistry', 'Physics', 'Business', 'Average_Score', 'if_HS_Student', 'Study_Method', 'English_Proficiency', 'IELTS_Score', 'Favorite_Subject', 'Address', 'Building miniatures / models', 'Hackathons / App development projects', 'Programming / coding clubs', 'Science clubs', 'Volunteering at hospitals, clinics, or NGOs', 'DIY / Project recognition', 'Debate / Public Speaking awards', 'Model building recognition', 'Other (plea

In [11]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

RANDOM_STATE = 42
AUGMENTATION_FACTOR = 3
NOISE_STD_DEV = 0.3

# ----------------------------
# 1) Load & clean
# ----------------------------
df = pd.read_csv("survey_data_cleaned.csv")
drop_cols = ['Timestamp', 'Date_Of_Birth', 'Age']
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

target_col = 'Career_Interest'
X = df.drop(columns=[target_col])
y = df[target_col].astype(str)

print("[INFO] Original shape:", df.shape)
print("[INFO] Original class counts:\n", y.value_counts(), "\n")

# ----------------------------
# 2) Encode categorical features
# ----------------------------
X_enc = X.copy()
encoders = {}
for col in X_enc.columns:
    if X_enc[col].dtype == 'object' or X_enc[col].dtype.name == 'category':
        le = LabelEncoder()
        X_enc[col] = le.fit_transform(X_enc[col].astype(str))
        encoders[col] = le

le_target = LabelEncoder()
y_enc = le_target.fit_transform(y)

# ----------------------------
# 3) Train/test split first
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_enc, y_enc, test_size=0.2, stratify=y_enc, random_state=RANDOM_STATE
)

print("[INFO] Train shape:", X_train.shape, " Test shape:", X_test.shape)
print("[INFO] Train class dist:\n", pd.Series(y_train).value_counts())
print("[INFO] Test class dist:\n", pd.Series(y_test).value_counts(), "\n")

# ----------------------------
# 4) Augmentation only on training data
# ----------------------------
# ----------------------------
# 4) Augmentation only on training data (fixed)
# ----------------------------
def create_augmented_data(df_original, factor, target_column, noise_std_dev=0.05):
    df_list = [df_original.copy()]
    feature_cols = df_original.columns.drop(target_column).tolist() if target_column in df_original.columns else df_original.columns.tolist()

    for _ in range(factor - 1):
        df_new = df_original.copy()
        for col in feature_cols:
            std = df_new[col].std()
            if std == 0:  # avoid zero std
                std = 1e-6
            noise = np.random.normal(0, noise_std_dev * std, size=len(df_new))
            df_new[col] = df_new[col] + noise
            # If categorical column (integers), round and clip
            if col in encoders:
                df_new[col] = np.rint(df_new[col]).astype(int)
                df_new[col] = np.clip(df_new[col], 0, df_original[col].max())
            else:
                df_new[col] = np.maximum(df_new[col], 0)
        df_list.append(df_new)

    df_aug = pd.concat(df_list, ignore_index=True)
    return df_aug

# Combine X_train and y_train for augmentation
df_train = X_train.copy()
df_train[target_col] = y_train
df_train_aug = create_augmented_data(df_train, AUGMENTATION_FACTOR, target_col, NOISE_STD_DEV)

X_train_aug = df_train_aug.drop(columns=[target_col])
y_train_aug = df_train_aug[target_col]

print(f"[INFO] Augmented training data shape: {X_train_aug.shape}, total samples: {y_train_aug.shape[0]}")
print("[INFO] Augmented class distribution:\n", pd.Series(y_train_aug).value_counts())


# ----------------------------
# 5) Scaling
# ----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_aug)
X_test_scaled = scaler.transform(X_test)

# ----------------------------
# 6) ANOVA feature selection
# ----------------------------
F_vals, p_vals = f_classif(X_train_scaled, y_train_aug)
anova_df = pd.DataFrame({'Feature': X_train_aug.columns, 'F_value': F_vals, 'p_value': p_vals})
selected_features = anova_df[anova_df['p_value'] < 0.05]['Feature'].tolist()
if len(selected_features) == 0:
    selected_features = X_train_aug.columns.tolist()

X_train_sel = pd.DataFrame(X_train_scaled, columns=X_train_aug.columns)[selected_features].values
X_test_sel = pd.DataFrame(X_test_scaled, columns=X_test.columns)[selected_features].values
print("[INFO] ANOVA selected features:", selected_features, "\n")

# ----------------------------
# 7) Train models
# ----------------------------
models = {
    'RF': RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE),
    'XGB': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM_STATE),
    'MLP': MLPClassifier(hidden_layer_sizes=(128,64), max_iter=300, random_state=RANDOM_STATE),
    'LogReg': LogisticRegression(max_iter=500, random_state=RANDOM_STATE),
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
    'MMR': LinearRegression()
}

trained = {}
for name, mdl in models.items():
    print(f"[INFO] Training {name} ...")
    mdl.fit(X_train_sel, y_train_aug)
    trained[name] = mdl

# Ensemble of RF + XGB
ensemble = VotingClassifier(
    estimators=[('RF', trained['RF']), ('XGB', trained['XGB'])],
    voting='soft'
)
ensemble.fit(X_train_sel, y_train_aug)
trained['ENSEMBLE'] = ensemble

# ----------------------------
# 8) Predict & evaluate
# ----------------------------
def predict_model(name, model, X):
    if name == 'MMR':
        y_hat = np.rint(model.predict(X)).astype(int)
        y_hat = np.clip(y_hat, y_enc.min(), y_enc.max())
        return y_hat
    else:
        return model.predict(X)

for name, mdl in trained.items():
    y_pred = predict_model(name, mdl, X_test_sel)
    print(f"\n===== {name} =====")
    print(classification_report(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))


[INFO] Original shape: (209, 42)
[INFO] Original class counts:
 Career_Interest
1    64
0    53
3    51
2    41
Name: count, dtype: int64 

[INFO] Train shape: (167, 41)  Test shape: (42, 41)
[INFO] Train class dist:
 1    51
0    42
3    41
2    33
Name: count, dtype: int64
[INFO] Test class dist:
 1    13
0    11
3    10
2     8
Name: count, dtype: int64 

[INFO] Augmented training data shape: (501, 41), total samples: 501
[INFO] Augmented class distribution:
 Career_Interest
1    153
0    126
3    123
2     99
Name: count, dtype: int64
[INFO] ANOVA selected features: ['Gender', 'Age_Group', 'Aptitude', 'Family_Lifestyle_Encoded', 'Math', 'English', 'Bio', 'Chemistry', 'Physics', 'Business', 'Average_Score', 'if_HS_Student', 'Study_Method', 'English_Proficiency', 'Favorite_Subject', 'Personality_Trait', 'Address', 'Chosen_University', 'Building miniatures / models', 'DIY projects', 'Hackathons / App development projects', 'Programming / coding clubs', 'Science clubs', 'Student Counci