## **IMPORTING THE LIBRARIES**





In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

### **IMPORTING THE DATASET**

In [None]:
# Load the placement data; the relative path is resolved against the current
# working directory, so the CSV must sit next to the notebook when it is run.
dataset=pd.read_csv('placedata v2.0 synthetic.csv')


In [None]:
# Preview the first rows as a quick sanity check of the load.
dataset.head()

### **DATA PREPROCESSING**

In [None]:
# Remove the 'StudentID' column when it exists; errors='ignore' turns the call
# into a no-op when the column is absent.
dataset = dataset.drop(columns='StudentID', errors='ignore')

Encode Categorical Variables & Target

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# One encoder per column so each fitted mapping (classes_) stays available later.
le_status, le_extra, le_training = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Overwrite each column with the integer codes produced by its own encoder.
for column_name, encoder in (
    ('PlacementStatus', le_status),
    ('ExtracurricularActivities', le_extra),
    ('PlacementTraining', le_training),
):
    dataset[column_name] = encoder.fit_transform(dataset[column_name])

Separate Features & Target

In [None]:
# Features: every column of `dataset` except 'PlacementStatus'.
X = dataset.drop('PlacementStatus', axis=1)
# Target: the 'PlacementStatus' column.
y = dataset['PlacementStatus']

Train-Test Split (No Data Leakage)

In [None]:
# Hold out 20% of the rows for testing. stratify=y splits in a stratified
# fashion using y as the class labels; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


Checking for null values

In [None]:
# Count missing values per column.
# NOTE(review): this runs after the encoding and train/test split cells, so any
# nulls would already be present in X_train / X_test — consider checking earlier.
dataset.isna().sum()

Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only and reuse the fitted
# transformation for the test features.
sc = StandardScaler()
# NOTE(review): X_train / X_test are overwritten in place, so re-running this
# cell without re-running the split rescales already-scaled data.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from imblearn.over_sampling import SMOTE

# Resample the (scaled) training split with SMOTE; the test split is untouched.
# NOTE(review): X_train_res / y_train_res are not referenced by any later cell
# shown in this notebook — every model below is fit on X_train / y_train.
# Either train on the resampled arrays or drop this cell.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


In [None]:
# Two derived columns: a CGPA x aptitude-score product and a soft-skills +
# extracurricular sum.
# NOTE(review): X was built from `dataset` in an earlier cell, so these columns
# are NOT part of X_train / X_test and never reach the models; they only show up
# in the describe()/correlation output that follows.
dataset['CGPA_Aptitude'] = dataset['CGPA'] * dataset['AptitudeTestScore']
dataset['SoftSkills_Extracurricular'] = dataset['SoftSkillsRating'] + dataset['ExtracurricularActivities']

### **DATA VISUALIZATION**

In [None]:
# Summary statistics of the dataset columns.
dataset.describe()

Correlation Heat Maps

In [None]:
import seaborn as sns
# Pairwise correlations of all dataset columns, with the coefficient printed in each cell.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(dataset.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

Count Plots

In [None]:
# Class balance of the target column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=dataset, x='PlacementStatus', palette='Set2', ax=ax)
ax.set_title('Placement Status Distribution')
plt.show()


In [None]:
categorical_features = ['PlacementTraining', 'ExtracurricularActivities']

# One figure per categorical column, bars split by placement status.
for feature in categorical_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=dataset, x=feature, hue='PlacementStatus', palette='Set2', ax=ax)
    ax.set_title(f'{feature} vs Placement Status')
    plt.show()

Boxplots for Numerical Features vs Placement

In [None]:
numerical_features = [
    'CGPA', 'Internships', 'Projects', 'Workshops/Certifications',
    'AptitudeTestScore', 'SoftSkillsRating', 'SSC_Marks', 'HSC_Marks',
]

# One figure per numerical column: its distribution within each placement status.
for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=dataset, x='PlacementStatus', y=feature, palette='Set3', ax=ax)
    ax.set_title(f'{feature} vs Placement Status')
    plt.show()


### **TESTING MODELS**

In [None]:
# Import required libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import matplotlib.pyplot as plt

# Split the dataset - This line is removed as X_train, X_test, y_train, y_test are already defined.

# Candidate classifiers as [estimator, display name] pairs; later cells iterate
# over this list with `for model, name in models`.
models = [
    # probability=True is required for predict_proba (used by the ROC-AUC cells).
    # The probability estimates come from an internal, shuffled cross-validation,
    # so random_state is set to keep the reported metrics reproducible.
    [SVC(probability=True, random_state=42), "Support Vector Machine"],
    [LogisticRegression(C=0.1, max_iter=1000), "Logistic Regression"],
    [RandomForestClassifier(n_estimators=100, random_state=0), "Random Forest"],
    [DecisionTreeClassifier(max_depth=7, random_state=0), "Decision Tree"],
    [KNeighborsClassifier(n_neighbors=7), "K-Nearest Neighbors"],
    [
        xgb.XGBClassifier(
            objective="binary:logistic",
            random_state=42,
            use_label_encoder=False,
            eval_metric='logloss',
        ),
        "XGBoost",
    ],
    [AdaBoostClassifier(n_estimators=25, random_state=0), "AdaBoost"],
    [
        GradientBoostingClassifier(
            n_estimators=50, learning_rate=0.1, max_depth=10, random_state=0
        ),
        "Gradient Boosting",
    ],
]

# Fit each candidate on the training split, then report its test accuracy and
# show its test confusion matrix.
for model, name in models:
    print(f"--- {name} ---")
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Accuracy as a percentage
    acc = 100 * accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.2f}%")

    # Confusion matrix, titled through the axes the display created
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap=plt.cm.Blues)
    disp.ax_.set_title(f'{name} - Confusion Matrix')
    plt.show()
    print("\n")

Combine models to improve accuracy

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of Random Forest, XGBoost and Logistic Regression.
voting_clf = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=0)),
        (
            'xgb',
            xgb.XGBClassifier(
                objective="binary:logistic",
                random_state=42,
                use_label_encoder=False,
                eval_metric='logloss',
            ),
        ),
        ('lr', LogisticRegression(C=10, max_iter=1000)),
    ],
    voting='soft',
)

y_pred = voting_clf.fit(X_train, y_train).predict(X_test)
print("Voting Classifier Accuracy:", 100 * accuracy_score(y_test, y_pred))

# Confusion matrix on the test split
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
disp.ax_.set_title('Voting Classifier - Confusion Matrix')
plt.show()
print("\n")

Accuracy + ROC-AUC on Test Set

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

print("MODEL PERFORMANCE (TEST SET)\n")

for model, name in models:
    # Refit, then take the probability of class 1 for every test row.
    y_prob = model.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    # Hard labels at the default 0.5 cut-off.
    y_pred = (y_prob >= 0.5).astype(int)

    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_prob)

    print(name)
    print(f"Accuracy: {acc * 100:.2f}%")
    print(f"ROC-AUC : {roc:.4f}\n")

VotingClassifier (LR + SVM + AdaBoost)

In [None]:
# Soft-voting ensemble of LR + SVM + AdaBoost. weights=[2, 2, 1] gives the LR and
# SVM probabilities twice the weight of AdaBoost's in the averaged probability.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(max_iter=1000)),
        # SVC's probability estimates rely on an internal shuffled cross-validation
        # and AdaBoost seeds its base estimators; both are given a fixed
        # random_state so the metrics below are reproducible across runs.
        ('svm', SVC(probability=True, C=10, random_state=42)),
        ('ada', AdaBoostClassifier(n_estimators=200, learning_rate=0.5, random_state=42))
    ],
    voting='soft',
    weights=[2, 2, 1]
)

voting_clf.fit(X_train, y_train)

# Probability of class 1, and hard labels at the default 0.5 cut-off.
y_prob_voting = voting_clf.predict_proba(X_test)[:, 1]
y_pred_voting = (y_prob_voting >= 0.5).astype(int)

print("Voting Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred_voting) * 100)
print("ROC-AUC :", roc_auc_score(y_test, y_prob_voting))


Threshold Tuning (Best Accuracy)

In [None]:
# Select the decision threshold without looking at the test labels.
# Choosing the threshold that maximises *test* accuracy makes the reported number
# optimistically biased, so candidates are scored on out-of-fold probabilities
# from the training split and the test set is evaluated once, at the chosen value.
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# Out-of-fold class-1 probabilities for every training row
# (cross_val_predict fits clones, so the fitted voting_clf is left untouched).
oof_prob_voting = cross_val_predict(
    voting_clf,
    X_train,
    y_train,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    method='predict_proba',
)[:, 1]

best_cv_acc = 0
best_t = 0

for t in np.arange(0.1, 0.9, 0.01):
    preds = (oof_prob_voting >= t).astype(int)
    acc = accuracy_score(y_train, preds)

    if acc > best_cv_acc:
        best_cv_acc = acc
        best_t = t

# Single evaluation of the selected threshold on the held-out test set.
best_acc = accuracy_score(y_test, (y_prob_voting >= best_t).astype(int))

print("\nBest Threshold:", round(best_t, 2))
print("CV Accuracy at Best Threshold:", round(best_cv_acc * 100, 2))
print("Best Accuracy (Tuned):", round(best_acc * 100, 2))


In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score # Added StratifiedKFold and cross_val_score

# 5-fold stratified cross-validation on the training split only.
# NOTE(review): X_train was scaled with a scaler fit on the whole training split,
# so each validation fold has influenced the scaling statistics; wrap the scaler
# and model in a Pipeline if strictly leak-free CV scores are needed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("\nCROSS-VALIDATION RESULTS (ROC-AUC)\n")

for model, name in models:
    scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=cv,
        scoring='roc_auc'
    )

    print(f"{name}")
    # Mean and spread over the five folds (the "±" was previously mis-encoded).
    print(f"ROC-AUC: {scores.mean():.4f} ± {scores.std():.4f}\n")

### **HYPERPARAMETER TUNING**

GridSearchCV Example (Random Forest)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Base estimator whose hyperparameters are searched.
rf = RandomForestClassifier(random_state=42)

# Hyperparameter grid: 3 * 4 * 3 * 3 * 2 = 216 candidates, each scored with 5-fold CV.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 'auto' is rejected by scikit-learn >= 1.3 and was only an alias of 'sqrt'
    # for classifiers, so it is replaced by a genuinely different option.
    'max_features': ['sqrt', 'log2']
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,                  # 5-fold cross-validation
    scoring='roc_auc',     # optimize ROC-AUC
    n_jobs=-1,             # use all CPUs
    verbose=2
)

# Fit the grid search on the training split only
grid_search.fit(X_train, y_train)

# Best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Best model (refit on the full training split by GridSearchCV)
best_rf = grid_search.best_estimator_

# Evaluate on test set
from sklearn.metrics import accuracy_score, roc_auc_score

y_pred = best_rf.predict(X_test)
y_prob = best_rf.predict_proba(X_test)[:, 1]

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Test ROC-AUC:", roc_auc_score(y_test, y_prob))


RandomizedSearchCV Example (XGBoost)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from scipy.stats import uniform, randint

# Base XGBoost classifier whose hyperparameters are sampled below.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Hyperparameter distributions.
# scipy's randint(low, high) draws integers from [low, high);
# uniform(loc, scale) draws floats from [loc, loc + scale].
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),     # i.e. 0.01 .. 0.31
    'subsample': uniform(0.6, 0.4),          # i.e. 0.6 .. 1.0
    'colsample_bytree': uniform(0.6, 0.4)    # i.e. 0.6 .. 1.0
}

# Randomized search: 50 sampled candidates, each scored by 5-fold CV on ROC-AUC.
rand_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=50,             # number of random combinations
    cv=5,
    scoring='roc_auc',
    verbose=2,
    n_jobs=-1,
    random_state=42        # makes the sampled candidates repeatable
)

# Fit on the training split only
rand_search.fit(X_train, y_train)

# Best hyperparameters
print("Best Hyperparameters:", rand_search.best_params_)

# Evaluate the best candidate on the held-out test split
best_xgb = rand_search.best_estimator_
y_pred = best_xgb.predict(X_test)
y_prob = best_xgb.predict_proba(X_test)[:, 1]

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Test ROC-AUC:", roc_auc_score(y_test, y_prob))


### **EXPLAINABILITY**

In [None]:
import shap
import matplotlib.pyplot as plt
import numpy as np
def shap_feature_importance(model, model_name, X_train, X_test, feature_names):
    """Fit ``model`` and draw SHAP summary and bar plots for the positive class.

    The explainer is selected from the estimator's *type*:

    * tree models supported by ``shap.TreeExplainer`` (random forest, gradient
      boosting, decision tree, XGBoost) use ``TreeExplainer``;
    * ``LogisticRegression`` uses ``LinearExplainer``;
    * everything else (SVM, KNN, AdaBoost, voting ensembles, ...) falls back to
      the model-agnostic ``KernelExplainer`` on ``predict_proba``, limited to a
      100-row background sample and the first 100 test rows.

    Dispatching on ``hasattr(model, "estimators_")`` or on the display name is
    avoided: ``estimators_`` is also present on AdaBoost and voting ensembles,
    which ``TreeExplainer`` does not support, and absent on XGBoost and
    single decision trees; a display name may not match the estimator.

    Parameters
    ----------
    model : estimator with ``fit`` (and ``predict_proba`` for the kernel path)
    model_name : str
        Label printed before the plots.
    X_train, X_test : array-like
        Training and test features; ``X_test`` is sliced positionally.
    feature_names : list of str
        Names shown on the plot axes.

    Notes
    -----
    The model is (re)fit on ``X_train`` together with ``y_train`` taken from the
    notebook's global namespace. Returns ``None``; the plots are shown directly.
    """
    # Local imports keep the function usable even if the modelling cell that
    # imports these classes has not been run in this kernel session.
    import xgboost as xgb
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier

    print(f"\nSHAP for {model_name}")

    # Fit model (y_train is read from the notebook globals)
    model.fit(X_train, y_train)

    tree_model_types = (
        RandomForestClassifier,
        GradientBoostingClassifier,
        DecisionTreeClassifier,
        xgb.XGBClassifier,
    )

    # ---- Choose explainer from the estimator type ----
    if isinstance(model, tree_model_types):
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)

    elif isinstance(model, LogisticRegression):
        explainer = shap.LinearExplainer(model, X_train)
        shap_values = explainer.shap_values(X_test)

    else:
        # Model-agnostic fallback; kept small because KernelExplainer is slow.
        background = shap.sample(X_train, 100)
        explainer = shap.KernelExplainer(model.predict_proba, background)
        shap_values = explainer.shap_values(X_test[:100])

    # ---- Reduce to the positive class ----
    # Older SHAP returns a list with one array per class; newer SHAP returns a
    # single array shaped (samples, features, classes). Single-output explainers
    # already return (samples, features).
    if isinstance(shap_values, list):
        shap_vals = shap_values[1]
    else:
        shap_vals = np.asarray(shap_values)
        if shap_vals.ndim == 3:
            shap_vals = shap_vals[:, :, 1]

    # Rows of X_test that the SHAP values actually cover (100 on the kernel path).
    explained_rows = X_test[:shap_vals.shape[0]]

    # ---- Summary Plot ----
    shap.summary_plot(
        shap_vals,
        explained_rows,
        feature_names=feature_names,
        show=True
    )

    # ---- Bar Plot (Global Importance) ----
    shap.summary_plot(
        shap_vals,
        explained_rows,
        feature_names=feature_names,
        plot_type="bar",
        show=True
    )


In [None]:
# Feature labels are taken from the DataFrame X.
feature_names = list(X.columns)

# Explain every candidate model in turn.
for model, name in models:
    shap_feature_importance(model, name, X_train, X_test, feature_names)


SHAP for Support Vector Machine


  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:

# Take the fitted 'lr' member of the voting ensemble and explain it on its own.
# NOTE(review): shap_feature_importance calls model.fit(...) on whatever it is
# given, so this refits the estimator returned here — presumably the same object
# the ensemble holds; confirm that mutating it is acceptable.
lr_model = voting_clf.named_estimators_['lr']

shap_feature_importance(
    lr_model,
    "VotingClassifier (Logistic Regression component)",
    X_train,
    X_test,
    feature_names
)


### **MODEL EVALUATION VISUALIZATION**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    roc_curve,
    auc
)

# Use seaborn's "whitegrid" style for the plots that follow.
sns.set(style="whitegrid")


In [None]:
def visualize_train_test(model, model_name, X_train, y_train, X_test, y_test):
    """Fit ``model`` and show train/test confusion matrices beside both ROC curves.

    The model is fit on ``(X_train, y_train)``. For each split, column 1 of
    ``predict_proba`` is thresholded at 0.5 to obtain labels. A three-panel
    figure is displayed: train confusion matrix, test confusion matrix, and the
    two ROC curves with their AUC in the legend; the figure title carries both
    accuracies as percentages. Returns ``None``.
    """
    model.fit(X_train, y_train)

    def _evaluate(features, labels):
        # Class-1 probability -> hard labels at the default 0.5 cut-off -> metrics.
        prob = model.predict_proba(features)[:, 1]
        pred = (prob >= 0.5).astype(int)
        fpr, tpr, _ = roc_curve(labels, prob)
        return {
            "acc": accuracy_score(labels, pred),
            "cm": confusion_matrix(labels, pred),
            "fpr": fpr,
            "tpr": tpr,
            "area": auc(fpr, tpr),
        }

    train = _evaluate(X_train, y_train)
    test = _evaluate(X_test, y_test)
    train_acc, test_acc = train["acc"], test["acc"]
    auc_tr, auc_te = train["area"], test["area"]

    # ---------------- PLOTS ----------------
    fig, (ax_cm_train, ax_cm_test, ax_roc) = plt.subplots(1, 3, figsize=(18, 5))

    # Left panel: training confusion matrix
    sns.heatmap(train["cm"], annot=True, fmt="d", cmap="Blues", ax=ax_cm_train)
    ax_cm_train.set_title(f"{model_name} - Train CM")

    # Middle panel: test confusion matrix
    sns.heatmap(test["cm"], annot=True, fmt="d", cmap="Greens", ax=ax_cm_test)
    ax_cm_test.set_title(f"{model_name} - Test CM")

    # Right panel: both ROC curves plus the chance diagonal
    ax_roc.plot(train["fpr"], train["tpr"], label=f"Train AUC = {auc_tr:.2f}")
    ax_roc.plot(test["fpr"], test["tpr"], label=f"Test AUC = {auc_te:.2f}")
    ax_roc.plot([0, 1], [0, 1], linestyle="--", color="gray")
    ax_roc.set_title(f"{model_name} - ROC Curve")
    ax_roc.set_xlabel("False Positive Rate")
    ax_roc.set_ylabel("True Positive Rate")
    ax_roc.legend()

    plt.suptitle(
        f"{model_name} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%",
        fontsize=14
    )
    plt.show()


In [None]:
# One three-panel train/test figure per candidate model.
for model, name in models:
    visualize_train_test(
        model=model,
        model_name=name,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
    )

In [None]:
# Same train/test figure for the voting ensemble.
visualize_train_test(
    model=voting_clf,
    model_name="Voting Classifier",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)


In [None]:
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Train vs. test accuracy for every candidate model.
train_accs = []
test_accs = []
model_names = []

for model, name in models:
    model.fit(X_train, y_train)
    train_accs.append(accuracy_score(y_train, model.predict(X_train)))
    test_accs.append(accuracy_score(y_test, model.predict(X_test)))
    model_names.append(name)

# Grouped bars: the two series sit side by side around each tick. Drawing both
# at the same x position (as before) makes them overlap and hard to compare.
positions = np.arange(len(model_names))
bar_width = 0.4

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(positions - bar_width / 2, train_accs, width=bar_width, label="Train Accuracy")
ax.bar(positions + bar_width / 2, test_accs, width=bar_width, label="Test Accuracy")
ax.set_xticks(positions)
ax.set_xticklabels(model_names, rotation=45, ha="right")
ax.set_ylabel("Accuracy")
ax.set_title("Train vs Test Accuracy Comparison")
ax.legend()
fig.tight_layout()
plt.show()