# Student Academic Performance Prediction

This notebook implements the methodology described in the paper 'Beyond Performance: Explaining and Ensuring Fairness in Student Academic Performance Prediction with Machine Learning' (Appl. Sci. 2020, 10). It uses the UCI Student Performance dataset to predict whether a student passes the final grade, i.e. G3 binarized as pass (G3 >= 10) or fail (G3 < 10). The models are Logistic Regression, Random Forest, and XGBoost; SMOTE is applied to the training split to handle class imbalance, fairness is analysed with AIF360, and predictions are explained with SHAP and LIME.

## Dependencies
- Python 3.10
- pandas (1.5.3)
- numpy (1.25.2)
- scikit-learn (1.2.2)
- xgboost (1.5.0)
- imbalanced-learn (0.10.1)
- aif360 (0.5.0)
- shap (0.41.0)
- lime (0.2.0.1)

## Dataset
- UCI Student Performance Dataset: https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student-por.csv


In [None]:
# Install required libraries
!pip install pandas==1.5.3 numpy==1.25.2 scikit-learn==1.2.2 xgboost==1.5.0 imbalanced-learn==0.10.1 aif360==0.5.0 shap==0.41.0 lime==0.2.0.1

In [None]:
# Import libraries
import warnings

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from aif360.datasets import BinaryLabelDataset
from aif360.algorithms.preprocessing import Reweighing
from aif360.metrics import ClassificationMetric
import shap
import lime
import lime.lime_tabular

warnings.filterwarnings('ignore')

In [None]:
# Load the UCI Student Performance dataset
# NOTE(review): confirm this direct CSV URL still resolves; the UCI archive may only
# serve this dataset inside a zip bundle -- TODO verify before relying on Run All.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student-por.csv"
data = pd.read_csv(url, sep=';')

# Preprocess the data
# Convert the target variable G3 to binary: pass (>=10) -> 1, fail (<10) -> 0.
# Vectorized comparison instead of a per-row lambda; the result is still an int64 0/1 column.
data['G3'] = (data['G3'] >= 10).astype('int64')

# Define features and target
features = ['G1', 'G2', 'absences', 'failures', 'Medu', 'Fedu', 'sex', 'school', 'Pstatus', 'famsize']
y = data['G3']

# Encode categorical variables (drop_first leaves one indicator column per binary category)
X = pd.get_dummies(data[features], columns=['sex', 'school', 'Pstatus', 'famsize'], drop_first=True)

# Split the data.
# stratify=y keeps the pass/fail ratio the same in the train and test splits; the target is
# imbalanced (hence SMOTE below), so an unstratified 20% hold-out can end up with a
# distorted share of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE to handle class imbalance (training split only; the test split stays untouched)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [None]:
# Model Training: Logistic Regression, Random Forest, XGBoost
#
# Hyperparameters are tuned with SMOTE placed *inside* the cross-validation pipeline.
# Running GridSearchCV on data that was already oversampled lets synthetic neighbours of
# the validation rows leak into the training folds, which inflates the CV F1 score.
# With the imblearn pipeline, SMOTE is re-fitted on the training part of every fold only.

# Hyperparameter grids for Random Forest and XGBoost
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None]
}
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1]
}


def tune_with_smote(estimator, param_grid):
    """Grid-search `estimator` with SMOTE applied per CV fold.

    Parameters
    ----------
    estimator : classifier to tune (placed as the 'clf' step of the pipeline).
    param_grid : dict of estimator parameter names -> candidate values
        (un-prefixed; the 'clf__' prefix is added here).

    Returns
    -------
    The fitted GridSearchCV object. Its `best_estimator_` is the pipeline refitted
    on the full (SMOTE-resampled) training split.
    """
    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('clf', estimator),
    ])
    prefixed_grid = {f'clf__{key}': values for key, values in param_grid.items()}
    search = GridSearchCV(pipeline, prefixed_grid, cv=5, scoring='f1')
    # Fit on the raw training split; oversampling happens inside the pipeline.
    return search.fit(X_train, y_train)


grid_rf = tune_with_smote(RandomForestClassifier(random_state=42), param_grid_rf)
grid_xgb = tune_with_smote(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), param_grid_xgb)

# Train models
# The tuned entries are the bare classifiers taken out of the refitted pipelines, i.e.
# classifiers fitted on SMOTE(random_state=42)-resampled training data, so they can be
# used directly with `.predict(X_test)` like the logistic regression.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000).fit(X_train_smote, y_train_smote),
    'Random Forest': grid_rf.best_estimator_.named_steps['clf'],
    'XGBoost': grid_xgb.best_estimator_.named_steps['clf'],
}

In [None]:
# Evaluate models
# Score every fitted classifier on the held-out test split.
results = {}
for model_name, fitted_model in models.items():
    y_pred = fitted_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results[model_name] = {'Accuracy': accuracy, 'F1 Score': f1}

# Print model performance
for model_name, scores in results.items():
    print(f"{model_name} - Accuracy: {scores['Accuracy']:.3f}, F1 Score: {scores['F1 Score']:.3f}")

# Save results to CSV (one row per model, one column per metric)
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.to_csv('model_performance.csv')

In [None]:
# Fairness Analysis with AIF360
# Define protected attributes (e.g., sex_M)
privileged_groups = [{'sex_M': 1}]
unprivileged_groups = [{'sex_M': 0}]


def to_aif360(features_df, labels):
    """Wrap a feature frame and its G3 labels as an AIF360 BinaryLabelDataset.

    Everything is cast to float so the dataset holds one homogeneous numeric array
    regardless of the dtype pandas chose for the indicator columns.
    """
    frame = pd.concat([features_df, labels], axis=1).astype(float)
    return BinaryLabelDataset(df=frame,
                              label_names=['G3'],
                              protected_attribute_names=['sex_M'])


def fairness_metric(y_pred):
    """Build a ClassificationMetric comparing test labels with predictions `y_pred`.

    ClassificationMetric expects the ground-truth dataset and a copy of it whose
    labels have been replaced by the classifier's predictions.
    """
    dataset_pred = dataset_orig.copy(deepcopy=True)
    dataset_pred.labels = np.asarray(y_pred, dtype=float).reshape(-1, 1)
    return ClassificationMetric(dataset_orig, dataset_pred,
                                unprivileged_groups=unprivileged_groups,
                                privileged_groups=privileged_groups)


# Ground-truth test set
dataset_orig = to_aif360(X_test, y_test)

# Apply Reweighing for fairness
# Reweighing is a pre-processing step: it is fitted on the *training* split and yields
# per-instance weights, which are then used to train a reweighed logistic regression.
rw = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
dataset_transf = rw.fit_transform(to_aif360(X_train, y_train))
reweighed_lr = LogisticRegression(max_iter=1000).fit(
    X_train, y_train, sample_weight=dataset_transf.instance_weights
)

# Evaluate fairness metrics for every trained model plus the reweighed baseline
fairness_models = {**models, 'Logistic Regression (reweighed)': reweighed_lr}
for name, model in fairness_models.items():
    metric = fairness_metric(model.predict(X_test))
    print(f"{name} - "
          f"Demographic Parity Difference: {metric.statistical_parity_difference():.3f}, "
          f"Equal Opportunity Difference: {metric.equal_opportunity_difference():.3f}, "
          f"Average Odds Difference: {metric.average_odds_difference():.3f}")

In [None]:
# Explainability with SHAP
# Explain the logistic regression: the SMOTE-resampled training data is the background
# set, and feature attributions are computed for the test rows.
log_reg = models['Logistic Regression']
explainer = shap.LinearExplainer(log_reg, X_train_smote)
shap_values = explainer.shap_values(X_test)

# Bar chart of the per-feature SHAP summary
shap.summary_plot(shap_values, features=X_test, plot_type="bar")

In [None]:
# Explainability with LIME
# LIME explains a prediction by sampling random perturbations around the instance, so the
# explainer is seeded; without random_state the explanation changes on every run.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train_smote.to_numpy(dtype=float),
    feature_names=list(X_train_smote.columns),
    class_names=['Fail', 'Pass'],
    mode='classification',
    random_state=42
)

# Explain the first test row using the logistic regression's class probabilities
lime_exp = lime_explainer.explain_instance(
    X_test.iloc[0].to_numpy(dtype=float),
    models['Logistic Regression'].predict_proba
)
lime_exp.show_in_notebook()