In [22]:
# import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import joblib
from sklearn.preprocessing import LabelEncoder
import numpy as np  

In [23]:
# Read the cleaned student-performance CSV and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable project root so the notebook runs on other machines.
cleaned_csv = r"C:\Users\kumar\Academic_Decision_Support_System\data\processed\student_performance_cleaned.csv"
df = pd.read_csv(cleaned_csv)
df.head()

Unnamed: 0,study_hours_per_day,attendance_percentage,absences,sleep_hours,stress_level,motivation_level,family_support,school_support,activity_level,avg_grade,pass_fail
0,2.6,71,9,4.5,1,2,2,2,1,35.5,0
1,5.7,75,17,5.0,1,2,1,2,1,51.0,0
2,4.5,83,7,8.6,1,1,0,2,1,65.5,1
3,3.8,78,20,6.1,1,0,2,1,2,67.0,1
4,1.4,67,10,6.0,0,1,1,0,2,61.0,0


In [24]:
# Pull out the label column; every other column is used as a feature.
# NOTE(review): avg_grade remains among the features. If pass_fail was derived
# from it, this is target leakage — confirm how the label was built.
target_col = "pass_fail"
y = df[target_col]
X = df.drop(columns=[target_col])


In [25]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified — consider stratify=y if the
# class ratio should be preserved in both partitions.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [26]:
# Configure the XGBoost classifier, then fit it on the training partition.
xgb_params = {
    "n_estimators": 300,       # number of boosting rounds
    "learning_rate": 0.05,     # shrinkage applied to each round
    "max_depth": 4,            # maximum depth of each tree
    "subsample": 0.9,          # fraction of rows sampled per tree
    "colsample_bytree": 0.9,   # fraction of columns sampled per tree
    "random_state": 42,        # seed for the sampling above
}
model = XGBClassifier(**xgb_params)

# Kept as the final expression so the cell still displays the fitted estimator.
model.fit(X_train, y_train)


In [27]:
# Predict labels for the held-out rows, then show a short preview of them.
preview_count = 10
y_pred = model.predict(X_test)
y_pred[:preview_count]

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 0])

In [28]:
# Share of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.96


In [29]:
# Per-class precision, recall, F1 and support on the held-out set.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.98      0.96      0.97       120
           1       0.94      0.97      0.96        80

    accuracy                           0.96       200
   macro avg       0.96      0.97      0.96       200
weighted avg       0.97      0.96      0.97       200



In [30]:
# Confusion matrix on the held-out set (rows: actual class, columns: predicted class).
test_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(test_confusion)

[[115   5]
 [  2  78]]


In [31]:
# Label the fitted model's importance scores with the feature column names
# and list them from most to least important.
importance_scores = model.feature_importances_
importance = pd.Series(data=importance_scores, index=X.columns)
importance.sort_values(ascending=False)

avg_grade                0.396998
study_hours_per_day      0.198744
attendance_percentage    0.140525
motivation_level         0.057302
absences                 0.053269
school_support           0.033504
stress_level             0.033138
family_support           0.031121
sleep_hours              0.030339
activity_level           0.025059
dtype: float32

In [32]:
# Compare accuracy on the training and held-out partitions; a large gap
# between the two points to overfitting, and low scores on both to underfitting.
train_acc, test_acc = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)

print("Training Accuracy:", train_acc)
print("Testing Accuracy:", test_acc)


Training Accuracy: 1.0
Testing Accuracy: 0.965


In [33]:
# Draw the held-out confusion matrix as an annotated heatmap and save it as a PNG.
# The original figure had no class tick labels and no cell counts, so the saved
# image could not be read on its own; both are added here.
class_labels = model.classes_
# Pin the row/column order to the model's classes so the tick labels always
# line up with the matrix, even if a class is absent from y_test.
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(5, 4))
heatmap = ax.imshow(cm)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# One tick per class, labelled with the class value.
tick_positions = np.arange(len(class_labels))
ax.set_xticks(tick_positions)
ax.set_xticklabels(class_labels)
ax.set_yticks(tick_positions)
ax.set_yticklabels(class_labels)

# Write the count in every cell; pick a text colour that contrasts with the
# default colormap (dark for low counts, bright for high counts).
midpoint = cm.max() / 2
for (row, col), count in np.ndenumerate(cm):
    ax.text(
        col,
        row,
        str(count),
        ha="center",
        va="center",
        color="white" if count < midpoint else "black",
    )

fig.colorbar(heatmap, ax=ax)
# NOTE(review): absolute, machine-specific path; the reports folder must already exist.
fig.savefig(
    r"C:\Users\kumar\Academic_Decision_Support_System\reports\confusion_matrix.png",
    bbox_inches="tight",
)
# Close this specific figure so it is not rendered inline or kept in memory.
plt.close(fig)


In [34]:
# Persist the trained model to disk (only the model is saved; no encoders are written here).
# NOTE(review): absolute, machine-specific path; the models folder must already exist.
model_file = r"C:\Users\kumar\Academic_Decision_Support_System\models\pass_fail_model.pkl"
joblib.dump(model, model_file)
print("Model saved successfully.")



Model saved successfully.


In [35]:
# Reload the saved model from disk (no encoder is loaded here).
# joblib.load unpickles the file, so only load files from a trusted source.
saved_model_file = r"C:\Users\kumar\Academic_Decision_Support_System\models\pass_fail_model.pkl"
loaded_model = joblib.load(saved_model_file)


In [36]:
# Sanity check: show the class labels the reloaded model knows about.
known_classes = loaded_model.classes_
print(known_classes)

[0 1]


In [37]:
# Score one hand-written student record with the reloaded model.
# NOTE(review): motivation_level=5 looks outside the 0-2 values visible in the
# data preview — confirm the encoding before trusting this example.

new_student = {
    "study_hours_per_day": 7,
    "attendance_percentage": 95,
    "absences": 2,
    "sleep_hours": 7,
    "stress_level": 1,
    "motivation_level": 5,
    "family_support": 1,
    "school_support": 1,
    "activity_level": 1,
    "avg_grade": 42
}

# A single-row frame whose column names match the feature names used in training.
input_df = pd.DataFrame(data=[new_student])

prediction = loaded_model.predict(input_df)
probability = loaded_model.predict_proba(input_df)

# Probability columns follow loaded_model.classes_: index 0 is class 0 (fail),
# index 1 is class 1 (pass).
fail_prob, pass_prob = probability[0]
verdict = "Pass" if prediction[0] == 1 else "Fail"

print("Prediction:", verdict)
print(f"Pass Probability: {float(pass_prob):.2f}")
print(f"Fail Probability: {float(fail_prob):.2f}")

Prediction: Fail
Pass Probability: 0.06
Fail Probability: 0.94
