In [4]:
pip install pandas numpy scikit-learn matplotlib seaborn

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd

LOAD AND PREPARE DATASET

In [6]:
# Read the synthetic student records from the project-relative CSV file.
df = pd.read_csv(filepath_or_buffer="dataset_info/synthetic_student_dataset_20000.csv")
# Show the first five rows as a quick check of the parsed columns.
df.head(n=5)

Unnamed: 0,student_id,level,attendance_rate,num_quizzes,quiz_avg,assignment_avg,mid_sem_score,forum_participation,study_time_hours,dashboard_time_hours,current_gpa,predicted_gpa,target_gpa,final_gpa,risk_category,recommended_action
0,S00001,200,76.96,3,54.25,76.63,63.05,3,16.93,14.44,3.17,2.13,2.19,2.19,Average,"Maintain steady progress, focus on weak areas"
1,S00002,400,90.86,7,64.27,62.99,49.22,5,8.63,9.54,2.91,2.26,2.47,2.18,Average,"Maintain steady progress, focus on weak areas"
2,S00003,300,98.06,5,59.85,74.18,55.22,4,6.43,15.79,2.52,2.65,2.77,2.64,Average,"Maintain steady progress, focus on weak areas"
3,S00004,300,72.73,3,91.02,57.9,76.99,0,28.38,9.0,3.08,2.44,2.42,2.37,Average,"Maintain steady progress, focus on weak areas"
4,S00005,100,100.0,7,65.45,76.74,77.0,6,6.66,3.61,3.43,2.67,2.95,2.73,Average,"Maintain steady progress, focus on weak areas"


In [7]:
# Names of the columns used as model inputs, in the order they are selected.
features = [
    "attendance_rate",
    "num_quizzes",
    "quiz_avg",
    "assignment_avg",
    "mid_sem_score",
    "forum_participation",
    "study_time_hours",
    "dashboard_time_hours",
    "current_gpa",
]

In [8]:
# Model inputs: only the columns listed in `features`, in that order.
X = df.loc[:, features]

# Target for the regression model (final GPA column).
y_reg = df.loc[:, "final_gpa"]

# Target for the classification model (risk category label column).
y_clf = df.loc[:, "risk_category"]

SPLIT INTO TRAIN/TEST

In [9]:
# Split data into train/test.
from sklearn.model_selection import train_test_split

# X and both targets are split in a single call, so all six outputs are cut by
# the same shuffled row indices by construction (80% train / 20% test, fixed
# seed). Doing it in one call also avoids binding `_`, which IPython uses for
# the most recent cell output.
(
    X_train, X_test,
    y_train_reg, y_test_reg,
    y_train_clf, y_test_clf,
) = train_test_split(X, y_reg, y_clf, test_size=0.2, random_state=42)


TRAIN THE REGRESSION MODEL

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [11]:
# Fit a 150-tree random forest regressor on the training split; random_state
# is fixed so repeated runs use the same seed.
reg_model = RandomForestRegressor(n_estimators=150, random_state=42).fit(
    X_train, y_train_reg
)

# Score the regressor on the held-out split.
y_pred_reg = reg_model.predict(X_test)
mae_reg = mean_absolute_error(y_test_reg, y_pred_reg)
r2_reg = r2_score(y_test_reg, y_pred_reg)
print("MAE:", mae_reg)
print("R² Score:", r2_reg)

MAE: 0.16777865000000003
R² Score: 0.4719786693265605


TRAIN THE CLASSIFICATION MODEL

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Fit a 150-tree random forest classifier on the same training rows, using the
# risk-category labels as the target.
clf_model = RandomForestClassifier(n_estimators=150, random_state=42).fit(
    X_train, y_train_clf
)

# Score the classifier on the held-out split: overall accuracy first, then the
# per-class text report.
y_pred_clf = clf_model.predict(X_test)
accuracy_clf = accuracy_score(y_test_clf, y_pred_clf)
report_clf = classification_report(y_test_clf, y_pred_clf)
print("Accuracy:", accuracy_clf)
print(report_clf)

SAVE MODELS FOR DEPLOYMENT

In [None]:
import joblib

# Write both fitted estimators to files in the current working directory.
# The second dump stays the last expression, so its return value is what the
# cell displays.
joblib.dump(value=reg_model, filename="gpa_regressor.pkl")
joblib.dump(value=clf_model, filename="risk_classifier.pkl")


TEST WITH NEW INPUTS

In [None]:
# Take the first held-out row; indexing with a list keeps it a one-row
# DataFrame rather than a Series.
sample = X_test.iloc[[0]]

# Each predict call on the one-row frame yields a single prediction; unpack it.
(pred_gpa,) = reg_model.predict(sample)
(pred_risk,) = clf_model.predict(sample)

print("Predicted GPA:", round(pred_gpa, 2))
print("Predicted Risk Category:", pred_risk)
