In [3]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

data_path = "/content/drive/MyDrive/EduPredict_Project/data"
df = pd.read_csv(f"{data_path}/academic_cleaned.csv")
df.head()


Mounted at /content/drive


Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target_Enrolled,Target_Graduate
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0.0,0,10.8,1.4,1.74,False,False
1,1,15,1,9254,1,1,160.0,1,1,3,...,6,6,6,13.666667,0,13.9,-0.3,0.79,False,True
2,1,1,5,9070,1,1,122.0,1,37,37,...,6,0,0,0.0,0,10.8,1.4,1.74,False,False
3,1,17,2,9773,1,1,122.0,1,38,37,...,6,10,5,12.4,0,9.4,-0.8,-3.12,False,True
4,2,39,1,8014,0,1,100.0,1,37,38,...,6,6,6,13.0,0,13.9,-0.3,0.79,False,True


In [4]:
df.columns.to_list()

['Marital Status',
 'Application mode',
 'Application order',
 'Course',
 'Daytime/evening attendance',
 'Previous qualification',
 'Previous qualification (grade)',
 'Nacionality',
 "Mother's qualification",
 "Father's qualification",
 "Mother's occupation",
 "Father's occupation",
 'Admission grade',
 'Displaced',
 'Educational special needs',
 'Debtor',
 'Tuition fees up to date',
 'Gender',
 'Scholarship holder',
 'Age at enrollment',
 'International',
 'Curricular units 1st sem (credited)',
 'Curricular units 1st sem (enrolled)',
 'Curricular units 1st sem (evaluations)',
 'Curricular units 1st sem (approved)',
 'Curricular units 1st sem (grade)',
 'Curricular units 1st sem (without evaluations)',
 'Curricular units 2nd sem (credited)',
 'Curricular units 2nd sem (enrolled)',
 'Curricular units 2nd sem (evaluations)',
 'Curricular units 2nd sem (approved)',
 'Curricular units 2nd sem (grade)',
 'Curricular units 2nd sem (without evaluations)',
 'Unemployment rate',
 'Inflation rat

In [5]:
def get_label(row):
    if row['Target_Graduate'] == 1:
        return 'Graduate'
    elif row['Target_Enrolled'] == 1:
        return 'Enrolled'
    else:
        return 'Dropout'

df['Target'] = df.apply(get_label, axis=1)


In [6]:
df.drop(['Target_Graduate', 'Target_Enrolled'], axis=1, inplace=True)


In [7]:
X = df.drop('Target', axis=1)
y = df['Target']


In [8]:
df["Target"].value_counts()


Unnamed: 0_level_0,count
Target,Unnamed: 1_level_1
Graduate,2209
Dropout,1421
Enrolled,794


In [9]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


In [10]:
print(label_encoder.classes_)


['Dropout' 'Enrolled' 'Graduate']


In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

log_model = LogisticRegression(max_iter=5000)
rf_model = RandomForestClassifier()
xgb_model = XGBClassifier(eval_metric='mlogloss')

log_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)


In [13]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

models = {
    "Logistic Regression": log_model,
    "Random Forest": rf_model,
    "XGBoost": xgb_model
}

for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"\nModel: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))



Model: Logistic Regression
Accuracy: 0.768361581920904
F1 Score: 0.7531276658422533
Confusion Matrix:
 [[218  29  37]
 [ 43  53  63]
 [ 14  19 409]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.77      0.78       284
           1       0.52      0.33      0.41       159
           2       0.80      0.93      0.86       442

    accuracy                           0.77       885
   macro avg       0.71      0.68      0.68       885
weighted avg       0.75      0.77      0.75       885


Model: Random Forest
Accuracy: 0.7728813559322034
F1 Score: 0.7602881482190712
Confusion Matrix:
 [[214  25  45]
 [ 36  60  63]
 [  9  23 410]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.75      0.79       284
           1       0.56      0.38      0.45       159
           2       0.79      0.93      0.85       442

    accuracy                           0.77       885
   m

In [14]:
import joblib

model_path = "/content/drive/MyDrive/EduPredict_Project/models"
joblib.dump(rf_model, f"{model_path}/rf_model.pkl")


['/content/drive/MyDrive/EduPredict_Project/models/rf_model.pkl']

In [15]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

results = []

for name, model in models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    results.append({'Model': name, 'Accuracy': acc, 'F1 Score': f1})

results_df = pd.DataFrame(results)
results_df.to_csv("/content/drive/MyDrive/EduPredict_Project/reports/model_comparison.csv", index=False)


In [17]:
# Anomaly Detection Model
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import joblib

anom_model = IsolationForest(contamination=0.05, random_state=42)
anom_model.fit(X)

joblib.dump(anom_model, f"{model_path}/anomaly_model.pkl")


['/content/drive/MyDrive/EduPredict_Project/models/anomaly_model.pkl']

In [18]:
# Academic Trend Prediction (Grade Progression)
from sklearn.linear_model import LinearRegression

df_trend = df[df["Curricular units 1st sem (grade)"].notnull() & df["Curricular units 2nd sem (grade)"].notnull()]
X_trend = df_trend[["Curricular units 1st sem (grade)"]]
y_trend = df_trend["Curricular units 2nd sem (grade)"]

trend_model = LinearRegression()
trend_model.fit(X_trend, y_trend)

joblib.dump(trend_model, f"{model_path}/trend_model.pkl")


['/content/drive/MyDrive/EduPredict_Project/models/trend_model.pkl']