In [None]:
import pandas as pd
from google.colab import drive
# Attach Google Drive to the Colab runtime; the project files are read from it.
drive.mount('/content/drive')

# Drive folder that holds the project's CSV inputs.
data_path = "/content/drive/MyDrive/EduPredict_Project/data"

# Read the cleaned academic dataset into a DataFrame.
cleaned_csv = f"{data_path}/academic_cleaned.csv"
df = pd.read_csv(cleaned_csv)

# Show the first rows as the cell output.
df.head()


Mounted at /content/drive


Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target_Enrolled,Target_Graduate
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0.0,0,10.8,1.4,1.74,False,False
1,1,15,1,9254,1,1,160.0,1,1,3,...,6,6,6,13.666667,0,13.9,-0.3,0.79,False,True
2,1,1,5,9070,1,1,122.0,1,37,37,...,6,0,0,0.0,0,10.8,1.4,1.74,False,False
3,1,17,2,9773,1,1,122.0,1,38,37,...,6,10,5,12.4,0,9.4,-0.8,-3.12,False,True
4,2,39,1,8014,0,1,100.0,1,37,38,...,6,6,6,13.0,0,13.9,-0.3,0.79,False,True


In [None]:
# Display every column name as a plain Python list.
list(df.columns)

['Marital Status',
 'Application mode',
 'Application order',
 'Course',
 'Daytime/evening attendance',
 'Previous qualification',
 'Previous qualification (grade)',
 'Nacionality',
 "Mother's qualification",
 "Father's qualification",
 "Mother's occupation",
 "Father's occupation",
 'Admission grade',
 'Displaced',
 'Educational special needs',
 'Debtor',
 'Tuition fees up to date',
 'Gender',
 'Scholarship holder',
 'Age at enrollment',
 'International',
 'Curricular units 1st sem (credited)',
 'Curricular units 1st sem (enrolled)',
 'Curricular units 1st sem (evaluations)',
 'Curricular units 1st sem (approved)',
 'Curricular units 1st sem (grade)',
 'Curricular units 1st sem (without evaluations)',
 'Curricular units 2nd sem (credited)',
 'Curricular units 2nd sem (enrolled)',
 'Curricular units 2nd sem (evaluations)',
 'Curricular units 2nd sem (approved)',
 'Curricular units 2nd sem (grade)',
 'Curricular units 2nd sem (without evaluations)',
 'Unemployment rate',
 'Inflation rat

In [None]:
def get_label(row):
    """Return the outcome label encoded by a row's one-hot target flags.

    'Target_Graduate' is checked first, then 'Target_Enrolled'; a row with
    neither flag equal to 1 is labelled 'Dropout'.
    """
    if row['Target_Graduate'] == 1:
        return 'Graduate'
    if row['Target_Enrolled'] == 1:
        return 'Enrolled'
    return 'Dropout'

# Apply get_label to every row (axis=1) and store the result as the 'Target' column.
target_labels = df.apply(get_label, axis=1)
df['Target'] = target_labels


In [None]:
# Remove the two one-hot flag columns from df in place.
df.drop(columns=['Target_Graduate', 'Target_Enrolled'], inplace=True)


In [None]:
# Label vector is the 'Target' column; predictors are every other column.
y = df['Target']
X = df.drop(columns=['Target'])


In [None]:
# Count how many rows fall into each 'Target' class.
df.loc[:, "Target"].value_counts()


Unnamed: 0_level_0,count
Target,Unnamed: 1_level_1
Graduate,2209
Dropout,1421
Enrolled,794


In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the string-label -> integer mapping from y, then encode y with it.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)


In [None]:
# Print the class names learned by the encoder; each name's position in this
# array is the integer code it was given in y_encoded.
print(label_encoder.classes_)


['Dropout' 'Enrolled' 'Graduate']


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Split FIRST, then fit the scaler on the training rows only.
# Fitting StandardScaler on the whole dataset lets the test rows influence the
# mean/std used for training, which leaks information and makes the held-out
# scores optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from train only
X_test = scaler.transform(X_test_raw)        # test rows reuse the train statistics

# Kept so that any later cell expecting the fully scaled matrix still finds it;
# it is produced with the train-fitted scaler.
X_scaled = scaler.transform(X)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Baseline classifiers with library defaults, except for the settings noted below.
# max_iter is raised so the logistic-regression solver has room to converge.
log_model = LogisticRegression(max_iter=5000)
# Random forests draw bootstrap samples and feature subsets at random; without a
# seed the scores change on every run (the two recorded runs in this notebook
# report different Random Forest accuracies). Seed it for reproducibility.
rf_model = RandomForestClassifier(random_state=42)
# Seeded as well so the run does not depend on the library's default seed.
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)

log_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Display name -> fitted estimator, in the order they are reported.
models = {
    "Logistic Regression": log_model,
    "Random Forest": rf_model,
    "XGBoost": xgb_model,
}

# Score each model on the held-out split and print its metrics.
for name in models:
    model = models[name]
    y_pred = model.predict(X_test)

    print(f"\nModel: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    # Weighted F1 averages per-class F1 by class support.
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))



Model: Logistic Regression
Accuracy: 0.768361581920904
F1 Score: 0.7531276658422533
Confusion Matrix:
 [[218  29  37]
 [ 43  53  63]
 [ 14  19 409]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.77      0.78       284
           1       0.52      0.33      0.41       159
           2       0.80      0.93      0.86       442

    accuracy                           0.77       885
   macro avg       0.71      0.68      0.68       885
weighted avg       0.75      0.77      0.75       885


Model: Random Forest
Accuracy: 0.7728813559322034
F1 Score: 0.7602881482190712
Confusion Matrix:
 [[214  25  45]
 [ 36  60  63]
 [  9  23 410]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.75      0.79       284
           1       0.56      0.38      0.45       159
           2       0.79      0.93      0.85       442

    accuracy                           0.77       885
   m

In [None]:
import joblib

# Drive folder where fitted estimators are stored.
model_path = "/content/drive/MyDrive/EduPredict_Project/models"

# Serialize the fitted random forest; the value returned by joblib.dump is the
# cell output.
rf_model_file = f"{model_path}/rf_model.pkl"
joblib.dump(rf_model, rf_model_file)


['/content/drive/MyDrive/EduPredict_Project/models/rf_model.pkl']

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

# One record per model: its name plus accuracy and weighted F1 on the test split.
results = []

for name, model in models.items():
    y_pred = model.predict(X_test)
    acc, f1 = (
        accuracy_score(y_test, y_pred),
        f1_score(y_test, y_pred, average='weighted'),
    )
    results.append({'Model': name, 'Accuracy': acc, 'F1 Score': f1})

# Tabulate the records and write them to the reports folder without the index.
results_df = pd.DataFrame(results)
results_df.to_csv(
    "/content/drive/MyDrive/EduPredict_Project/reports/model_comparison.csv",
    index=False,
)


In [None]:
# Anomaly Detection Model
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import joblib

# Isolation forest fitted on the unscaled feature frame X; contamination=0.05
# sets the expected share of outliers, and the seed keeps the fit repeatable.
anom_model = IsolationForest(contamination=0.05, random_state=42).fit(X)

# Persist the detector next to the other models.
anomaly_model_file = f"{model_path}/anomaly_model.pkl"
joblib.dump(anom_model, anomaly_model_file)


['/content/drive/MyDrive/EduPredict_Project/models/anomaly_model.pkl']

In [None]:
# Academic Trend Prediction (Grade Progression)
from sklearn.linear_model import LinearRegression

# Keep only rows where both semester grades are present.
df_trend = df.dropna(
    subset=["Curricular units 1st sem (grade)", "Curricular units 2nd sem (grade)"]
)

# Single predictor (1st-semester grade) -> target (2nd-semester grade).
y_trend = df_trend["Curricular units 2nd sem (grade)"]
X_trend = df_trend[["Curricular units 1st sem (grade)"]]

trend_model = LinearRegression().fit(X_trend, y_trend)

# Persist the regression; joblib.dump's return value is the cell output.
joblib.dump(trend_model, f"{model_path}/trend_model.pkl")


['/content/drive/MyDrive/EduPredict_Project/models/trend_model.pkl']

In [None]:
from google.colab import drive
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Mount Google Drive again so this cell does not rely on an earlier mount.
drive.mount('/content/drive')

# Drive folder that holds the project's CSV inputs.
data_path = "/content/drive/MyDrive/EduPredict_Project/data"

# Load the cleaned dataset from scratch.
cleaned_csv = f"{data_path}/academic_cleaned.csv"
df = pd.read_csv(cleaned_csv)

# Re-create Target column
def get_label(row):
    """Translate a row's one-hot target flags into a single outcome label.

    The flags are inspected in priority order ('Target_Graduate' before
    'Target_Enrolled'); if neither equals 1 the row is labelled 'Dropout'.
    """
    flag_to_label = (
        ('Target_Graduate', 'Graduate'),
        ('Target_Enrolled', 'Enrolled'),
    )
    for flag_column, label in flag_to_label:
        if row[flag_column] == 1:
            return label
    return 'Dropout'

# Build the single label column, then drop the one-hot flags it was derived from.
target_labels = df.apply(get_label, axis=1)
df['Target'] = target_labels
df.drop(columns=['Target_Graduate', 'Target_Enrolled'], inplace=True)

# Label vector is 'Target'; predictors are every remaining column.
y = df['Target']
X = df.drop(columns=['Target'])

# Re-encode y: string labels -> integer codes.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Re-split data BEFORE scaling. Fitting StandardScaler on the whole dataset
# lets the test rows influence the mean/std used for training, which leaks
# information and makes the held-out scores optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Re-scale X: learn the statistics from the training rows only and reuse them
# for the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any later cell expecting the fully scaled matrix still finds it;
# it is produced with the train-fitted scaler.
X_scaled = scaler.transform(X)

# Re-define and train the models.
# max_iter is raised so the logistic-regression solver has room to converge.
log_model = LogisticRegression(max_iter=5000)
# Random forests draw bootstrap samples and feature subsets at random; without a
# seed the scores change on every run (the two recorded runs in this notebook
# report different Random Forest accuracies). Seed it for reproducibility.
rf_model = RandomForestClassifier(random_state=42)
# Seeded as well so the run does not depend on the library's default seed.
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)

log_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

# Display name -> fitted estimator, in the order they are reported.
models = {
    "Logistic Regression": log_model,
    "Random Forest": rf_model,
    "XGBoost": xgb_model
}

# Score every model on the held-out split, print a report, and collect the
# headline metrics for the comparison table.
results = []

for name, model in models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    results.append({'Model': name, 'Accuracy': acc, 'F1 Score': f1})

    print(f"\nModel: {name}")
    report_lines = (
        ("Accuracy:", acc),
        ("F1 Score:", f1),
        ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
        ("Classification Report:\n", classification_report(y_test, y_pred)),
    )
    for caption, value in report_lines:
        print(caption, value)

results_df = pd.DataFrame(results)

print("\nAnalysis of Model Performance:")

# Look metrics up by model name instead of filtering the table for every value.
scores_by_model = results_df.set_index('Model')

# The confusion-matrix remarks are the same for every model.
confusion_notes = (
    "- 'Dropout' (0): Analysis based on the printed confusion matrix above.",
    "- 'Enrolled' (1): Analysis based on the printed confusion matrix above.",
    "- 'Graduate' (2): Analysis based on the printed confusion matrix above.",
)

# Model-specific remarks on the classification report, in reporting order.
report_notes_by_model = {
    'Logistic Regression': (
        "- 'Enrolled' class has the lowest precision, recall, and F1-score, indicating it's the most challenging to predict for this model.",
        "- 'Graduate' class has the highest recall and F1-score.",
    ),
    'Random Forest': (
        "- Similar to Logistic Regression, 'Enrolled' class has the lowest performance.",
        "- Random Forest has slightly better precision for 'Dropout' compared to Logistic Regression.",
    ),
    'XGBoost': (
        "- XGBoost also struggles most with the 'Enrolled' class, although it may have slightly better recall and F1-score for this class compared to the other models.",
        "- XGBoost has high precision for 'Dropout' and 'Graduate'.",
    ),
}

for model_name, report_notes in report_notes_by_model.items():
    print(f"\n--- {model_name} Analysis ---")
    print("Accuracy:", scores_by_model.loc[model_name, 'Accuracy'])
    print("F1 Score:", scores_by_model.loc[model_name, 'F1 Score'])

    print("Confusion Matrix Analysis:")
    for note in confusion_notes:
        print(note)
    print("Classification Report Analysis:")
    for note in report_notes:
        print(note)

print("\nOverall Summary:")
for summary_line in (
    "All models perform reasonably well overall, with accuracies around 76-77%.",
    "All models struggle significantly with predicting the 'Enrolled' class, exhibiting lower precision, recall, and F1-scores for this category.",
    "'Graduate' is the easiest class to predict for all models, with high recall and F1-scores.",
    "Random Forest and XGBoost generally show slightly better performance on the 'Enrolled' class compared to Logistic Regression.",
):
    print(summary_line)

Mounted at /content/drive

Model: Logistic Regression
Accuracy: 0.768361581920904
F1 Score: 0.7531276658422533
Confusion Matrix:
 [[218  29  37]
 [ 43  53  63]
 [ 14  19 409]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.77      0.78       284
           1       0.52      0.33      0.41       159
           2       0.80      0.93      0.86       442

    accuracy                           0.77       885
   macro avg       0.71      0.68      0.68       885
weighted avg       0.75      0.77      0.75       885


Model: Random Forest
Accuracy: 0.7717514124293785
F1 Score: 0.7588760003551247
Confusion Matrix:
 [[211  25  48]
 [ 36  61  62]
 [ 11  20 411]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.74      0.78       284
           1       0.58      0.38      0.46       159
           2       0.79      0.93      0.85       442

    accuracy                    

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values tried by the grid search for each classifier.

# Logistic regression: inverse regularization strength C, L2 penalty only.
log_param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    penalty=['l2'],
)

# Random forest: number of trees, depth limit (None = unlimited), split threshold.
rf_param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# XGBoost: number of boosting rounds, learning rate, tree depth.
xgb_param_grid = dict(
    n_estimators=[100, 200, 500],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
)

In [None]:
# 5-fold cross-validated grid search on the training split, selecting by
# weighted F1. GridSearchCV works on clones of the estimator it is given, so the
# already-fitted baseline models are not modified.
grid_search_log = GridSearchCV(log_model, log_param_grid, cv=5, scoring='f1_weighted')
grid_search_log.fit(X_train, y_train)
best_log_model = grid_search_log.best_estimator_

# An unseeded random forest gives different CV scores on every run, so the
# "best" hyper-parameters (and the tuned model) would not be reproducible.
# Search over a copy of rf_model's configuration with a fixed seed, unless
# rf_model already carries one.
rf_search_params = rf_model.get_params()
if rf_search_params.get('random_state') is None:
    rf_search_params['random_state'] = 42
grid_search_rf = GridSearchCV(
    RandomForestClassifier(**rf_search_params), rf_param_grid, cv=5, scoring='f1_weighted'
)
grid_search_rf.fit(X_train, y_train)
best_rf_model = grid_search_rf.best_estimator_

grid_search_xgb = GridSearchCV(xgb_model, xgb_param_grid, cv=5, scoring='f1_weighted')
grid_search_xgb.fit(X_train, y_train)
best_xgb_model = grid_search_xgb.best_estimator_

# Display name -> best estimator found by each search (refit on all of X_train).
tuned_models = {
    "Tuned Logistic Regression": best_log_model,
    "Tuned Random Forest": best_rf_model,
    "Tuned XGBoost": best_xgb_model
}

In [None]:
# Every figure below is derived from the fitted models and the current test
# split. The previous version pasted confusion-matrix counts and per-class
# scores from an earlier run, which no longer matched the live accuracy / F1
# printed next to them once the models were retrained.
class_names = [str(label) for label in label_encoder.classes_]
class_ids = list(range(len(class_names)))

print("Analysis of Model Performance:")

accuracy_by_model = {}
hardest_by_model = {}
easiest_by_model = {}

for name, model in models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # Fix the label order so row/column i always refers to class_names[i].
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    report = classification_report(
        y_test, y_pred, labels=class_ids, target_names=class_names,
        output_dict=True, zero_division=0,
    )

    print(f"\n--- {name} Analysis ---")
    print("Accuracy:", acc)
    print("F1 Score:", f1)

    print("Confusion Matrix Analysis:")
    for i, actual in enumerate(class_names):
        # Row i of the matrix holds the samples whose true class is `actual`.
        missed = ", ".join(
            f"{cm[i, j]} misclassified as '{predicted}'"
            for j, predicted in enumerate(class_names)
            if j != i
        )
        print(f"- '{actual}' ({i}): {cm[i, i]} correctly predicted, {missed}")

    print("Classification Report Analysis:")
    for cls in class_names:
        cls_scores = report[cls]
        print(
            f"- '{cls}': precision {cls_scores['precision']:.2f}, "
            f"recall {cls_scores['recall']:.2f}, F1-score {cls_scores['f1-score']:.2f}"
        )
    hardest = min(class_names, key=lambda cls: report[cls]['f1-score'])
    easiest = max(class_names, key=lambda cls: report[cls]['f1-score'])
    print(f"- '{hardest}' has the lowest F1-score, making it the most challenging class for this model.")
    print(f"- '{easiest}' has the highest F1-score.")

    accuracy_by_model[name] = acc
    hardest_by_model[name] = (hardest, report[hardest]['f1-score'])
    easiest_by_model[name] = (easiest, report[easiest]['f1-score'])

print("\nOverall Summary:")
if accuracy_by_model:
    print(
        f"Accuracies range from {min(accuracy_by_model.values()):.1%} "
        f"to {max(accuracy_by_model.values()):.1%}."
    )
    for name in accuracy_by_model:
        hard_cls, hard_f1 = hardest_by_model[name]
        easy_cls, easy_f1 = easiest_by_model[name]
        print(
            f"- {name}: hardest class '{hard_cls}' (F1 {hard_f1:.2f}), "
            f"easiest class '{easy_cls}' (F1 {easy_f1:.2f})"
        )

Analysis of Model Performance:

--- Logistic Regression Analysis ---
Accuracy: 0.768361581920904
F1 Score: 0.7531276658422533
Confusion Matrix Analysis:
- 'Dropout' (0): 218 correctly predicted, 29 misclassified as 'Enrolled', 37 misclassified as 'Graduate'
- 'Enrolled' (1): 53 correctly predicted, 43 misclassified as 'Dropout', 63 misclassified as 'Graduate'
- 'Graduate' (2): 409 correctly predicted, 14 misclassified as 'Dropout', 19 misclassified as 'Enrolled'
Classification Report Analysis:
- 'Enrolled' class has the lowest precision (0.52), recall (0.33), and F1-score (0.41), indicating it's the most challenging to predict for this model.
- 'Graduate' class has the highest recall (0.93) and F1-score (0.86).

--- Random Forest Analysis ---
Accuracy: 0.7717514124293785
F1 Score: 0.7588760003551247
Confusion Matrix Analysis:
- 'Dropout' (0): 214 correctly predicted, 25 misclassified as 'Enrolled', 45 misclassified as 'Graduate'
- 'Enrolled' (1): 60 correctly predicted, 36 misclassifie

In [None]:
from sklearn.model_selection import GridSearchCV

# Search spaces for the grid search, one dictionary per classifier.
# Each grid gets its own list objects so none of them share state.

regularization_strengths = [0.001, 0.01, 0.1, 1, 10, 100]
log_param_grid = {'C': regularization_strengths, 'penalty': ['l2']}

# max_depth=None lets the trees grow without a depth limit.
rf_param_grid = {}
rf_param_grid['n_estimators'] = [100, 200, 500]
rf_param_grid['max_depth'] = [None, 10, 20, 30]
rf_param_grid['min_samples_split'] = [2, 5, 10]

xgb_param_grid = {}
xgb_param_grid['n_estimators'] = [100, 200, 500]
xgb_param_grid['learning_rate'] = [0.01, 0.1, 0.2]
xgb_param_grid['max_depth'] = [3, 5, 7]

In [None]:
# Cross-validated (5-fold) grid search on the training split, ranked by weighted
# F1. GridSearchCV fits clones of the estimator passed in, so the baseline
# models stay as they were.
grid_search_log = GridSearchCV(log_model, log_param_grid, cv=5, scoring='f1_weighted')
grid_search_log.fit(X_train, y_train)
best_log_model = grid_search_log.best_estimator_

# Without a seed the forest's CV scores differ from run to run, so the selected
# hyper-parameters would not be reproducible. Use rf_model's configuration with
# a fixed seed as the search template, keeping any seed rf_model already has.
rf_template_params = dict(rf_model.get_params())
if rf_template_params.get('random_state') is None:
    rf_template_params['random_state'] = 42
rf_search_template = RandomForestClassifier(**rf_template_params)

grid_search_rf = GridSearchCV(rf_search_template, rf_param_grid, cv=5, scoring='f1_weighted')
grid_search_rf.fit(X_train, y_train)
best_rf_model = grid_search_rf.best_estimator_

grid_search_xgb = GridSearchCV(xgb_model, xgb_param_grid, cv=5, scoring='f1_weighted')
grid_search_xgb.fit(X_train, y_train)
best_xgb_model = grid_search_xgb.best_estimator_

# Display name -> best estimator from each search.
tuned_models = {
    "Tuned Logistic Regression": best_log_model,
    "Tuned Random Forest": best_rf_model,
    "Tuned XGBoost": best_xgb_model
}

In [None]:
# Evaluate the tuned estimators on the held-out split and collect their scores.
results_tuned = []

for name, model in tuned_models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"\nModel: {name}")
    print("Accuracy:", acc)
    print("F1 Score:", f1)
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    results_tuned.append({'Model': name, 'Accuracy': acc, 'F1 Score': f1})

results_tuned_df = pd.DataFrame(results_tuned)

# Show the baseline and tuned score tables one after the other.
print("\n--- Comparison of Original and Tuned Model Performance ---")
for heading, score_table in (
    ("\nOriginal Models:", results_df),
    ("\nTuned Models:", results_tuned_df),
):
    print(heading)
    display(score_table)


Model: Tuned Logistic Regression
Accuracy: 0.768361581920904
F1 Score: 0.7531276658422533
Confusion Matrix:
 [[218  29  37]
 [ 43  53  63]
 [ 14  19 409]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.77      0.78       284
           1       0.52      0.33      0.41       159
           2       0.80      0.93      0.86       442

    accuracy                           0.77       885
   macro avg       0.71      0.68      0.68       885
weighted avg       0.75      0.77      0.75       885


Model: Tuned Random Forest
Accuracy: 0.7740112994350282
F1 Score: 0.7588214872942213
Confusion Matrix:
 [[218  19  47]
 [ 41  56  62]
 [  9  22 411]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.77      0.79       284
           1       0.58      0.35      0.44       159
           2       0.79      0.93      0.85       442

    accuracy                           0.77   

Unnamed: 0,Model,Accuracy,F1 Score
0,Logistic Regression,0.768362,0.753128
1,Random Forest,0.771751,0.758876
2,XGBoost,0.767232,0.762175



Tuned Models:


Unnamed: 0,Model,Accuracy,F1 Score
0,Tuned Logistic Regression,0.768362,0.753128
1,Tuned Random Forest,0.774011,0.758821
2,Tuned XGBoost,0.753672,0.748331


In [None]:
# Anomaly Detection Model
from sklearn.ensemble import IsolationForest
import joblib

# Requires the feature frame X built in an earlier cell.

# Destination folder is set here so the cell does not depend on an earlier
# definition of model_path.
model_path = "/content/drive/MyDrive/EduPredict_Project/models"

# Isolation forest on the unscaled features; contamination=0.05 is the expected
# share of outliers and the seed keeps the fit repeatable.
anom_model = IsolationForest(contamination=0.05, random_state=42)
anom_model.fit(X)

# Persist the detector; joblib.dump's return value is the cell output.
anomaly_model_file = f"{model_path}/anomaly_model.pkl"
joblib.dump(anom_model, anomaly_model_file)

['/content/drive/MyDrive/EduPredict_Project/models/anomaly_model.pkl']

In [None]:
# Academic Trend Prediction (Grade Progression)
from sklearn.linear_model import LinearRegression
import joblib

# Requires df from an earlier cell, including both semester-grade columns.

# Rows qualify only when both semester grades are present.
has_first_grade = df["Curricular units 1st sem (grade)"].notnull()
has_second_grade = df["Curricular units 2nd sem (grade)"].notnull()
df_trend = df.loc[has_first_grade & has_second_grade]

# Single predictor (1st-semester grade) -> target (2nd-semester grade).
y_trend = df_trend["Curricular units 2nd sem (grade)"]
X_trend = df_trend[["Curricular units 1st sem (grade)"]]

trend_model = LinearRegression().fit(X_trend, y_trend)

# Destination folder is set here so the cell does not depend on an earlier
# definition of model_path.
model_path = "/content/drive/MyDrive/EduPredict_Project/models"
joblib.dump(trend_model, f"{model_path}/trend_model.pkl")

['/content/drive/MyDrive/EduPredict_Project/models/trend_model.pkl']

In [None]:
import joblib
import pandas as pd

# Requires tuned_models, results_tuned_df, scaler and label_encoder from the
# earlier training / evaluation cells.
model_path = "/content/drive/MyDrive/EduPredict_Project/models"

# Save the tuned classification models. File names are the display names
# lower-cased with spaces replaced by underscores.
for name, model in tuned_models.items():
    model_filename = f"{name.replace(' ', '_').lower()}_model.pkl"
    joblib.dump(model, f"{model_path}/{model_filename}")
    print(f"Saved {name} to {model_path}/{model_filename}")

# The classifiers were trained on standardized features and integer-encoded
# labels. Without the exact fitted scaler and label encoder, the saved models
# cannot be applied to raw feature rows or have their predictions mapped back
# to class names, so persist both alongside the models.
preprocessing_artifacts = {
    "scaler": scaler,
    "label_encoder": label_encoder,
}
for artifact_name, artifact in preprocessing_artifacts.items():
    artifact_file = f"{model_path}/{artifact_name}.pkl"
    joblib.dump(artifact, artifact_file)
    print(f"Saved {artifact_name} to {artifact_file}")

# Update the model comparison report
report_path = "/content/drive/MyDrive/EduPredict_Project/reports"
results_tuned_df.to_csv(f"{report_path}/model_comparison_tuned.csv", index=False)

print(f"\nUpdated model comparison report saved to {report_path}/model_comparison_tuned.csv")

Saved Tuned Logistic Regression to /content/drive/MyDrive/EduPredict_Project/models/tuned_logistic_regression_model.pkl
Saved Tuned Random Forest to /content/drive/MyDrive/EduPredict_Project/models/tuned_random_forest_model.pkl
Saved Tuned XGBoost to /content/drive/MyDrive/EduPredict_Project/models/tuned_xgboost_model.pkl

Updated model comparison report saved to /content/drive/MyDrive/EduPredict_Project/reports/model_comparison_tuned.csv
