In [6]:
import pandas as pd
import joblib
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score


# Location of the feature-engineered dataset, relative to this notebook.
features_csv = "../data/StudentsPerformance_features.csv"
df = pd.read_csv(features_csv)

# Quick sanity check: list the available columns, then preview the first rows.
print(df.columns)
df.head()


Index(['math score', 'reading score', 'writing score', 'results',
       'results_binary', 'result_binary', 'gender_male',
       'race/ethnicity_group B', 'race/ethnicity_group C',
       'race/ethnicity_group D', 'race/ethnicity_group E',
       'parental level of education_bachelor's degree',
       'parental level of education_high school',
       'parental level of education_master's degree',
       'parental level of education_some college',
       'parental level of education_some high school', 'lunch_standard',
       'test preparation course_none', 'total_score', 'avg_score',
       'performance_category'],
      dtype='object')


Unnamed: 0,math score,reading score,writing score,results,results_binary,result_binary,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,...,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_standard,test preparation course_none,total_score,avg_score,performance_category
0,0.390024,0.193999,0.391492,Pass,1,1,False,True,False,False,...,True,False,False,False,False,True,True,0.975514,0.325171,Low
1,0.192076,1.427476,1.313269,Pass,1,1,False,False,True,False,...,False,False,False,True,False,True,False,2.93282,0.977607,Low
2,1.577711,1.770109,1.642475,Pass,1,1,False,True,False,False,...,False,False,True,False,False,True,True,4.990295,1.663432,Low
3,-1.259543,-0.833899,-1.583744,Pass,1,1,True,False,False,False,...,False,False,False,False,False,False,True,-3.677187,-1.225729,
4,0.653954,0.605158,0.457333,Pass,1,1,True,False,True,False,...,False,False,False,True,False,True,True,1.716445,0.572148,Low


## Load Feature-Engineered Data

- Load the dataset processed from the Feature Engineering step.
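- Most categorical variables are one-hot encoded, numerical features are scaled, and engineered features are included; `performance_category` is still a string column and is encoded later, just before model fitting.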
- Target column: `result_binary` (1 = Pass, 0 = Fail).


In [2]:
# ----------------------
# 3️⃣ Split Features and Target
# ----------------------
target_col = 'result_binary'

# Columns that restate the label and therefore must not be used as predictors:
#   - 'results'        : the original Pass/Fail string column
#   - 'results_binary' : appears to be a second binary encoding of the same
#                        label (identical to the target in the preview rows);
#                        leaving it in X would leak the answer to the model.
label_leak_cols = ['results', 'results_binary']

y = df[target_col]
X = df.drop(columns=[target_col, *label_leak_cols])

# Guard against the label sneaking back into the feature matrix.
assert not set(X.columns) & {target_col, *label_leak_cols}

# Optional: check class balance
print("Target distribution:\n", y.value_counts())


Target distribution:
 result_binary
1    971
0     29
Name: count, dtype: int64


## Split Features and Target

- `X` contains all predictors (numerical + encoded categorical + engineered features).
- `y` is the target variable `result_binary`.
- Original string column `results` is dropped to avoid leakage.


In [3]:
# Baseline estimators handed to the grid searches below.
# Both use the same fixed seed so repeated runs give the same results.
seed = 42

rf = RandomForestClassifier(
    random_state=seed,
)
lr = LogisticRegression(
    max_iter=500,  # raised iteration cap for the solver
    random_state=seed,
)

## Baseline Models for Tuning

- Random Forest Classifier and Logistic Regression are selected for hyperparameter tuning.
- Random Forest: ensemble tree-based model suitable for non-linear relationships.
- Logistic Regression: linear model, interpretable and quick to train.


In [None]:
# One-hot encode any remaining string columns so scikit-learn receives a
# purely numeric/boolean feature matrix.
cat_cols = X.select_dtypes(include='object').columns
print("Categorical columns to encode:", list(cat_cols))

X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Search space: 3 * 3 * 3 * 3 * 2 = 162 candidate combinations.
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Stratified, shuffled 5-fold CV; reused by the Logistic Regression search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Macro-averaged F1 weights the "Fail" (0) and "Pass" (1) classes equally.
# The default binary F1 only scores label 1, which is the majority class here
# (971 vs 29 samples), so it would say almost nothing about how well the rare
# class is recognised.
f1_scorer = make_scorer(f1_score, average='macro')

# `rf` is the RandomForestClassifier(random_state=42) created in the baseline
# models cell; GridSearchCV clones it for every fit, so it is not mutated.
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    cv=cv,
    scoring=f1_scorer,
    n_jobs=-1,   # use all available cores
    verbose=2
)

rf_grid.fit(X_encoded, y)

# Best candidate, refit on the full data by GridSearchCV (refit=True default).
best_rf = rf_grid.best_estimator_
print("Best Random Forest Hyperparameters:", rf_grid.best_params_)


Categorical columns to encode: ['performance_category']
Fitting 5 folds for each of 162 candidates, totalling 810 fits


## Random Forest Hyperparameter Tuning

- Used GridSearchCV with 5-fold Stratified Cross-Validation to tune hyperparameters.
- Hyperparameters tuned:
    - `n_estimators`: number of trees
    - `max_depth`: maximum depth of tree
    - `min_samples_split`: minimum samples to split a node
    - `min_samples_leaf`: minimum samples in a leaf
    - `max_features`: number of features to consider for split
- F1 Score used as the scoring metric to account for class imbalance.


In [None]:
# Search space for Logistic Regression: 4 values of C x 2 solvers, with the
# penalty fixed to L2 (supported by both solvers).
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear']
}

# Same CV splitter and scorer as the Random Forest search so the two tuned
# models are compared on equal terms.
lr_grid = GridSearchCV(
    estimator=lr,
    param_grid=lr_param_grid,
    cv=cv,
    scoring=f1_scorer,
    n_jobs=-1,
    verbose=2
)

# Fit on the one-hot encoded matrix: the raw `X` still contains the string
# column 'performance_category', which LogisticRegression cannot convert to
# float and would raise on.
lr_grid.fit(X_encoded, y)

best_lr = lr_grid.best_estimator_
print("Best Logistic Regression Hyperparameters:", lr_grid.best_params_)

## Logistic Regression Hyperparameter Tuning

- GridSearchCV used with same 5-fold stratified cross-validation.
- Hyperparameters tuned:
    - `C`: inverse regularization strength
    - `solver`: optimization algorithm
- F1 Score used as metric.


In [7]:
import os
import joblib

# Destination of the serialized Random Forest; its parent directory is
# created first so the dump cannot fail on a missing folder.
rf_model_path = "../data/Random_Forest_tuned.pkl"
os.makedirs(os.path.dirname(rf_model_path), exist_ok=True)

# Persist the tuned Random Forest produced by the grid search.
joblib.dump(best_rf, rf_model_path)
print("Model saved successfully!")


Model saved successfully!


## Save Tuned Models

- The best hyperparameter-tuned Random Forest model is saved using `joblib`.
- The saved model can be loaded later in the evaluation notebook to test performance on unseen data.
