In [13]:
import pandas as pd
import sys
import os

# Make the parent directory importable so `src.*` modules resolve from this
# notebook. Guarded so that re-running the cell does not keep appending
# duplicate entries to sys.path.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

# Load the dataset written by the Feature Engineering step
df = pd.read_csv("../data/StudentsPerformance_features.csv")

# Check columns, then preview the first rows as the cell's rich output
print(df.columns)
df.head()

Index(['math score', 'reading score', 'writing score', 'results',
       'results_binary', 'result_binary', 'gender_male',
       'race/ethnicity_group B', 'race/ethnicity_group C',
       'race/ethnicity_group D', 'race/ethnicity_group E',
       'parental level of education_bachelor's degree',
       'parental level of education_high school',
       'parental level of education_master's degree',
       'parental level of education_some college',
       'parental level of education_some high school', 'lunch_standard',
       'test preparation course_none', 'total_score', 'avg_score',
       'performance_category'],
      dtype='object')


Unnamed: 0,math score,reading score,writing score,results,results_binary,result_binary,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,...,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_standard,test preparation course_none,total_score,avg_score,performance_category
0,0.390024,0.193999,0.391492,Pass,1,1,False,True,False,False,...,True,False,False,False,False,True,True,0.975514,0.325171,Low
1,0.192076,1.427476,1.313269,Pass,1,1,False,False,True,False,...,False,False,False,True,False,True,False,2.93282,0.977607,Low
2,1.577711,1.770109,1.642475,Pass,1,1,False,True,False,False,...,False,False,True,False,False,True,True,4.990295,1.663432,Low
3,-1.259543,-0.833899,-1.583744,Pass,1,1,True,False,False,False,...,False,False,False,False,False,False,True,-3.677187,-1.225729,
4,0.653954,0.605158,0.457333,Pass,1,1,True,False,True,False,...,False,False,False,True,False,True,True,1.716445,0.572148,Low


## Load Feature-Engineered Data

- This notebook uses the feature-engineered dataset saved in the previous step.
- All categorical variables are encoded, numerical features are scaled, and additional features like `total_score` and `performance_category` are included.
- The target column is `result_binary` (1 = Pass, 0 = Fail).


In [14]:
# Name of the binary label column to predict.
target = "result_binary"

# Predictors: only the one-hot encoded categorical columns, grouped here by
# the original column each dummy was derived from. Order is preserved.
feature_cols = (
    # gender
    ["gender_male"]
    # race/ethnicity
    + [
        "race/ethnicity_group B",
        "race/ethnicity_group C",
        "race/ethnicity_group D",
        "race/ethnicity_group E",
    ]
    # parental level of education
    + [
        "parental level of education_bachelor's degree",
        "parental level of education_high school",
        "parental level of education_master's degree",
        "parental level of education_some college",
        "parental level of education_some high school",
    ]
    # lunch / test preparation
    + ["lunch_standard", "test preparation course_none"]
)

# Feature matrix and label vector used by the cells below.
X = df.loc[:, feature_cols]
y = df[target]


## Split Features and Target

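- `X` contains only the one-hot encoded categorical predictors listed in `feature_cols`.
- `y` is the target variable (`result_binary`).
- The label columns `results` (original string) and `results_binary` are excluded from `X` to avoid leakage; the score columns and the score-derived features (`total_score`, `avg_score`, `performance_category`) are likewise not selected.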


In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


## Train/Test Split

- The dataset is split into training (80%) and testing (20%) sets.
- `stratify=y` ensures the target class distribution is preserved in both sets.
- Random state is set for reproducibility.


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Candidate classifiers, keyed by the display name used when reporting results.
# Seeds are fixed on the stochastic estimators so runs are reproducible.
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is intentionally not passed: the parameter is
    # deprecated in XGBoost and only triggers a warning on current releases.
    "XGBoost": XGBClassifier(
        n_estimators=100,
        random_state=42,
        eval_metric="logloss",
    ),
}


## Model Selection

- Three models are chosen for comparison: Logistic Regression, Random Forest, and XGBoost.
- Logistic Regression: interpretable linear model.
- Random Forest: tree-based ensemble, handles non-linear relationships well.
- XGBoost: gradient boosting model, often achieves state-of-the-art performance.


In [17]:
# Class balance of the target: absolute counts first, then proportions.
for as_fraction in (False, True):
    print(y.value_counts(normalize=as_fraction))


result_binary
1    971
0     29
Name: count, dtype: int64
result_binary
1    0.971
0    0.029
Name: proportion, dtype: float64


In [19]:
import joblib
from src.pipeline import train_pipeline

# Fit the project pipeline on the feature-engineered frame.
# NOTE(review): `train_pipeline` lives in src/pipeline.py (not visible here) and
# receives the full `df`, including the score columns and `results` /
# `results_binary` — confirm it selects leakage-free features internally.
trained_pipeline = train_pipeline(df)

# Persist the fitted pipeline; joblib.dump returns the list of written paths,
# which is shown as the cell output.
model_path = "../data/random_forest_pipeline.pkl"
joblib.dump(trained_pipeline, model_path)


Validation F1 Score: 0.9526


['../data/random_forest_pipeline.pkl']