In [30]:
import pandas as pd
import numpy as np 

# Read the dataset written by the EDA step, list its columns, and preview the
# first rows (the trailing expression is rendered as the cell output).
processed_csv = '../data/StudentsPerformance_processed.csv'
df = pd.read_csv(processed_csv)
print(df.columns)
df.head()

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score', 'results', 'results_binary'],
      dtype='object')


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,results,results_binary
0,female,group B,bachelor's degree,standard,none,72,72,74,Pass,1
1,female,group C,some college,standard,completed,69,90,88,Pass,1
2,female,group B,master's degree,standard,none,90,95,93,Pass,1
3,male,group A,associate's degree,free/reduced,none,47,57,44,Pass,1
4,male,group C,some college,standard,none,76,78,75,Pass,1


## Load Processed Data

- The dataset processed in the EDA step is loaded.
- Includes the `results` column (Pass/Fail) created in the EDA step, which serves as the target variable.
- All features are retained from the EDA step, including numerical and categorical features.


In [32]:
# Encode the Pass/Fail target as 1/0 so classification models can consume it.
# NOTE(review): the loaded CSV already contains a `results_binary` column (see
# the column listing printed by the load cell), so `result_binary` is a
# near-duplicate target. Confirm which name the modeling step uses and drop the
# other one from the feature matrix, otherwise the target leaks into the features.
target_map = {'Pass': 1, 'Fail': 0}
encoded_target = df['results'].map(target_map)

# Series.map() silently yields NaN for labels missing from the mapping (and for
# missing values); fail loudly instead of saving a target with holes in it.
unmapped_rows = encoded_target.isna()
if unmapped_rows.any():
    bad_labels = df.loc[unmapped_rows, 'results'].unique().tolist()
    raise ValueError(f"Unexpected values in 'results' column: {bad_labels}")

df['result_binary'] = encoded_target

## Encode Target

- The `results` column is converted from categorical ('Pass'/'Fail') to numeric (1/0) and stored as `result_binary` for modeling.
- `1` represents Pass, `0` represents Fail.
- This allows compatibility with classification models.


In [33]:
# Nominal columns that get expanded into indicator (dummy) columns.
categorical_cols = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]

# One-hot encode; drop_first=True omits the first level of every feature so
# the remaining indicator columns are not perfectly collinear.
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)


## Encode Categorical Features

- Categorical features are converted to numeric using One-Hot Encoding.
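- `drop_first=True` drops the first level of each feature, which avoids perfect multicollinearity among the dummy columns (the "dummy variable trap").
- After this step the categorical features are numeric and ready for machine learning models.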


In [34]:
from sklearn.preprocessing import StandardScaler

# Exam-score columns to standardize.
numerical_cols = ['math score', 'reading score', 'writing score']

# Standardize each score column and write the result back over the original
# columns in `df`.
# NOTE(review): the scaler is fit on the full dataset. If a train/test split is
# made later, test-row statistics have already influenced the scaling; consider
# fitting on the training rows only inside the modeling pipeline.
scaler = StandardScaler()
scaled_scores = scaler.fit_transform(df[numerical_cols])
df[numerical_cols] = scaled_scores


## Scale Numerical Features

- Numerical features are scaled using StandardScaler.
- Scaling standardizes features to have mean=0 and std=1.
- Essential for models sensitive to feature magnitude (e.g., Logistic Regression, SVM).
- Tree-based models (RandomForest, XGBoost) do not require scaling, but it is included here for completeness.


In [35]:
# The score columns in `df` were standardized in place by the scaling step, so
# the aggregate features must be rebuilt from the raw scores. Summing z-scores
# and binning them with 0/50/70/100 edges would label every positive z-score
# 'Low' and leave all other rows NaN.
score_cols = ['math score', 'reading score', 'writing score']
raw_scores = pd.read_csv('../data/StudentsPerformance_processed.csv', usecols=score_cols)

# Column assignment aligns on the index, so the raw rows must line up with df.
if not raw_scores.index.equals(df.index):
    raise ValueError("Raw scores are not row-aligned with df; reload df from the processed CSV first.")

# Total Score (raw scale)
df['total_score'] = raw_scores.sum(axis=1)

# Average Score (raw scale)
# NOTE: total_score and avg_score are kept on the original score scale; scale
# them in the modeling pipeline if a magnitude-sensitive model uses them.
df['avg_score'] = df['total_score'] / len(score_cols)

# Performance Category
# include_lowest=True closes the first bin on the left so an average of exactly
# 0 is labelled 'Low' instead of becoming NaN.
df['performance_category'] = pd.cut(df['avg_score'],
                                    bins=[0, 50, 70, 100],
                                    labels=['Low', 'Medium', 'High'],
                                    include_lowest=True)


## Feature Creation

- `total_score` = sum of math, reading, and writing scores.
- `avg_score` = average of all three scores.
- `performance_category` = categorical binning of average score into Low / Medium / High.
- These features capture overall academic performance and non-linear relationships.


In [40]:
# Write the engineered feature table to disk for modeling; the row index is
# not written to the file.
features_csv = "../data/StudentsPerformance_features.csv"
df.to_csv(features_csv, index=False)
