# eda

In [3]:
import pandas as pd

In [7]:
# Relative path: resolved against the kernel's current working directory.
datasetPath = "StudentsPerformance.csv"

# Raw dataset; later cells select feature and target columns from this frame.
df = pd.read_csv(datasetPath)

In [8]:
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


Use the categorical columns as features and the math score as the target (the value to predict).

## preprocess

In [35]:
# Model inputs: the five categorical columns, in the order they appear in the dataset.
features = df.loc[:, [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]]

# Model output: selected with a list so it stays a one-column DataFrame, not a Series.
target = df.loc[:, ['math score']]

In [36]:
features.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course
0,female,group B,bachelor's degree,standard,none
1,female,group C,some college,standard,completed
2,female,group B,master's degree,standard,none
3,male,group A,associate's degree,free/reduced,none
4,male,group C,some college,standard,none


In [37]:
target.head()

Unnamed: 0,math score
0,72
1,69
2,90
3,47
4,76


In [40]:
from sklearn.preprocessing import LabelEncoder

# One independent encoder per feature column, keyed by column name so the
# same string -> integer mapping can be reused at inference time.
labelEncoderStore = {name: LabelEncoder() for name in features.columns}

# Encode into a copy so the original string-valued frame stays untouched.
X_encoded = features.copy()
for name, encoder in labelEncoderStore.items():
    X_encoded[name] = encoder.fit_transform(X_encoded[name])

In [44]:
# Show the learned classes per column; a class's position is its integer code.
for name, encoder in labelEncoderStore.items():
    print(name, encoder.classes_)

gender ['female' 'male']
race/ethnicity ['group A' 'group B' 'group C' 'group D' 'group E']
parental level of education ["associate's degree" "bachelor's degree" 'high school' "master's degree"
 'some college' 'some high school']
lunch ['free/reduced' 'standard']
test preparation course ['completed' 'none']


In [48]:
X_encoded.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course
0,0,1,1,1,1
1,0,2,4,1,0
2,0,1,3,1,1
3,1,0,0,0,1
4,1,2,4,1,1


In [49]:
features.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course
0,female,group B,bachelor's degree,standard,none
1,female,group C,some college,standard,completed
2,female,group B,master's degree,standard,none
3,male,group A,associate's degree,free/reduced,none
4,male,group C,some college,standard,none


In [50]:
target.head()

Unnamed: 0,math score
0,72
1,69
2,90
3,47
4,76


In [55]:
X_encoded.shape, target.shape

((1000, 5), (1000, 1))

In [47]:
# X = X_encoded.to_numpy()
# y = target.to_numpy()

# X.shape, y.shape

In [65]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
# Stratification uses every encoded column except the last three (with the
# five features selected above: gender and race/ethnicity).
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    target,
    test_size=0.2,
    random_state=1,
    stratify=X_encoded.iloc[:, :-3],
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 5), (200, 5), (800, 1), (200, 1))

In [70]:
X_train.head(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course
744,1,1,4,0,1
679,1,3,4,0,1


In [72]:
X_test.head(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course
247,0,1,2,1,0
35,1,4,0,1,0


In [73]:
y_train.head(2)

Unnamed: 0,math score
744,55
679,63


In [74]:
y_test.head(2)

Unnamed: 0,math score
247,58
35,81


## train RFRegressor

In [75]:
from sklearn.ensemble import RandomForestRegressor


# Forest of 100 trees; random_state fixes the forest's randomness so runs are repeatable.
model = RandomForestRegressor(n_estimators=100, random_state=42)
# y_train is a one-column DataFrame. Pass that column as a 1-D Series so
# scikit-learn does not warn that a column-vector y was given where a
# 1-D array of shape (n_samples,) is expected.
model.fit(X_train, y_train["math score"])

  return fit_method(estimator, *args, **kwargs)


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [76]:
from sklearn.metrics import mean_squared_error

# Score the held-out split: mean squared error between true and predicted math scores.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Test MSE: {mse:.2f}")

Test MSE: 245.97


In [82]:
# Side-by-side look at the first ten test rows: prediction, true score, absolute error.
for predicted, actual in zip(y_pred[:10], y_test.iloc[:10, 0]):
    print(predicted, actual, abs(predicted - actual))

71.35172222222222 58 13.351722222222222
76.91527453102452 81 4.084725468975478
61.29440834810374 63 1.7055916518962633
64.29685714285715 91 26.70314285714285
46.9716053113553 50 3.0283946886447026
61.99713754172386 48 13.997137541723859
63.773642857142875 87 23.226357142857125
66.13303518649292 69 2.8669648135070815
64.58573105932857 58 6.585731059328566
77.64711291486292 76 1.647112914862916


## Save the model, then reload it and test prediction again

In [83]:
# === Save model and encoders ===
import joblib


# Persist the fitted model and the per-column encoders next to the notebook.
for artifact, filename in (
    (model, "random_forest_model.joblib"),
    (labelEncoderStore, "label_encoders.joblib"),
):
    joblib.dump(artifact, filename)
print("Model and encoders saved.")

Model and encoders saved.


In [84]:
# test loading

# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
test_model = joblib.load("random_forest_model.joblib")
# Dict of feature column name -> fitted LabelEncoder, as dumped in the save cell above.
label_encoders = joblib.load("label_encoders.joblib")

In [85]:
test_model

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [87]:
for col in label_encoders:
    print(col, label_encoders[col].classes_)

gender ['female' 'male']
race/ethnicity ['group A' 'group B' 'group C' 'group D' 'group E']
parental level of education ["associate's degree" "bachelor's degree" 'high school' "master's degree"
 'some college' 'some high school']
lunch ['free/reduced' 'standard']
test preparation course ['completed' 'none']


In [91]:
X_test.shape

(200, 5)

In [93]:
import numpy as np


# One hand-built sample with every feature set to encoded class 1.
# Kept as a plain (1, 5) ndarray because later cells pass it around as an array.
testArr = np.array([[1, 1, 1, 1, 1]])

# The model was fitted on a DataFrame, so predict with the same column names.
# A bare ndarray makes scikit-learn warn that X has no valid feature names.
test_model.predict(pd.DataFrame(testArr, columns=test_model.feature_names_in_))



array([62.90832143])

In [98]:
label_encoders.keys()

dict_keys(['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course'])

In [99]:
# Column position in an encoded sample -> name of the feature stored there.
label_encoders_column_mapping = dict(enumerate([
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]))

In [107]:
from typing import List


def get_features(single_X_input: np.ndarray) -> List[str]:
    """Decode one row of label-encoded features back to category strings.

    Relies on two notebook-level objects: ``label_encoders_column_mapping``
    (column position -> column name) and ``label_encoders`` (column name ->
    fitted encoder exposing ``classes_``).

    Args:
        single_X_input: Integer array of shape ``(1, n_features)`` holding a
            single encoded sample, with columns ordered as in
            ``label_encoders_column_mapping``.

    Returns:
        The decoded category of every column, in column order, as plain ``str``.

    Raises:
        AssertionError: If the input is not one row with one code per mapped column.
        IndexError: If a code is not a valid class index for its column.
    """
    expected_shape = (1, len(label_encoders_column_mapping))
    assert single_X_input.shape == expected_shape, (
        f"expected shape {expected_shape}, got {single_X_input.shape}"
    )

    res = []
    for i, code in enumerate(single_X_input.ravel()):
        colName = label_encoders_column_mapping[i]
        classes = label_encoders[colName].classes_
        # Plain indexing would let a negative code wrap around and silently
        # return a class from the end, so bounds are checked explicitly.
        if not 0 <= code < len(classes):
            raise IndexError(
                f"code {code} is out of range for column {colName!r} "
                f"with {len(classes)} classes"
            )
        # str() turns a numpy string scalar into a plain Python str.
        res.append(str(classes[code]))

    return res

In [108]:
testArr

array([[1, 1, 1, 1, 1]])

In [109]:
get_features(testArr)

['male', 'group B', "bachelor's degree", 'standard', 'none']

In [110]:
labelEncoderStore

{'gender': LabelEncoder(),
 'race/ethnicity': LabelEncoder(),
 'parental level of education': LabelEncoder(),
 'lunch': LabelEncoder(),
 'test preparation course': LabelEncoder()}

In [111]:
# Rename encoder keys to the underscore-style names used by the sample input
# dict in the following cells. The membership guard keeps the cell idempotent:
# re-running it after the old keys are gone is a no-op instead of a KeyError.
for old_key, new_key in (
    ("race/ethnicity", "race_ethnicity"),
    ("parental level of education", "parental_level_of_education"),
    ("test preparation course", "test_preparation_course"),
):
    if old_key in labelEncoderStore:
        labelEncoderStore[new_key] = labelEncoderStore.pop(old_key)

In [None]:
{
  "gender": "female",
  "race_ethnicity": "group A",
  "parental_level_of_education": "associate's degree",
  "lunch": "free/reduced",
  "test_preparation_course": "completed"
}

In [112]:
labelEncoderStore

{'gender': LabelEncoder(),
 'lunch': LabelEncoder(),
 'race_ethnicity': LabelEncoder(),
 'parental_level_of_education': LabelEncoder(),
 'test_preparation_course': LabelEncoder()}

In [113]:
# Overwrites the label_encoders.joblib written earlier; the stored dict now
# uses the renamed underscore-style keys.
joblib.dump(labelEncoderStore, "label_encoders.joblib")


['label_encoders.joblib']

In [114]:
d = {
  "gender": "female",
  "race_ethnicity": "group A",
  "parental_level_of_education": "associate's degree",
  "lunch": "free/reduced",
  "test_preparation_course": "completed"
}

In [115]:
d["gender"]

'female'

In [119]:
q = labelEncoderStore["gender"].transform([d["gender"]])

In [121]:
q.item()

0

In [126]:
# Single sample with every feature set to encoded class 0; kept as a (1, 5) ndarray.
a = np.array([
    [0,0,0,0,0]
])

# The model was fitted on a DataFrame, so predict with the same column names.
# A bare ndarray makes scikit-learn warn that X has no valid feature names.
model.predict(pd.DataFrame(a, columns=model.feature_names_in_)).item()



57.851499999999994

In [125]:
print(a)

[[0 0 0 0 0]]


In [124]:
a.shape

(1, 5)

In [None]:
mode