# Predicting Student GPA
Using a random forest regression model

## Imports

In [3]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the student performance records from a CSV in the working directory
csv_path = "Student_performance_data.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows to sanity-check the columns that were loaded
df.head(5)

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


## Data Cleaning & Preprocessing
Removing columns that are clearly not good predictors, such as the student ID number and the grade class (including the latter would be considered data leakage).

In [4]:
# Drop the identifier column and the grade-class column (treated as target
# leakage in this notebook).
# errors="ignore" keeps this cell idempotent: because the result is assigned
# back to `df`, a second run of the cell would otherwise raise KeyError for
# columns that are already gone.
df = df.drop(columns=["StudentID", "GradeClass"], errors="ignore")

In [6]:
# Columns treated as categorical; these are the ones the encoding step loops over
categorical_features = [
    "Gender",
    "Ethnicity",
    "ParentalEducation",
    "Tutoring",
    "ParentalSupport",
    "Extracurricular",
    "Sports",
    "Music",
    "Volunteering",
]

# Columns treated as numeric and passed to the model unchanged
continuous_features = [
    "Age",
    "StudyTimeWeekly",
    "Absences",
]

In [7]:
# Encode the categorical columns on a copy so that `df` itself is left untouched.
#   - more than 10 distinct values -> one-hot (dummy) columns replace the original
#   - otherwise                    -> the column is replaced by integer category codes
# NOTE(review): integer codes imply an ordering between categories; confirm this
# is acceptable for nominal columns such as Ethnicity.
# NOTE(review): `.cat.codes` assigns -1 to missing values — verify the data
# contains no NaNs in these columns.
df_encoded = df.copy()
for feature in categorical_features:
    n_levels = df[feature].nunique()
    if n_levels > 10:
        dummies = pd.get_dummies(df_encoded[feature], prefix=feature)
        df_encoded = df_encoded.drop(feature, axis=1).join(dummies)
        continue
    df_encoded[feature] = df_encoded[feature].astype("category").cat.codes

In [8]:
# Target (y) is the GPA column; features (X) are every remaining encoded column
y = df_encoded["GPA"]
X = df_encoded.drop("GPA", axis=1)

## Model Building

In [12]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=15,
)

# Report the (rows, columns) shape of each feature matrix
print(f"Training: {X_train.shape}\nTesting: {X_test.shape}")

Training: (1913, 12)
Testing: (479, 12)


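A random forest model was chosen for a number of reasons:
- it handles a mix of categorical and continuous features well
- it is relatively robust to overfitting, because predictions are averaged over many trees
- it implicitly performs feature selection and reports feature importances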

In [14]:
# Build a 100-tree forest with unrestricted depth and fit it on the training
# split in one expression (`fit` returns the estimator itself).
# The fixed random_state makes the fitted model repeatable.
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    random_state=42,
).fit(X_train, y_train)

# Predict GPA for the held-out rows
y_pred = rf_regressor.predict(X_test)

## Model Evaluation

In [15]:
# Score the held-out predictions against the true GPA values
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Print both metrics rounded to four decimals, one per line
print(
    f"Mean Squared Error: {mse:.4f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)

Mean Squared Error: 0.0609
R² Score: 0.9213


## Feature Importance

In [17]:
# Pair each feature column with the importance the fitted forest assigned to it,
# then rank from most to least important (the original positional index is kept).
importances = rf_regressor.feature_importances_
feature_importance_df = (
    pd.DataFrame({"Feature": X.columns, "Importance": importances})
    .sort_values(by="Importance", ascending=False)
)
feature_importance_df

Unnamed: 0,Feature,Importance
5,Absences,0.855949
4,StudyTimeWeekly,0.057615
7,ParentalSupport,0.033315
6,Tutoring,0.012216
8,Extracurricular,0.008195
9,Sports,0.006854
0,Age,0.006514
3,ParentalEducation,0.006094
2,Ethnicity,0.004878
10,Music,0.003943
