# Student Exam Score Prediction

* Sandikha Rahardi (Kuldii Project)
* https://kuldiiproject.com

This notebook builds a machine learning solution to predict students' exam scores (math, reading, writing) from their demographic and background features: gender, race/ethnicity, parental level of education, lunch type, and test preparation course. The workflow includes EDA, preprocessing, model training, hyperparameter tuning, model comparison, and a Gradio interface for interactive predictions.

In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
import sys
import shutil

import numpy as np
import gradio as gr
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## 2. Load Dataset

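Download the dataset from [Kaggle](https://www.kaggle.com/datasets/spscientist/students-performance-in-exams/data) and place `StudentsPerformance.csv` in the `datasets/` folder of this project directory, which is where the notebook reads it from. Alternatively, the next cell can download it through the Kaggle API; this requires a `kaggle.json` credentials file either in `~/.kaggle/` or next to this notebook.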

In [None]:
# Load the dataset using Kaggle API if not present
csv_file = 'StudentsPerformance.csv'
kaggle_dataset = 'spscientist/students-performance-in-exams'

if not os.path.exists(csv_file):
    # Install kaggle if needed
    try:
        import kaggle
    except ImportError:
        !{sys.executable} -m pip install kaggle
    # Set up Kaggle API credentials
    kaggle_json_path = os.path.expanduser('~/.kaggle/kaggle.json')
    if not os.path.exists(kaggle_json_path):
        shutil.copy('kaggle.json', kaggle_json_path)
    os.chmod(kaggle_json_path, 0o600)
    # Download dataset
    !kaggle datasets download -d {kaggle_dataset} --unzip -p datasets
    print(f"Extracted {csv_file}")
else:
    print(f"{csv_file} already exists.")

In [None]:
# Read the downloaded CSV from the datasets folder and show the frame.
df = pd.read_csv(f'datasets/{csv_file}')
df

## 3. Exploratory Data Analysis (EDA)

Let's explore the dataset: view the first and last rows, shape, info, summary statistics, and check for missing values.

In [None]:
# Basic EDA: preview both ends of the frame, then report its shape, schema,
# summary statistics and per-column missing-value counts.
for caption, preview in (('First 5 rows:', df.head()), ('Last 5 rows:', df.tail())):
    print(caption)
    display(preview)

print(f'Shape: {df.shape}')

print('\nInfo:')
df.info()

for caption, table in (
    ('\nDescribe:', df.describe()),
    ('\nMissing values:', df.isnull().sum()),
):
    print(caption)
    display(table)

In [None]:
# Analyze categorical variables: print the frequency of each level and draw
# a count plot per feature.
categorical_features = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]

for feature in categorical_features:
    level_counts = df[feature].value_counts()
    print(f'\n{feature} unique values:')
    print(level_counts)

    sns.countplot(x=feature, data=df)
    plt.title(f'Distribution of {feature}')
    plt.xticks(rotation=45)  # tilt the tick labels so long category names stay readable
    plt.show()

In [None]:
# Analyze target variables: for each score draw a histogram (with KDE) and a
# boxplot side by side.
score_columns = ['math score', 'reading score', 'writing score']

for col in score_columns:
    fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

    sns.histplot(df[col], kde=True, color='skyblue', ax=hist_ax)
    hist_ax.set_title(f'Histogram of {col}')

    sns.boxplot(x=df[col], color='lightgreen', ax=box_ax)
    box_ax.set_title(f'Boxplot of {col}')

    fig.tight_layout()
    plt.show()

In [None]:
# Pairwise relationships between the three scores, with regression lines off
# the diagonal and KDE curves on it.
scores = df[score_columns]
sns.pairplot(scores, kind='reg', diag_kind='kde')
plt.suptitle('Pairplot of Target Variables', y=1.02)  # y > 1 lifts the title above the grid
plt.show()

# Correlation heatmap of the same three scores, annotated to two decimals.
corr = scores.corr()
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap of Target Variables')
plt.show()

In [None]:
# Analyze scores vs categorical variables. For every categorical feature:
#   1. show the table of mean scores per level,
#   2. draw those means as a grouped bar chart,
#   3. draw one boxplot per score to show the spread within each level.
for cat in categorical_features:
    group_means = df.groupby(cat)[score_columns].mean()
    print(f'\nAverage scores grouped by {cat}:')
    display(group_means)

    # Reshape to long form (one row per level/score pair) so seaborn can
    # colour the bars by score type.
    df_grouped = group_means.reset_index()
    df_grouped_melt = df_grouped.melt(
        id_vars=cat,
        var_name='Score Type',
        value_name='Average Score',
    )

    plt.figure(figsize=(10, 5))
    sns.barplot(x=cat, y='Average Score', hue='Score Type', data=df_grouped_melt)
    plt.title(f'Average Scores by {cat}')
    plt.xticks(rotation=45)
    plt.show()

    for score in score_columns:
        plt.figure(figsize=(8, 4))
        sns.boxplot(x=cat, y=score, data=df)
        plt.title(f'{score} by {cat}')
        plt.xticks(rotation=45)
        plt.show()

In [None]:
# Identify outliers using z-score: a row is flagged when any of its three
# scores lies more than 3 standard deviations from that score's mean.
from scipy.stats import zscore

z_scores = np.abs(zscore(df[score_columns]))
exceeds_threshold = z_scores > 3
outliers = exceeds_threshold.any(axis=1)

n_outliers = outliers.sum()
print(f'Number of potential outliers: {n_outliers}')
display(df[outliers])

## 4. Data Preprocessing

Encode categorical variables, separate features and targets, and split the data into training and test sets.

In [None]:
# Features (categorical descriptors) and targets (the three exam scores)
X = df[categorical_features]
y = df[score_columns]

# One-hot encode the categorical features. drop='first' removes one level
# per feature so the resulting dummy columns are not perfectly collinear.
ohe = OneHotEncoder(sparse_output=False, drop='first')
X_encoded = ohe.fit_transform(X)
X_encoded_df = pd.DataFrame(
    X_encoded,
    columns=ohe.get_feature_names_out(categorical_features),
    index=X.index,  # keep rows aligned with y
)

# Train-test split BEFORE scaling, so that the scaler's mean/variance are
# estimated from training rows only and no test-set statistics leak into
# the preprocessing step.
X_train_encoded, X_test_encoded, y_train, y_test = train_test_split(
    X_encoded_df, y, test_size=0.2, random_state=42
)

# Scale features (not strictly necessary for tree-based models): fit on the
# training split, then apply the same transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_encoded)
X_test = scaler.transform(X_test_encoded)

# Scaled version of the full feature matrix, kept under its original name
# for any cell that still refers to it.
X_scaled = scaler.transform(X_encoded_df)

print(f'Training set: {X_train.shape}, Test set: {X_test.shape}')

## 5. Model Training

Train and compare LinearRegression, RandomForestRegressor, and GradientBoostingRegressor using MultiOutputRegressor for multi-target regression.

In [None]:
# Train and evaluate models
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Args:
        model: A fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True target values matching ``X_test``.

    Returns:
        dict: ``'RMSE'`` (square root of the mean squared error), ``'MAE'``
        and ``'R2'``, each computed by the corresponding scikit-learn metric
        with its default settings.
    """
    predictions = model.predict(X_test)
    return {
        'RMSE': np.sqrt(mean_squared_error(y_test, predictions)),
        'MAE': mean_absolute_error(y_test, predictions),
        'R2': r2_score(y_test, predictions),
    }

# Baseline regressors. Each is wrapped in MultiOutputRegressor and keyed by
# the class name of the underlying estimator.
base_estimators = [
    LinearRegression(),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
]
models = {
    type(estimator).__name__: MultiOutputRegressor(estimator)
    for estimator in base_estimators
}

# Fit every baseline on the training split and record its test-set metrics.
results = {}
for label, regressor in models.items():
    regressor.fit(X_train, y_train)
    metrics = evaluate_model(regressor, X_test, y_test)
    results[label] = metrics
    print(f'{label} evaluation:', metrics)

## 6. Hyperparameter Tuning

Tune RandomForestRegressor and GradientBoostingRegressor using GridSearchCV. Evaluate using RMSE, MAE, and R².

In [None]:
# Settings shared by both grid searches: 3-fold cross-validation scored by
# negative MSE, using all available cores.
search_options = dict(cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Hyperparameter tuning for RandomForestRegressor.
# The 'estimator__' prefix routes each parameter to the regressor wrapped
# inside MultiOutputRegressor.
rf_param_grid = {
    'estimator__n_estimators': [50, 100],
    'estimator__max_depth': [None, 5, 10],
    'estimator__min_samples_split': [2, 5],
}
rf_grid = GridSearchCV(
    MultiOutputRegressor(RandomForestRegressor(random_state=42)),
    rf_param_grid,
    **search_options,
)
rf_grid.fit(X_train, y_train)
print('Best RandomForestRegressor params:', rf_grid.best_params_)
rf_best = rf_grid.best_estimator_
rf_eval = evaluate_model(rf_best, X_test, y_test)
print('RandomForestRegressor (tuned) evaluation:', rf_eval)

# Hyperparameter tuning for GradientBoostingRegressor
gb_param_grid = {
    'estimator__n_estimators': [50, 100],
    'estimator__learning_rate': [0.05, 0.1],
    'estimator__max_depth': [3, 5],
}
gb_grid = GridSearchCV(
    MultiOutputRegressor(GradientBoostingRegressor(random_state=42)),
    gb_param_grid,
    **search_options,
)
gb_grid.fit(X_train, y_train)
print('Best GradientBoostingRegressor params:', gb_grid.best_params_)
gb_best = gb_grid.best_estimator_
gb_eval = evaluate_model(gb_best, X_test, y_test)
print('GradientBoostingRegressor (tuned) evaluation:', gb_eval)

# Baseline results plus the two tuned models; `results` itself is left
# untouched.
tuned_results = {
    **results,
    'RandomForestRegressor (tuned)': rf_eval,
    'GradientBoostingRegressor (tuned)': gb_eval,
}

## 7. Model Comparison

Compare the performance of all models using RMSE, MAE, and R² metrics.

In [None]:
# Model comparison table: one row per model, metric columns in a fixed
# order, displayed from lowest to highest RMSE.
metric_columns = ['RMSE', 'MAE', 'R2']
comparison_df = pd.DataFrame(tuned_results).transpose().loc[:, metric_columns]
display(comparison_df.sort_values(by='RMSE'))

## 8. Select Best Model

Refit the best-performing model on the entire training set for final use.

In [None]:
# Select the model with the lowest RMSE in the comparison table.
best_model_name = comparison_df['RMSE'].idxmin()
print(f'Best model: {best_model_name}')

# Tuned entries map to the grid-search winners; any other name is looked up
# among the baseline models.
tuned_candidates = {
    'RandomForestRegressor (tuned)': rf_best,
    'GradientBoostingRegressor (tuned)': gb_best,
}
final_model = (
    tuned_candidates[best_model_name]
    if best_model_name in tuned_candidates
    else models.get(best_model_name, None)
)

# Refit on all training data
final_model.fit(X_train, y_train)

## 9. Feature Importance

Plot feature importances for tree-based models (RandomForestRegressor, GradientBoostingRegressor) if applicable.

In [None]:
# Plot feature importances when the selected model's per-target estimators
# expose them (tree-based models); otherwise just say so.
first_estimator = final_model.estimators_[0]
if not hasattr(first_estimator, 'feature_importances_'):
    print('Feature importances not available for this model.')
else:
    # One importance vector per target estimator; average them so each
    # encoded feature gets a single value.
    per_target = np.stack([est.feature_importances_ for est in final_model.estimators_])
    importances = per_target.mean(axis=0)

    feature_names = ohe.get_feature_names_out(categorical_features)
    feat_imp_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': importances}
    ).sort_values('Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feat_imp_df)
    plt.title('Feature Importances')
    plt.show()

## 10. Gradio Interface

Create a Gradio UI to select a model and input features for prediction.

In [None]:
# Gradio interface for predictions
# Models offered in the UI, keyed by the name shown in the dropdown: the
# baseline linear model plus the two grid-search winners.
model_dict = dict(
    LinearRegression=models['LinearRegression'],
    RandomForestRegressor=rf_best,
    GradientBoostingRegressor=gb_best,
)

def preprocess_input(gender, race, parental, lunch, prep):
    """Turn one student's raw categorical answers into a model-ready row.

    The values are placed in a single-row DataFrame using the training column
    names, one-hot encoded with the fitted ``ohe``, and scaled with the
    fitted ``scaler``.

    Args:
        gender: Value for the 'gender' column.
        race: Value for the 'race/ethnicity' column.
        parental: Value for the 'parental level of education' column.
        lunch: Value for the 'lunch' column.
        prep: Value for the 'test preparation course' column.

    Returns:
        The scaled feature row produced by ``scaler.transform`` (one row).
    """
    input_df = pd.DataFrame({
        'gender': [gender],
        'race/ethnicity': [race],
        'parental level of education': [parental],
        'lunch': [lunch],
        'test preparation course': [prep]
    })
    # The scaler was fitted on a DataFrame with named columns, so hand it a
    # frame with the same names rather than a bare array; otherwise
    # scikit-learn's feature-name consistency check complains at transform
    # time.
    input_encoded = pd.DataFrame(
        ohe.transform(input_df),
        columns=ohe.get_feature_names_out(categorical_features),
    )
    input_scaled = scaler.transform(input_encoded)
    return input_scaled

def predict_scores(model_name, gender, race, parental, lunch, prep):
    """Predict math, reading and writing scores for one student.

    Args:
        model_name: Key into ``model_dict`` selecting which model to use.
        gender, race, parental, lunch, prep: Categorical answers forwarded
            to ``preprocess_input``.

    Returns:
        tuple: Three floats rounded to two decimals, in the order
        (math, reading, writing).

    Raises:
        gr.Error: If any field was left unselected or the model name is not
            in ``model_dict``; Gradio shows the message in the UI.
    """
    # The dropdowns have no default value, so an untouched field arrives
    # empty. Report which ones are missing instead of failing later with an
    # opaque lookup or encoder error.
    selections = {
        'Model': model_name,
        'Gender': gender,
        'Race/Ethnicity': race,
        'Parental Level of Education': parental,
        'Lunch': lunch,
        'Test Preparation Course': prep,
    }
    missing = [label for label, value in selections.items() if not value]
    if missing:
        raise gr.Error('Please select a value for: ' + ', '.join(missing))
    if model_name not in model_dict:
        raise gr.Error(f'Unknown model: {model_name}')

    model = model_dict[model_name]
    X_input = preprocess_input(gender, race, parental, lunch, prep)
    pred = model.predict(X_input)[0]
    # Convert NumPy scalars to plain floats before rounding so the output
    # widgets receive ordinary Python numbers.
    return round(float(pred[0]), 2), round(float(pred[1]), 2), round(float(pred[2]), 2)

# Dropdown choices: the sorted distinct values of each categorical column.
options = {}
for col in categorical_features:
    options[col] = sorted(df[col].unique())

# Column name -> label shown in the UI, in the same order as the feature
# arguments of predict_scores.
feature_labels = {
    'gender': 'Gender',
    'race/ethnicity': 'Race/Ethnicity',
    'parental level of education': 'Parental Level of Education',
    'lunch': 'Lunch',
    'test preparation course': 'Test Preparation Course',
}

# The model selector comes first, then one dropdown per feature.
gradio_inputs = [gr.Dropdown(choices=list(model_dict), label='Model')]
gradio_inputs += [
    gr.Dropdown(choices=options[col], label=label)
    for col, label in feature_labels.items()
]

# One numeric output per predicted score.
gradio_outputs = [
    gr.Number(label=f'Predicted {subject} Score')
    for subject in ('Math', 'Reading', 'Writing')
]

demo = gr.Interface(
    fn=predict_scores,
    inputs=gradio_inputs,
    outputs=gradio_outputs,
    title="Student Exam Score Prediction",
    description="Select a model and input student features to predict math, reading, and writing scores.",
)

demo.launch()
