In [3]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold


import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

### Understand the Data

In [4]:
df = pd.read_csv('student_exam_scores.csv')

print(df.info())
print("----------------------------------------------")

print(df.describe())
print("----------------------------------------------")

print(df.isnull().sum())
print("----------------------------------------------")

pearson = df.corr(numeric_only=True)['exam_score'].sort_values(ascending=False)
print("Pearson correlation with exam_score:\n", pearson)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   student_id          200 non-null    object 
 1   hours_studied       200 non-null    float64
 2   sleep_hours         200 non-null    float64
 3   attendance_percent  200 non-null    float64
 4   previous_scores     200 non-null    int64  
 5   exam_score          200 non-null    float64
dtypes: float64(4), int64(1), object(1)
memory usage: 9.5+ KB
None
----------------------------------------------
       hours_studied  sleep_hours  attendance_percent  previous_scores  \
count     200.000000   200.000000          200.000000       200.000000   
mean        6.325500     6.622000           74.830000        66.800000   
std         3.227317     1.497138           14.249905        15.663869   
min         1.000000     4.000000           50.300000        40.000000   
25%         3.500000

### Define the data

In [5]:
# Target 
label = "exam_score"
id_col = ["student_id"]

# Select features (for now)
cols = [c for c in df.columns if c not in id_col + [label]] 
print(cols)

X = df.drop(columns = id_col + [label])
y = df[label].copy()

['hours_studied', 'sleep_hours', 'attendance_percent', 'previous_scores']


### Training Set-up

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

Defining a baseline model for comparison

In [7]:
# --- 1. Linear Regression baseline
lin = LinearRegression()
lin.fit(X_train, y_train)
pred_lin = lin.predict(X_test)

print(f"Linear Regression | "
      f"MAE={mean_absolute_error(y_test, pred_lin):.3f}  "
      f"RMSE={root_mean_squared_error(y_test, pred_lin):.3f}  "
      f"R²={r2_score(y_test, pred_lin):.3f}")

Linear Regression | MAE=2.311  RMSE=2.786  R²=0.854


- MAE = 2.311: On average this model’s predictions are about 2.3 points away from the true exam score
- RMSE = 2.786: The “typical” error magnitude (giving more weight to big misses) is about 2.8 points
- R² = 0.854: 85.4% of the variation in exam scores can be explained by the features (hours studied, sleep, attendance, previous scores, etc.)

Now, let's design our neural network model!

In [8]:
X_train = X_train.copy()
X_test = X_test.copy()

print(X_train.info())

Xtr = X_train.to_numpy(dtype=np.float32)
Xte = X_test.to_numpy(dtype=np.float32)
ytr = y_train.to_numpy(dtype=np.float32)
yte = y_test.to_numpy(dtype=np.float32)

print(Xtr.shape)
print(Xte.shape)

<class 'pandas.core.frame.DataFrame'>
Index: 160 entries, 79 to 102
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   hours_studied       160 non-null    float64
 1   sleep_hours         160 non-null    float64
 2   attendance_percent  160 non-null    float64
 3   previous_scores     160 non-null    int64  
dtypes: float64(3), int64(1)
memory usage: 6.2 KB
None
(160, 4)
(40, 4)


In [9]:
tf.keras.utils.set_random_seed(42)

 **Initial 4-Layer Network**
   - Input layer → 64 neurons → 32 neurons → 16 neurons → Output (linear)
   - Activation: ReLU for hidden layers
   - Test with learning rates 0.1

In [10]:
nn_model = keras.Sequential([
    keras.layers.Input(shape=(Xtr.shape[1],)),    # input size = number of features
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(16, activation="relu"),
    keras.layers.Dense(1, activation="linear")    # regression output
])

nn_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.1),
    loss="mae",
    metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse"), "mae"]
)

history = nn_model.fit(
    Xtr, ytr,
    validation_split=0.2,   # 20% of training data for validation
    epochs=300,       
    batch_size=32,
    verbose=1
)

# Evaluate test metrics
test_loss, test_rmse, test_mae = nn_model.evaluate(Xte, yte, verbose=0)

# Predict test set
pred = nn_model.predict(Xte, verbose=0).ravel()

# Compute R²
r2 = r2_score(yte, pred)

print(f"Test MAE={test_mae:.3f}  RMSE={test_rmse:.3f}  R²={r2:.3f}")

Epoch 1/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - loss: 74.5906 - mae: 74.5906 - rmse: 118.2237 - val_loss: 27.0699 - val_mae: 27.0699 - val_rmse: 27.8194
Epoch 2/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 15.1706 - mae: 15.1706 - rmse: 17.8433 - val_loss: 11.5301 - val_mae: 11.5301 - val_rmse: 12.7015
Epoch 3/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 8.7511 - mae: 8.7511 - rmse: 10.2844 - val_loss: 5.7981 - val_mae: 5.7981 - val_rmse: 7.1905
Epoch 4/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 8.3175 - mae: 8.3175 - rmse: 9.7352 - val_loss: 3.9610 - val_mae: 3.9610 - val_rmse: 4.8802
Epoch 5/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 6.1758 - mae: 6.1758 - rmse: 7.6404 - val_loss: 8.5857 - val_mae: 8.5857 - val_rmse: 9.7647
Epoch 6/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

### Experimenting with hyperparameters

Test with learning rates 0.001


In [11]:
# Try default learning rate
nn_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss="mae",
    metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse"), "mae"]
)

history = nn_model.fit(
    Xtr, ytr,
    validation_split=0.2,  
    epochs=150,          
    batch_size=32,
    verbose=1     
)

# Evaluate test metrics
test_loss, test_rmse, test_mae = nn_model.evaluate(Xte, yte, verbose=0)

# Predict test set
pred = nn_model.predict(Xte, verbose=0).ravel()

# Compute R²
r2 = r2_score(yte, pred)

print(f"Test MAE={test_mae:.3f}  RMSE={test_rmse:.3f}  R²={r2:.3f}")

Epoch 1/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 2.2944 - mae: 2.2944 - rmse: 2.8117 - val_loss: 2.1819 - val_mae: 2.1819 - val_rmse: 2.7680
Epoch 2/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 2.2675 - mae: 2.2675 - rmse: 2.7978 - val_loss: 2.2127 - val_mae: 2.2127 - val_rmse: 2.7674
Epoch 3/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 2.2726 - mae: 2.2726 - rmse: 2.8049 - val_loss: 2.1899 - val_mae: 2.1899 - val_rmse: 2.7665
Epoch 4/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 2.2681 - mae: 2.2681 - rmse: 2.7923 - val_loss: 2.1840 - val_mae: 2.1840 - val_rmse: 2.7708
Epoch 5/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 2.2656 - mae: 2.2656 - rmse: 2.7872 - val_loss: 2.1860 - val_mae: 2.1860 - val_rmse: 2.7709
Epoch 6/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step -

Let's try redesigning the neural network.

   - Input layer → 32 neurons → 16 neurons → Output (linear)
   - Optimizer: Adam with learning rate = 1e-3
   - Loss function: MAE (Mean Absolute Error)
   - Metrics: RMSE and MAE
   - Early stopping with patience=10, restoring best weights

In [12]:
model = keras.Sequential([
    keras.layers.Input(shape=(Xtr.shape[1],)),   
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(16, activation="relu"),
    keras.layers.Dense(1, activation="linear")
])

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss="mae",
    metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse"), "mae"]
)


early_stop = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True
)

hist = model.fit(
    Xtr, ytr,
    validation_split=0.2,
    epochs=150,
    batch_size=16,
    verbose=0
)

# Predict exam scores on unseen test data
pred = model.predict(Xte, verbose=0).ravel()

# Calculate regression metrics
mae  = mean_absolute_error(yte, pred)
rmse = root_mean_squared_error(yte, pred)
r2   = r2_score(yte, pred)

print(f"Neural Network (Scaled) | MAE={mae:.3f}  RMSE={rmse:.3f}  R²={r2:.3f}")

Neural Network (Scaled) | MAE=2.335  RMSE=2.828  R²=0.849


Let's experiment with different learning rates and batch sizes

- Learning rates tested: [1e-2, 1e-3, 1e-4]
- Batch sizes tested: [8, 16, 32, 64]
- Epochs: 150 with early stopping

In [13]:
def build_model(input_dim, lr):
    model = keras.Sequential([
        keras.layers.Input(shape=(input_dim,)),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dense(16, activation="relu"),
        keras.layers.Dense(1, activation="linear")
    ])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss="mae",
        metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse"), "mae"]
    )
    return model

early_stop = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True
)

learning_rates = [1e-2, 1e-3, 1e-4]
batch_sizes = [8, 16, 32, 64]

results = []

for lr in learning_rates:
    for bs in batch_sizes:
        tf.keras.utils.set_random_seed(42)

        model = build_model(Xtr.shape[1], lr)
        hist = model.fit(
            Xtr, ytr,
            validation_split=0.2,
            epochs=150,
            batch_size=bs,
            verbose=0,
            callbacks=[early_stop]
        )

        pred = model.predict(Xte, verbose=0).ravel()
        mae  = mean_absolute_error(yte, pred)
        rmse = root_mean_squared_error(yte, pred)
        r2   = r2_score(yte, pred)

        results.append((lr, bs, mae, rmse, r2))
        print(f"LR={lr:<6} | Batch={bs:<3} | MAE={mae:.3f}  RMSE={rmse:.3f}  R²={r2:.3f}")

# --- summarize best combo ---
best = min(results, key=lambda x: x[2])  # smallest MAE
print("\nBest combination:")
print(f"LR={best[0]} | Batch={best[1]} | MAE={best[2]:.3f}  RMSE={best[3]:.3f}  R²={best[4]:.3f}")

LR=0.01   | Batch=8   | MAE=2.601  RMSE=3.169  R²=0.811
LR=0.01   | Batch=16  | MAE=12.077  RMSE=14.126  R²=-2.760
LR=0.01   | Batch=32  | MAE=6.358  RMSE=7.656  R²=-0.104
LR=0.01   | Batch=64  | MAE=24.765  RMSE=25.804  R²=-11.546
LR=0.001  | Batch=8   | MAE=29.717  RMSE=30.634  R²=-16.683
LR=0.001  | Batch=16  | MAE=39.529  RMSE=40.336  R²=-29.657
LR=0.001  | Batch=32  | MAE=44.469  RMSE=45.260  R²=-37.600
LR=0.001  | Batch=64  | MAE=46.952  RMSE=47.743  R²=-41.950
LR=0.0001 | Batch=8   | MAE=47.465  RMSE=48.257  R²=-42.881
LR=0.0001 | Batch=16  | MAE=48.458  RMSE=49.252  R²=-44.708
LR=0.0001 | Batch=32  | MAE=48.959  RMSE=49.753  R²=-45.643
LR=0.0001 | Batch=64  | MAE=49.213  RMSE=50.008  R²=-46.122

Best combination:
LR=0.01 | Batch=8 | MAE=2.601  RMSE=3.169  R²=0.811


### Normalizing Feature Values

- Let's apply z-score normalization: `(X - μ) / σ`
- Using training set statistics only to prevent data leakage
- Same hyperparameter grid search performed

In [14]:
# --- Standardize using training stats ---
mu = X_train.mean()
sd = X_train.std().replace(0, 1)

Xtr = ((X_train - mu) / sd).to_numpy(dtype=np.float32)
Xte = ((X_test  - mu) / sd).to_numpy(dtype=np.float32)
ytr = y_train.to_numpy(dtype=np.float32)
yte = y_test.to_numpy(dtype=np.float32)

# --- Model builder ---
def build_model(input_dim, lr):
    model = keras.Sequential([
        keras.layers.Input(shape=(input_dim,)),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dense(16, activation="relu"),
        keras.layers.Dense(1, activation="linear")
    ])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss="mae",
        metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse"), "mae"]
    )
    return model

early_stop = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True
)

# --- Grid search parameters ---
learning_rates = [1e-2, 1e-3, 1e-4]
batch_sizes = [8, 16, 32, 64]
results = []

for lr in learning_rates:
    for bs in batch_sizes:
        tf.keras.utils.set_random_seed(42)

        model = build_model(Xtr.shape[1], lr)
        model.fit(
            Xtr, ytr,
            validation_split=0.2,
            epochs=150,
            batch_size=bs,
            verbose=0,
            callbacks=[early_stop]
        )

        pred = model.predict(Xte, verbose=0).ravel()
        mae  = mean_absolute_error(yte, pred)
        rmse = root_mean_squared_error(yte, pred)
        r2   = r2_score(yte, pred)

        results.append((lr, bs, mae, rmse, r2))
        print(f"LR={lr:<6} | Batch={bs:<3} | MAE={mae:.3f}  RMSE={rmse:.3f}  R²={r2:.3f}")

best = min(results, key=lambda x: x[2])  # lowest MAE
print("\nBest combination:")
print(f"LR={best[0]} | Batch={best[1]} | MAE={best[2]:.3f}  RMSE={best[3]:.3f}  R²={best[4]:.3f}")

LR=0.01   | Batch=8   | MAE=3.456  RMSE=3.980  R²=0.701
LR=0.01   | Batch=16  | MAE=32.845  RMSE=33.669  R²=-20.361
LR=0.01   | Batch=32  | MAE=33.806  RMSE=34.599  R²=-21.557
LR=0.01   | Batch=64  | MAE=34.288  RMSE=35.070  R²=-22.175
LR=0.001  | Batch=8   | MAE=34.450  RMSE=35.226  R²=-22.382
LR=0.001  | Batch=16  | MAE=34.620  RMSE=35.390  R²=-22.600
LR=0.001  | Batch=32  | MAE=34.719  RMSE=35.485  R²=-22.727
LR=0.001  | Batch=64  | MAE=34.773  RMSE=35.538  R²=-22.798
LR=0.0001 | Batch=8   | MAE=34.791  RMSE=35.555  R²=-22.820
LR=0.0001 | Batch=16  | MAE=34.809  RMSE=35.572  R²=-22.844
LR=0.0001 | Batch=32  | MAE=34.820  RMSE=35.582  R²=-22.857
LR=0.0001 | Batch=64  | MAE=34.825  RMSE=35.588  R²=-22.864

Best combination:
LR=0.01 | Batch=8 | MAE=3.456  RMSE=3.980  R²=0.701


### Cross-validation

In [15]:
X_np = X.values.astype(np.float32)
y_np = y.values.astype(np.float32)

# Build model
def build_model(input_dim):
    m = keras.Sequential([
        keras.layers.Input(shape=(input_dim,)),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dense(16, activation="relu"),
        keras.layers.Dense(1, activation="linear")
    ])
    m.compile(optimizer=keras.optimizers.Adam(1e-3), loss="mae")
    return m

kf = KFold(n_splits=5, shuffle=True, random_state=42)
maes, rmses, r2s = [], [], []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_np), 1):
    X_tr, X_val = X_np[train_idx], X_np[val_idx]
    y_tr, y_val = y_np[train_idx], y_np[val_idx]

    # Standardize (train stats only)
    mu = X_tr.mean(axis=0, keepdims=True)
    sd = X_tr.std(axis=0, keepdims=True)
    sd[sd == 0] = 1
    X_tr = (X_tr - mu) / sd
    X_val = (X_val - mu) / sd

    # Train
    tf.keras.utils.set_random_seed(42)
    model = build_model(X_tr.shape[1])
    model.fit(X_tr, y_tr, epochs=120, batch_size=16, verbose=0)

    # Predict & metrics
    pred = model.predict(X_val, verbose=0).ravel()
    mae  = mean_absolute_error(y_val, pred)
    rmse = mean_squared_error(y_val, pred, squared=False)
    r2   = r2_score(y_val, pred)

    maes.append(mae); rmses.append(rmse); r2s.append(r2)
    print(f"Fold {fold}: MAE={mae:.3f}, RMSE={rmse:.3f}, R²={r2:.3f}")

# --- Overall averages ---
print("\nAverage 5-Fold Results:")
print(f"MAE={np.mean(maes):.3f} ± {np.std(maes):.3f}")
print(f"RMSE={np.mean(rmses):.3f} ± {np.std(rmses):.3f}")
print(f"R²={np.mean(r2s):.3f} ± {np.std(r2s):.3f}")



Fold 1: MAE=3.129, RMSE=3.709, R²=0.741




Fold 2: MAE=3.421, RMSE=4.203, R²=0.605




Fold 3: MAE=3.204, RMSE=3.857, R²=0.662




Fold 4: MAE=3.301, RMSE=3.888, R²=0.600
Fold 5: MAE=2.792, RMSE=3.390, R²=0.764

Average 5-Fold Results:
MAE=3.169 ± 0.213
RMSE=3.809 ± 0.265
R²=0.674 ± 0.068


