<a href="https://colab.research.google.com/github/jcsmcmendes/Step_Class/blob/main/Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load regression dataset
df = pd.read_excel("student_datasets_balanced.xlsx", sheet_name="regression")
X = df[['attendance', 'assignments_completed', 'participation']]
y = df['final_grade']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define regression models
models = {
    'Linear Regression': LinearRegression(),
    'K-Nearest Neighbors': KNeighborsRegressor(n_neighbors=5),
    'Decision Tree': DecisionTreeRegressor(max_depth=4, random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=6, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, max_depth=4, random_state=42)
}

In [3]:
# K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
results_kfold = {name: [] for name in models}

for name, model in models.items():
    for train_idx, val_idx in kf.split(X_scaled):
        X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        mse = mean_squared_error(y_val, preds)
        results_kfold[name].append(mse)

# Display K-Fold results
results_df_kfold = pd.DataFrame(results_kfold)
results_df_kfold.loc['Mean'] = results_df_kfold.mean()
results_df_kfold.loc['Std'] = results_df_kfold.std()
print("\n📊 Mean Squared Error per fold (K-Fold):")
print(results_df_kfold.round(2))


📊 Mean Squared Error per fold (K-Fold):
      Linear Regression  K-Nearest Neighbors  Decision Tree  Random Forest  \
0                  6.94                17.47          40.76          18.38   
1                 15.91                21.99          56.35          29.86   
2                  9.92                21.53          37.52          19.50   
3                 10.65                19.43          48.18          20.14   
4                  9.32                 9.26          55.58          11.60   
Mean              10.55                17.93          47.68          19.89   
Std                2.96                 4.63           7.60           5.84   

      XGBoost  
0       21.26  
1       27.47  
2       17.57  
3       25.38  
4       13.40  
Mean    21.02  
Std      5.11  


In [4]:
# Repeated Holdout Validation
n_repeats = 5
results_holdout = {name: [] for name in models}

for seed in range(n_repeats):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    for name, model in models.items():
        model.fit(X_train_scaled, y_train)
        preds = model.predict(X_test_scaled)
        mse = mean_squared_error(y_test, preds)
        results_holdout[name].append(mse)

# Display Repeated Holdout results
results_df_holdout = pd.DataFrame(results_holdout)
results_df_holdout.loc['Mean'] = results_df_holdout.mean()
results_df_holdout.loc['Std'] = results_df_holdout.std()
print("\n📊 Mean Squared Error per repetition (Repeated Holdout):")
print(results_df_holdout.round(2))



📊 Mean Squared Error per repetition (Repeated Holdout):
      Linear Regression  K-Nearest Neighbors  Decision Tree  Random Forest  \
0                  7.81                14.59          49.53          18.52   
1                 11.86                17.28          47.16          26.24   
2                 10.09                22.73          59.80          24.15   
3                 11.46                24.31          63.67          25.01   
4                  6.64                 8.67          34.44          12.68   
Mean               9.57                17.52          50.92          21.32   
Std                2.04                 5.66          10.29           5.06   

      XGBoost  
0       16.67  
1       25.03  
2       22.83  
3       23.87  
4       14.24  
Mean    20.53  
Std      4.27  
