# Models Pipeline Notebook

In this notebook, we train two models to predict student exam scores:
1. **Artificial Neural Network (ANN)** using TensorFlow
2. **Ordinary Least Squares (OLS)** linear regression using Statsmodels

The workflow is as follows:
- Load the processed training and test data.
- Train the ANN, visualize training curves, and evaluate.
- Train the OLS model, visualize its predictions, and run regression diagnostics with the `LinearRegDiagnostic` helper.
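- Save plots to both `results/plots` (via the `save_plot` helper) and `docker/images/learningBase`.
- Save trained models to `results/trained_models` (this directory and `docker/images/learningBase` are resolved from `../../` relative to the working directory, not from the filesystem root).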
- Compare performance metrics of both approaches.


## 0. Import Libraries & Setup


In [None]:
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# TensorFlow/Keras for ANN
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Statsmodels for OLS
import statsmodels.api as sm

# Sklearn metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# For saving objects
import pickle

# Reproducibility: seed every RNG the training run can draw from in one call.
# keras.utils.set_random_seed seeds Python's `random`, NumPy and TensorFlow together;
# the earlier tf/np-only seeding left Python's `random` module unseeded.
# NOTE(review): newer Keras releases reportedly derive default initializer seeds from
# Python's `random`, so leaving it unseeded can change the initial weights between
# runs — confirm against the installed version.
keras.utils.set_random_seed(42)

# Input files: processed train/test CSVs live under ../../data/processed (adjust if needed)
data_dir = os.path.abspath(os.path.join('..', '..', 'data', 'processed'))
train_path, test_path = (
    os.path.join(data_dir, csv_name)
    for csv_name in ('training_data.csv', 'test_data.csv')
)

# Output folders: one for serialized models, one (learningBase) that receives
# an extra copy of the figures produced further down in this notebook.
trained_models_path = os.path.abspath(os.path.join('..', '..', 'results', 'trained_models'))
learning_base_path = os.path.abspath(os.path.join('..', '..', 'docker', 'images', 'learningBase'))

# Create both folders up front so later save calls never hit a missing directory
for out_dir in (trained_models_path, learning_base_path):
    os.makedirs(out_dir, exist_ok=True)

# Make the parent directory importable (it is where the `utils` package is imported from);
# the membership check keeps sys.path free of duplicates when the cell is re-run.
code_path = os.path.abspath('..')
if code_path not in sys.path:
    sys.path.insert(0, code_path)

In [None]:
# Import custom plot saver function
from utils.plot_saver import save_plot

# Import custom linear regression diagnostic
from utils.LinearRegDiagnostic import LinearRegDiagnostic

## 1. Load Preprocessed Data


In [None]:
print("Training data path:", train_path)
print("Test data path:    ", test_path)

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print("Training set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

target_col = 'Exam_Score'

# Fail fast when the two CSVs do not carry the same columns. Once converted to
# bare numpy arrays the column names are gone, so a mismatch would otherwise only
# surface later as a shape error — or not at all if merely the order differs.
column_diff = set(train_df.columns) ^ set(test_df.columns)
if column_diff:
    raise ValueError(
        f"Train/test column mismatch: {sorted(map(str, column_diff))}"
    )

# Separate features (X) and target (y).
# The feature order is taken from the training frame and applied to BOTH frames,
# so column i of X_train and column i of X_test always refer to the same feature.
feature_cols = [col for col in train_df.columns if col != target_col]

X_train = train_df[feature_cols].to_numpy()
y_train = train_df[target_col].to_numpy()

X_test = test_df[feature_cols].to_numpy()
y_test = test_df[target_col].to_numpy()

## 2. Artificial Neural Network (ANN)

### 2.1 Define and Compile the Model

In [None]:
# Architecture: one input per feature column, two identically sized ReLU hidden
# layers, and a single linear unit because the target is a continuous value.
n_features = X_train.shape[1]
hidden_units = 64

model = keras.Sequential([
    keras.Input(shape=(n_features,)),
    *(layers.Dense(hidden_units, activation='relu') for _ in range(2)),
    layers.Dense(1),  # single output for regression
])

# Optimize mean squared error; track mean absolute error as an additional metric
model.compile(loss='mse', optimizer='adam', metrics=['mae'])

model.summary()

### 2.2 Train the Model

In [None]:
# Training hyperparameters
epochs = 50
batch_size = 8

# NOTE(review): the test split is passed as validation data, so the val_* curves
# recorded in `history` are test-set metrics rather than a separate validation split.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(X_test, y_test),
)

### 2.3 Visualize Training History

In [None]:
def _plot_history_metric(metric_key, curve_name, ylabel, filename):
    """Plot one training/validation curve pair from `history` and save it twice.

    Parameters
    ----------
    metric_key : str
        Key in ``history.history`` for the training curve; the validation
        curve is read from ``'val_' + metric_key``.
    curve_name : str
        Name used in the legend labels and in the figure title.
    ylabel : str
        Label for the y-axis.
    filename : str
        File name handed to ``save_plot`` and reused for the copy written
        into ``learning_base_path``.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was drawn, saved and shown.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(history.history[metric_key], label=f'Train {curve_name}')
    ax.plot(history.history[f'val_{metric_key}'], label=f'Test {curve_name}')
    ax.set_title(f'ANN Training & Validation {curve_name}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.legend()

    # Save through the custom save_plot helper and also into learningBase
    save_plot(fig, filename)
    fig.savefig(os.path.join(learning_base_path, filename), bbox_inches='tight')

    plt.show()
    return fig


# Loss (MSE) per epoch
fig_loss = _plot_history_metric('loss', 'Loss', 'MSE', 'ann_loss_curve.png')

# Mean absolute error per epoch
fig_mae = _plot_history_metric('mae', 'MAE', 'MAE', 'ann_mae_curve.png')

### 2.4 Evaluate the ANN on the Test Set

In [None]:
# Predict on the test features and collapse the result to 1-D so it lines up
# with the 1-D y_test vector expected by the metric functions.
ann_raw_pred = model.predict(X_test)
y_pred_ANN = ann_raw_pred.flatten()

# Same three regression metrics, computed in a fixed order: MSE, MAE, R^2
mse_ANN, mae_ANN, r2_ANN = (
    metric(y_test, y_pred_ANN)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("=== ANN Performance on Test Set ===")
for label, score in (("MSE :", mse_ANN), ("MAE :", mae_ANN), ("R^2 :", r2_ANN)):
    print(label, score)

### 2.5 Scatter Plot of Predictions vs Actual

In [None]:
fig_scatter_ann, ax_scatter_ann = plt.subplots(figsize=(6, 6))
ax_scatter_ann.scatter(y_test, y_pred_ANN, alpha=0.6)

# Red identity line across the observed score range: a point on this line
# is a prediction that equals the actual value.
score_span = [min(y_test), max(y_test)]
ax_scatter_ann.plot(score_span, score_span, color='red')

ax_scatter_ann.set_xlabel('Actual Exam Score')
ax_scatter_ann.set_ylabel('Predicted Exam Score')
ax_scatter_ann.set_title('ANN Predictions vs. Actual')

# Save via the custom helper and write a second copy into learningBase
scatter_ann_name = 'ann_predictions_scatter.png'
save_plot(fig_scatter_ann, scatter_ann_name)
fig_scatter_ann.savefig(os.path.join(learning_base_path, scatter_ann_name), bbox_inches='tight')
plt.show()

### 2.6 Residual Distribution Plot

In [None]:
# Residual = actual minus predicted, so positive values are under-predictions
residuals_ANN = y_test - y_pred_ANN

fig_resid_ann, ax_resid_ann = plt.subplots(figsize=(6, 4))
sns.histplot(residuals_ANN, kde=True, ax=ax_resid_ann)
ax_resid_ann.set_title('ANN Residual Distribution')
ax_resid_ann.set_xlabel('Residual (Actual - Predicted)')

# Save via the custom helper and write a second copy into learningBase
resid_ann_name = 'ann_residual_distribution.png'
save_plot(fig_resid_ann, resid_ann_name)
fig_resid_ann.savefig(os.path.join(learning_base_path, resid_ann_name), bbox_inches='tight')
plt.show()

### 2.7 Save the Trained ANN Model

In [None]:
# Resolve both output files first, then write each artifact in turn
ann_model_path = os.path.join(trained_models_path, 'currentAiSolution.keras')
history_csv_path = os.path.join(trained_models_path, 'ann_training_metrics.csv')

# Persist the trained network
model.save(ann_model_path)
print(f"ANN model saved at: {ann_model_path}")

# Per-epoch metrics recorded by fit(): one row per epoch, one column per metric
history_df = pd.DataFrame(data=history.history)
history_df.to_csv(history_csv_path, index=False)
print(f"ANN training metrics saved to {history_csv_path}")

---
## 3. Ordinary Least Squares (OLS) with Statsmodels


In [None]:
# Prepend an intercept column to both design matrices.
# has_constant='add' forces the column to be added unconditionally. With the
# default ('skip'), statsmodels leaves the matrix untouched whenever some feature
# is constant within that split, which can give the train and test matrices
# different widths and make predict() fail on the test set.
X_train_ols = sm.add_constant(X_train, has_constant='add')
X_test_ols = sm.add_constant(X_test, has_constant='add')

# Fit ordinary least squares on the training split and show the full summary table
ols_model = sm.OLS(y_train, X_train_ols).fit()
print(ols_model.summary())

# Evaluate on the test split with the same metrics used for the ANN
y_pred_OLS = ols_model.predict(X_test_ols)

mse_OLS = mean_squared_error(y_test, y_pred_OLS)
mae_OLS = mean_absolute_error(y_test, y_pred_OLS)
r2_OLS = r2_score(y_test, y_pred_OLS)

print("\n=== OLS Performance on Test Set ===")
print("MSE :", mse_OLS)
print("MAE :", mae_OLS)
print("R^2 :", r2_OLS)

### 3.1 Scatter Plot of OLS Predictions vs Actual

In [None]:
fig_scatter_ols, ax_scatter_ols = plt.subplots(figsize=(6, 6))
ax_scatter_ols.scatter(y_test, y_pred_OLS, alpha=0.6)

# Red identity line across the observed score range: a point on this line
# is a prediction that equals the actual value.
ols_score_span = [min(y_test), max(y_test)]
ax_scatter_ols.plot(ols_score_span, ols_score_span, color='red')

ax_scatter_ols.set_xlabel('Actual Exam Score')
ax_scatter_ols.set_ylabel('Predicted Exam Score')
ax_scatter_ols.set_title('OLS Predictions vs. Actual')

# Save via the custom helper and write a second copy into learningBase
scatter_ols_name = 'ols_predictions_scatter.png'
save_plot(fig_scatter_ols, scatter_ols_name)
fig_scatter_ols.savefig(os.path.join(learning_base_path, scatter_ols_name), bbox_inches='tight')
plt.show()

### 3.2 Linear Regression Diagnostics Using `LinearRegDiagnostic`


In [None]:
# Regression diagnostics for the fitted OLS model, exported as a single PDF.
# LinearRegDiagnostic is a project helper (utils.LinearRegDiagnostic); its internals
# are not visible here, so the notes below describe only how this cell uses it.

# Create an instance of the diagnostic class with the fitted OLS model
diag = LinearRegDiagnostic(ols_model)

# Calling the instance returns three values, unpacked here as a VIF table, the figure
# and its axes. NOTE(review): per the original author's note the figure is a 2x2 grid
# of diagnostic plots — confirm against the helper. `vif_table` and `ax` are not used
# again in this cell.
vif_table, fig_diagnostics, ax = diag()

# Save the diagnostic figure to learningBase as a PDF (this cell does not call save_plot,
# unlike the other plotting cells)
diag_pdf_path = os.path.join(learning_base_path, 'OLS_DiagnosticPlots.pdf')
fig_diagnostics.savefig(diag_pdf_path, format='pdf', bbox_inches='tight')
print(f"Diagnostic plots saved to PDF: {diag_pdf_path}")

plt.show()

### 3.3 Save the OLS Model


In [None]:
# Resolve both output files first, then write each artifact in turn
ols_model_path = os.path.join(trained_models_path, 'currentOlsSolution.pkl')
ols_summary_path = os.path.join(trained_models_path, 'ols_model_summary.txt')

# Serialize the fitted results object with pickle
with open(ols_model_path, 'wb') as pkl_file:
    pickle.dump(ols_model, pkl_file)

print(f"OLS model saved at: {ols_model_path}")

# Write the plain-text rendering of the regression summary next to the model
summary_text = str(ols_model.summary())
with open(ols_summary_path, 'w') as txt_file:
    txt_file.write(summary_text)

print(f"OLS summary saved at: {ols_summary_path}")

---
## 4. Compare Model Performance


In [None]:
# One row per model, holding the test-set metrics computed in the evaluation cells
comparison_rows = [
    ("ANN", mse_ANN, mae_ANN, r2_ANN),
    ("OLS", mse_OLS, mae_OLS, r2_OLS),
]
comparison_df = pd.DataFrame(comparison_rows, columns=["Model", "MSE", "MAE", "R^2"])

print("\n=== Model Performance Comparison ===")
print(comparison_df)