# Modeling
#### ✂️ Train-Test Split (80/20)

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the engineered data
df = pd.read_csv("../data/processed/engineered_data.csv")

# Separate features and target
X = df.drop(columns=["GradeClass"])
y = df["GradeClass"]

# Perform 80/20 train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # stratify keeps class distribution
)

# Confirm the split shapes
print(f"✅ X_train shape: {X_train.shape}")
print(f"✅ X_test shape: {X_test.shape}")
print(f"✅ y_train shape: {y_train.shape}")
print(f"✅ y_test shape: {y_test.shape}")


✅ X_train shape: (1913, 12)
✅ X_test shape: (479, 12)
✅ y_train shape: (1913,)
✅ y_test shape: (479,)


## Evaluation Metrics (Step 7)

## Model Building: Part 1 (Baseline ML Models) (Step 9)

### 1) Logistic Regression

In [4]:
import os
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Ensure artifacts directory exists
os.makedirs("artifacts", exist_ok=True)

# Common function to train, evaluate and save a model
def train_and_evaluate(model, X_train, y_train, X_test, y_test, model_name="lr_model"):
    print(f"🔍 {model_name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, digits=3))
    
    # Save model
    model_path = f"artifacts/{model_name.replace(' ', '_').lower()}.pkl"
    joblib.dump(model, model_path)
    print(f"✅ Model saved to: {model_path}")
    print("-" * 60)

# -------------------- 1. Logistic Regression --------------------
logreg = LogisticRegression(max_iter=3000, random_state=42)
train_and_evaluate(logreg, X_train, y_train, X_test, y_test, "Logistic Regression")


🔍 Logistic Regression
              precision    recall  f1-score   support

           0      0.667     0.400     0.500        15
           1      0.623     0.673     0.647        49
           2      0.649     0.649     0.649        77
           3      0.667     0.723     0.694        83
           4      0.964     0.945     0.954       255

    accuracy                          0.814       479
   macro avg      0.714     0.678     0.689       479
weighted avg      0.818     0.814     0.815       479

✅ Model saved to: artifacts/logistic_regression.pkl
------------------------------------------------------------


### 2) Random Forest

In [36]:
# -------------------- 2. Random Forest --------------------
rf = RandomForestClassifier(n_estimators=100, random_state=42)
train_and_evaluate(rf, X_train, y_train, X_test, y_test, "Random Forest")

🔍 Random Forest
              precision    recall  f1-score   support

           0      0.429     0.200     0.273        15
           1      0.543     0.510     0.526        49
           2      0.580     0.662     0.618        77
           3      0.605     0.590     0.598        83
           4      0.934     0.941     0.938       255

    accuracy                          0.768       479
   macro avg      0.618     0.581     0.590       479
weighted avg      0.764     0.768     0.764       479

------------------------------------------------------------


### 3) XGBoost

In [37]:
# -------------------- 3. XGBoost --------------------
xgb = XGBClassifier(eval_metric="mlogloss", random_state=42)
train_and_evaluate(xgb, X_train, y_train, X_test, y_test, "XGBoost")

🔍 XGBoost
              precision    recall  f1-score   support

           0      0.636     0.467     0.538        15
           1      0.588     0.612     0.600        49
           2      0.608     0.584     0.596        77
           3      0.556     0.663     0.604        83
           4      0.947     0.906     0.926       255

    accuracy                          0.768       479
   macro avg      0.667     0.646     0.653       479
weighted avg      0.778     0.768     0.772       479

------------------------------------------------------------


### 4) CatBoost

In [48]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report

# Initialize the CatBoostClassifier
cat_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=42,
    verbose=100
)

# Fit the model
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test))

# Predict
y_pred_cat = cat_model.predict(X_test)

# Evaluate
print("📊 CatBoost Classifier Performance:")
print(classification_report(y_test, y_pred_cat, digits=3))


ModuleNotFoundError: No module named 'catboost'

## Model Building: Part 2 (Deep Learning Model) (Step 10)
#### 🧠 Deep Learning Model: Neural Network

We will now train a feedforward neural network to classify students into GradeClass labels (A–F), using TensorFlow/Keras.


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# 📊 Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 🧠 Define the model architecture
nn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(5, activation='softmax')  # 5 classes for GradeClass (0 to 4)
])

# ⚙️ Compile the model
nn_model.compile(optimizer='adam',
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'])

# 📈 Train the model
history = nn_model.fit(X_train_scaled, y_train,
                       epochs=50,
                       batch_size=32,
                       validation_split=0.2,
                       verbose=1)

# 🧪 Evaluate on test set
y_pred_nn = nn_model.predict(X_test_scaled)
y_pred_classes = tf.argmax(y_pred_nn, axis=1)

# 📊 Classification report
print("🔍 Neural Network Performance on Test Set:")
print(classification_report(y_test, y_pred_classes.numpy(), digits=3))


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.2332 - loss: 1.6799 - val_accuracy: 0.5248 - val_loss: 1.3099
Epoch 2/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5349 - loss: 1.2915 - val_accuracy: 0.5326 - val_loss: 1.1678
Epoch 3/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5558 - loss: 1.1728 - val_accuracy: 0.5431 - val_loss: 1.0816
Epoch 4/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5653 - loss: 1.1038 - val_accuracy: 0.5640 - val_loss: 1.0084
Epoch 5/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5508 - loss: 1.0821 - val_accuracy: 0.5953 - val_loss: 0.9367
Epoch 6/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5679 - loss: 1.0410 - val_accuracy: 0.6188 - val_loss: 0.8745
Epoch 7/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
