# Modeling
## ✂️ Train-Test Split (80/20)

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the engineered dataset from disk
df = pd.read_csv("../data/processed/engineered_data.csv")

# The target is the GradeClass column; every other column is used as a feature.
# NOTE(review): the Random Forest and XGBoost cells further down report ~0.99
# test accuracy — confirm that none of the remaining columns directly encodes
# GradeClass (possible target leakage). Column list is not visible here.
y = df["GradeClass"]
X = df.drop(columns=["GradeClass"])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# distribution the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the shape of each part of the split
print(f"✅ X_train shape: {X_train.shape}")
print(f"✅ X_test shape: {X_test.shape}")
print(f"✅ y_train shape: {y_train.shape}")
print(f"✅ y_test shape: {y_test.shape}")


✅ X_train shape: (1913, 13)
✅ X_test shape: (479, 13)
✅ y_train shape: (1913,)
✅ y_test shape: (479,)


## Evaluation Metrics (Step 7)

## Model Building: Part 1 (Baseline ML Models) (Step 9)

### 1) Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Shared helper: fit one classifier and print its test-set classification report
def train_and_evaluate(model, X_train, y_train, X_test, y_test, model_name="Model"):
    """Fit ``model`` on the training data and print a report for the test data.

    Prints, in order: a header line containing ``model_name``, the text
    produced by ``classification_report`` (3 decimal digits), and a
    60-character dashed separator.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is called on
        this object directly.
    X_train, y_train
        Passed unchanged to ``model.fit``.
    X_test
        Passed unchanged to ``model.predict``.
    y_test
        True labels compared against the predictions in the report.
    model_name : str, default "Model"
        Label shown in the header line.

    Returns
    -------
    None
        Everything is written to stdout; nothing is returned.
    """
    header = f"🔍 {model_name}"
    print(header)

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    report = classification_report(y_test, predictions, digits=3)
    print(report)

    separator = "-" * 60
    print(separator)

# -------------------- 1. Logistic Regression --------------------
# Baseline linear classifier; the raised iteration cap (3000) and fixed seed
# are passed straight to the estimator.
logreg = LogisticRegression(random_state=42, max_iter=3000)
train_and_evaluate(
    model=logreg,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name="Logistic Regression",
)

🔍 Logistic Regression
              precision    recall  f1-score   support

           0      0.800     0.533     0.640        15
           1      0.741     0.816     0.777        49
           2      0.838     0.805     0.821        77
           3      0.831     0.892     0.860        83
           4      0.984     0.973     0.978       255

    accuracy                          0.902       479
   macro avg      0.839     0.804     0.815       479
weighted avg      0.903     0.902     0.901       479

------------------------------------------------------------


### 2) Random Forest

In [7]:
# -------------------- 2. Random Forest --------------------
# Depends on RandomForestClassifier and train_and_evaluate, both brought into
# scope by the Logistic Regression cell — run that cell first.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
train_and_evaluate(
    model=rf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name="Random Forest",
)

🔍 Random Forest
              precision    recall  f1-score   support

           0      1.000     0.867     0.929        15
           1      0.961     1.000     0.980        49
           2      1.000     0.987     0.993        77
           3      0.988     1.000     0.994        83
           4      1.000     1.000     1.000       255

    accuracy                          0.994       479
   macro avg      0.990     0.971     0.979       479
weighted avg      0.994     0.994     0.994       479

------------------------------------------------------------


### 3) XGBoost

In [10]:
# -------------------- 3. XGBoost --------------------
# Depends on XGBClassifier and train_and_evaluate, both brought into scope by
# the Logistic Regression cell — run that cell first.
xgb = XGBClassifier(random_state=42, eval_metric="mlogloss")
train_and_evaluate(
    model=xgb,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name="XGBoost",
)

🔍 XGBoost
              precision    recall  f1-score   support

           0      1.000     1.000     1.000        15
           1      1.000     0.959     0.979        49
           2      0.975     1.000     0.987        77
           3      1.000     0.988     0.994        83
           4      0.996     1.000     0.998       255

    accuracy                          0.994       479
   macro avg      0.994     0.989     0.992       479
weighted avg      0.994     0.994     0.994       479

------------------------------------------------------------


## Model Building: Part 2 (Deep Learning Model) (Step 10)