In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# 🔁 Reusable evaluation function
def evaluate_classification_model(y_true, y_pred, model_name="Model"):
    """Print a plain-text quality summary for one classifier's predictions.

    Emits a header naming the model, then accuracy together with the
    weighted-average precision, recall and F1 (four decimals each), and
    finally scikit-learn's per-class classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, passed to the metric functions alongside
            ``y_true``.
        model_name: Text inserted into the header line.

    Returns:
        None. Everything is written to stdout.
    """
    print(f"Evaluation Metrics for {model_name}")
    print("-" * 50)

    # Precision, recall and F1 all receive the same averaging options.
    weighted = dict(average='weighted', zero_division=0)

    # All four scores are computed up front, before any of them is printed.
    # Labels are pre-padded so the colons line up in the output.
    scores = (
        ("Accuracy     ", accuracy_score(y_true, y_pred)),
        ("Precision    ", precision_score(y_true, y_pred, **weighted)),
        ("Recall       ", recall_score(y_true, y_pred, **weighted)),
        ("F1 Score     ", f1_score(y_true, y_pred, **weighted)),
    )
    for label, value in scores:
        print(f"{label}: {value:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

# Load the engineered feature table (path is relative to the working directory)
df = pd.read_csv('../data/processed/engineered_data.csv')

# GradeClass is the multi-class target; every other column becomes a feature.
# NOTE(review): confirm the CSV holds no identifier or target-derived columns,
# because everything except GradeClass is fed to the model here.
y = df['GradeClass']
X = df.drop(columns=['GradeClass'])

# Hold out 20% for testing; stratify on the target so both splits keep the
# same class distribution. The fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Fit the scaler on the training split only, then reuse that fitted transform
# on the test split so no test-set statistics enter the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train logistic regression for multi-class classification
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)

# Predict on the held-out split
y_pred = log_reg.predict(X_test_scaled)

# Evaluate
evaluate_classification_model(y_true=y_test, y_pred=y_pred, model_name="Logistic Regression (Raw)")

# Feature importance: the fitted coefficients, labelled with feature names as
# columns and with the matching class label attached to each row. The model
# was fitted on the scaled features, so the coefficients are in scaled units.
coeffs = pd.DataFrame(log_reg.coef_, columns=X.columns)
coeffs['Class'] = log_reg.classes_

# Melt to long form (Class, Feature, Coefficient) and rank by absolute size
coeff_melted = coeffs.melt(id_vars='Class', var_name='Feature', value_name='Coefficient')
coeff_sorted = coeff_melted.sort_values(by='Coefficient', key=np.abs, ascending=False)

# Print the heading and the table separately. Passing both to a single print()
# inserts the default separator space after the newline, which pushes the
# table's header row one column out of line with its data rows.
print("Top Feature Coefficients across classes:")
print(coeff_sorted.head(12))


Evaluation Metrics for Logistic Regression (Raw)
--------------------------------------------------
Accuracy     : 0.8225
Precision    : 0.8276
Recall       : 0.8225
F1 Score     : 0.8223

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.40      0.55        15
           1       0.67      0.69      0.68        49
           2       0.65      0.69      0.67        77
           3       0.67      0.71      0.69        83
           4       0.96      0.95      0.95       255

    accuracy                           0.82       479
   macro avg       0.76      0.69      0.71       479
weighted avg       0.83      0.82      0.82       479

Top Feature Coefficients across classes:
     Class          Feature  Coefficient
29      4         Absences     9.163548
25      0         Absences    -7.319748
26      1         Absences    -4.862214
28      3         Absences     3.717651
39      4  ParentalSupport    -1.854927
24      4  StudyT