# Feature Importance Analysis

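This notebook analyzes feature importance for the Student Performance Analysis project.
It trains one Random Forest classifier per scenario — with the period grades G1 and G2 (mid-term prediction) and without them (early intervention) — then ranks and compares the features each model relies on most to predict `pass_fail`.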

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
import warnings

# Silence every warning category so library notices do not clutter cell output.
# NOTE(review): this also hides deprecation and convergence warnings; consider
# narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

# Load preprocessed data
# Unpickling can execute arbitrary code, so this file must only ever be one
# produced by this project's own preprocessing step.
DATASET_PATH = "../data/processed/cleaned_dataset.pkl"
df = pd.read_pickle(DATASET_PATH)

# Report the frame's dimensions and column names as a quick sanity check.
column_names = df.columns.tolist()
print("Dataset shape:", df.shape)
print("Features:", column_names)

Dataset shape: (649, 36)
Features: ['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3', 'pass_fail', 'attendance_proxy', 'grade_average']


In [None]:
# Prepare feature sets for both scenarios


def encode_categoricals(features):
    """Return a copy of ``features`` in which every non-numeric column is integer-coded.

    scikit-learn estimators need purely numeric input. Passing the raw frame to
    ``RandomForestClassifier.fit`` fails with
    ``ValueError: Cannot cast object dtype to float32`` because some columns hold
    string categories. Each such column (``object`` or ``category`` dtype) is
    replaced by its pandas category codes; missing values become ``-1``.

    Integer codes are used instead of one-hot encoding so that every feature keeps
    its original column name, which the importance tables and the description
    lookup later in the notebook rely on. Tree-based models split on thresholds,
    so the arbitrary code order is acceptable here.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns, possibly mixing numeric and categorical dtypes.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same shape, index and column names as ``features``.
        The input frame is not modified.
    """
    encoded = features.copy()
    for column in encoded.columns:
        if not pd.api.types.is_numeric_dtype(encoded[column]):
            # astype("category") is a no-op for columns that are already categorical.
            encoded[column] = encoded[column].astype("category").cat.codes
    return encoded


# WITH G1 & G2 (Mid-term prediction)
# NOTE(review): `grade_average` is kept here but dropped from the early-intervention
# set together with the grades; if it is computed from G3 it leaks the target into
# this scenario -- confirm against the preprocessing notebook.
X_with_grades = encode_categoricals(df.drop(["pass_fail", "G3"], axis=1))
# WITHOUT G1 & G2 (Early intervention)
X_without_grades = encode_categoricals(
    df.drop(["pass_fail", "G1", "G2", "G3", "grade_average"], axis=1)
)

y = df["pass_fail"]

print(f"Features WITH G1&G2: {X_with_grades.shape[1]}")
print(f"Features WITHOUT G1&G2: {X_without_grades.shape[1]}")
print(f"Target distribution: {y.value_counts()}")

Features WITH G1&G2: 34
Features WITHOUT G1&G2: 31
Target distribution: pass_fail
1    549
0    100
Name: count, dtype: int64


In [None]:
# Feature Importance Analysis - WITH G1 & G2
banner_with = "=" * 60
print(banner_with)
print("FEATURE IMPORTANCE ANALYSIS - WITH G1 & G2")
print(banner_with)

# Hold out 20% of the rows for evaluation, stratified on the target so the
# pass/fail ratio is the same in both partitions.
X_train_with, X_test_with, y_train, y_test = train_test_split(
    X_with_grades,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit a 100-tree forest with a fixed seed.
rf_with = RandomForestClassifier(n_estimators=100, random_state=42)
rf_with.fit(X_train_with, y_train)

# Pair every column with the importance reported by the fitted forest,
# most important first.
importance_table_with = pd.DataFrame(
    {"feature": X_with_grades.columns, "importance": rf_with.feature_importances_}
)
feature_importance_with = importance_table_with.sort_values(
    "importance", ascending=False
)

# Score the held-out split.
y_pred_with = rf_with.predict(X_test_with)
f1_with = f1_score(y_test, y_pred_with)
accuracy_with = accuracy_score(y_test, y_pred_with)

print("Model Performance (WITH G1 & G2):")
print(f"F1-Score: {f1_with:.4f}")
print(f"Accuracy: {accuracy_with:.4f}")

print("\nTop 10 Most Important Features:")
for rank, entry in enumerate(
    feature_importance_with.head(10).itertuples(index=False), start=1
):
    print(f"{rank:2d}. {entry.feature:15s} ({entry.importance:.4f})")

# Horizontal bar chart of the 15 highest-ranked features, best at the top.
top_features_with = feature_importance_with.head(15)
bar_positions_with = range(len(top_features_with))

fig_with, ax_with = plt.subplots(figsize=(12, 8))
ax_with.barh(
    bar_positions_with,
    top_features_with["importance"],
    color="lightblue",
    edgecolor="navy",
    alpha=0.7,
)
ax_with.set_yticks(bar_positions_with)
ax_with.set_yticklabels(top_features_with["feature"])
ax_with.set_xlabel("Feature Importance")
ax_with.set_title("Top 15 Feature Importance - WITH G1 & G2 (Mid-term Prediction)")
ax_with.invert_yaxis()
fig_with.tight_layout()
plt.show()

# Keep a feature -> importance lookup for the comparison cell.
feature_importance_with_dict = feature_importance_with.set_index("feature")[
    "importance"
].to_dict()

FEATURE IMPORTANCE ANALYSIS - WITH G1 & G2


ValueError: Cannot cast object dtype to float32

In [None]:
# Feature Importance Analysis - WITHOUT G1 & G2
banner_without = "=" * 60
print("\n" + banner_without)
print("FEATURE IMPORTANCE ANALYSIS - WITHOUT G1 & G2")
print(banner_without)

# Same stratified 80/20 split settings as the WITH-grades scenario.
X_train_without, X_test_without, y_train, y_test = train_test_split(
    X_without_grades,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit a 100-tree forest with a fixed seed.
rf_without = RandomForestClassifier(n_estimators=100, random_state=42)
rf_without.fit(X_train_without, y_train)

# Pair every column with the importance reported by the fitted forest,
# most important first.
importance_table_without = pd.DataFrame(
    {"feature": X_without_grades.columns, "importance": rf_without.feature_importances_}
)
feature_importance_without = importance_table_without.sort_values(
    "importance", ascending=False
)

# Score the held-out split.
y_pred_without = rf_without.predict(X_test_without)
f1_without = f1_score(y_test, y_pred_without)
accuracy_without = accuracy_score(y_test, y_pred_without)

print("Model Performance (WITHOUT G1 & G2):")
print(f"F1-Score: {f1_without:.4f}")
print(f"Accuracy: {accuracy_without:.4f}")

print("\nTop 10 Most Important Features (Early Intervention):")
for rank, entry in enumerate(
    feature_importance_without.head(10).itertuples(index=False), start=1
):
    print(f"{rank:2d}. {entry.feature:15s} ({entry.importance:.4f})")

# Horizontal bar chart of the 15 highest-ranked features, best at the top.
top_features_without = feature_importance_without.head(15)
bar_positions_without = range(len(top_features_without))

fig_without, ax_without = plt.subplots(figsize=(12, 8))
ax_without.barh(
    bar_positions_without,
    top_features_without["importance"],
    color="lightcoral",
    edgecolor="darkred",
    alpha=0.7,
)
ax_without.set_yticks(bar_positions_without)
ax_without.set_yticklabels(top_features_without["feature"])
ax_without.set_xlabel("Feature Importance")
ax_without.set_title(
    "Top 15 Feature Importance - WITHOUT G1 & G2 (Early Intervention)"
)
ax_without.invert_yaxis()
fig_without.tight_layout()
plt.show()

# Keep a feature -> importance lookup for the comparison cell.
feature_importance_without_dict = feature_importance_without.set_index("feature")[
    "importance"
].to_dict()

In [None]:
# Compare feature importance between scenarios
print("\n" + "=" * 60)
print("FEATURE IMPORTANCE COMPARISON")
print("=" * 60)

# Get common features
common_features = set(X_with_grades.columns) & set(X_without_grades.columns)

# Iterate in sorted order: a set of strings has a run-dependent iteration order,
# which would make the ranking of tied importances vary between kernel restarts.
comparison_data = [
    {
        "feature": feature,
        "with_grades": feature_importance_with_dict.get(feature, 0),
        "without_grades": feature_importance_without_dict.get(feature, 0),
        # Positive values mean the feature matters more once grades are removed.
        "difference": feature_importance_without_dict.get(feature, 0)
        - feature_importance_with_dict.get(feature, 0),
    }
    for feature in sorted(common_features)
]

comparison_df = pd.DataFrame(comparison_data).sort_values(
    "without_grades", ascending=False
)

print("Top Features in Early Intervention Scenario (WITHOUT G1 & G2):")
print("Feature\t\t\tWith G1&G2\tWithout G1&G2\tDifference")
print("-" * 70)
for _, row in comparison_df.head(10).iterrows():
    print(
        f"{row['feature']:15s}\t{row['with_grades']:.4f}\t\t{row['without_grades']:.4f}\t\t{row['difference']:+.4f}"
    )

# Print a summary for the technical report (nothing is written to disk here)
print("\n" + "=" * 60)
print("SUMMARY FOR TECHNICAL REPORT")
print("=" * 60)

# Human-readable descriptions for common educational variables. Built once,
# outside the loop, because the mapping does not depend on the current row.
descriptions = {
    "failures": "Previous academic failures",
    "higher": "Higher education aspirations",
    "absences": "School absences",
    "age": "Student age",
    "studytime": "Weekly study time",
    "Medu": "Mother education level",
    "Fedu": "Father education level",
    "freetime": "Free time after school",
    "goout": "Going out with friends",
    "Dalc": "Workday alcohol consumption",
    "Walc": "Weekend alcohol consumption",
    "health": "Current health status",
    "famrel": "Quality of family relationships",
    "school": "School (GP or MS)",
    "sex": "Student gender",
    "traveltime": "Home to school travel time",
}

print("\n**Top Features (Random Forest - Early Intervention Scenario):**")
for i, (_, row) in enumerate(feature_importance_without.head(10).iterrows(), 1):
    feature_name = row["feature"]
    importance = row["importance"]

    # Fall back to a title-cased version of the column name when no
    # description is registered for it.
    description = descriptions.get(feature_name, feature_name.replace("_", " ").title())
    print(f"{i}. `{feature_name}` ({importance:.3f}): {description}")

print("\n**Model Performance Metrics:**")
print(f"- WITH G1 & G2: F1-Score = {f1_with:.4f}, Accuracy = {accuracy_with:.4f}")
print(
    f"- WITHOUT G1 & G2: F1-Score = {f1_without:.4f}, Accuracy = {accuracy_without:.4f}"
)
print(
    f"- Performance Drop: {(f1_with - f1_without):.4f} F1-score when removing grade history"
)