## Scholarship vs covariates

In [None]:
# Fit a random forest that predicts scholarship status from the remaining
# covariates, to see which features are most associated with holding one.
ml_m = RandomForestClassifier(n_estimators=100, random_state=42)
ml_m.fit(df[covariates_without_scholarship], df['scholarship_holder'])

# feature_importances_ holds one value per column the model was fit on, so the
# Series must be labelled with the same column list that was passed to fit().
# Labelling it with a different list either raises on a length mismatch or
# silently attaches importances to the wrong feature names.
feature_importance = pd.Series(
    ml_m.feature_importances_,
    index=covariates_without_scholarship,
).sort_values(ascending=False)

In [None]:
# Bars are drawn in descending importance; reversing the light-to-dark "Blues"
# palette therefore gives the most important feature the darkest shade.
n_bars = len(feature_importance)
colors = list(reversed(sns.color_palette("Blues", n_bars)))

# Draw on an explicit Axes instead of relying on pyplot's "current figure"
fig, ax = plt.subplots(figsize=(8, 6))
feature_importance.plot(kind='bar', color=colors, width=0.75, ax=ax)

# Title, axis labels and tick styling
ax.set_title("Feature Importance for Scholarship Eligibility", fontsize=22, weight="bold", pad=20)
ax.set_ylabel("Importance Score", fontsize=14)
ax.set_xlabel("")
ax.tick_params(axis="both", labelsize=12)
# Slant the feature names so long labels do not overlap
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

# Horizontal dashed guide lines only
ax.grid(True, axis="y", linestyle='--', linewidth=0.7)
ax.grid(False, axis="x")

# Hide the frame around the plot
for border in ax.spines.values():
    border.set_visible(False)

# Lay out, export at print resolution, then display
fig.tight_layout()
fig.savefig("feature_scholarship.png", dpi=300, bbox_inches="tight")
plt.show()

## Outcome (dropout) vs treatment and covariates

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Predictors for the outcome model: the covariates plus the scholarship indicator.
covariates_with_scholarship = [
    "application_order", "daytime/evening_attendance", "previous_qualification_(grade)",
    "admission_grade", "displaced", "educational_special_needs", "gender",
    "age_at_enrollment", "international", "unemployment_rate", "inflation_rate", "gdp",
    "mother_educ", "father_educ",
    "scholarship_holder"
]
X = df[covariates_with_scholarship]

# One-hot encode any categorical columns; drop_first removes one level per
# encoded column so the dummies are not perfectly collinear.
X_encoded = pd.get_dummies(X, drop_first=True)
y = df["binary_target"]

# LassoCV chooses the L1 penalty strength by 5-fold cross-validation.
model = LassoCV(cv=5)
model.fit(X_encoded, y)

# Raw coefficients, one per encoded column
coef = model.coef_

# A raw coefficient is "change in y per one unit of the feature", so its size
# depends on the feature's units (a grade on a wide scale vs. a 0/1 dummy).
# Multiplying by the feature's standard deviation puts every coefficient on a
# "per one standard deviation" basis, which makes the magnitudes comparable.
# The fitted model, coef and X_encoded are left untouched for later cells.
feature_std = X_encoded.astype(float).std()
feat_importance = (pd.Series(coef, index=X_encoded.columns) * feature_std).abs()

# Top 15 features by standardized coefficient magnitude
top_features = feat_importance.sort_values(ascending=False).head(15)

# Horizontal bar chart of the ranking
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=top_features.values, y=top_features.index, ax=ax)
ax.set_title("Top 15 Features by Lasso Coefficient Magnitude")
ax.set_xlabel("Standardized Coefficient Magnitude (Importance)")
ax.set_ylabel("Feature")
fig.tight_layout()
plt.show()

In [None]:
# top_features is sorted in descending order, so the reversed light-to-dark
# "Blues" palette assigns the darkest shade to the highest-ranked feature.
n_bars = len(top_features)
colors = list(reversed(sns.color_palette("Blues", n_bars)))

# Draw on an explicit Axes instead of relying on pyplot's "current figure"
fig, ax = plt.subplots(figsize=(8, 6))
top_features.plot(kind='bar', color=colors, width=0.75, ax=ax)

# Title, axis labels and tick styling
ax.set_title("Feature Importance for Dropout Rate", fontsize=22, weight="bold", pad=20)
ax.set_ylabel("Importance Score", fontsize=14)
ax.set_xlabel("")
ax.tick_params(axis="both", labelsize=12)
# Slant the feature names so long labels do not overlap
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

# Horizontal dashed guide lines only
ax.grid(True, axis="y", linestyle='--', linewidth=0.7)
ax.grid(False, axis="x")

# Hide the frame around the plot
for border in ax.spines.values():
    border.set_visible(False)

# Lay out, export at print resolution, then display
fig.tight_layout()
fig.savefig("feature_dropout.png", dpi=300, bbox_inches="tight")
plt.show()