In [None]:
# Data Preprocessing - Rwotolara Innocent
# Step 1: drop exact duplicate rows.
df = df.drop_duplicates()
print('Shape after removing duplicates:', df.shape)

# Step 2: trim outliers in key numerical features with 1.5 * IQR fences.
# Columns are filtered one after another, so the fences for each column are
# computed on the rows that survived the filters of the columns before it.
# Rows whose value is NaN fail the range check and are dropped as well.
for col in ('Age at enrollment', 'Admission grade', 'School_Distance_km'):
    first_quartile, third_quartile = df[col].quantile([0.25, 0.75])
    fence_width = 1.5 * (third_quartile - first_quartile)
    # between() is inclusive on both ends, matching >= lower and <= upper.
    in_range = df[col].between(first_quartile - fence_width,
                               third_quartile + fence_width)
    df = df[in_range]
    print(f'Shape after removing outliers in {col}: {df.shape}')

In [None]:
# Data Preprocessing - Rwotolara Innocent
# Verify the result of the duplicate and outlier removal.
#
# The cleaning itself is performed once, in the preceding preprocessing cell.
# Repeating the IQR filter here would recompute the fences on the already
# trimmed data and drop additional rows, making the final row count depend on
# how many times the filter has been executed. This cell therefore only
# checks the cleaned frame and reports its shape.
assert not df.duplicated().any(), 'Duplicate rows remain after preprocessing'
print('Shape after preprocessing:', df.shape)

In [None]:
# PCA Implementation - Rwotolara Innocent
# Apply PCA. A float n_components between 0 and 1 is a variance target: PCA
# keeps as many components as are needed to explain that share of the variance.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)
print('Shape after PCA:', X_pca.shape)
# Round the value itself, inside the call: print() returns None, so calling
# .round() on its result raises AttributeError.
print('Variance retained:', pca.explained_variance_ratio_.sum().round(3))

In [None]:
# Plot PCA elbow curve
# Cumulative share of variance explained by the first k retained components.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
component_counts = np.arange(1, cumulative_variance.size + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_counts, cumulative_variance, marker='o')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('PCA Elbow Curve')
ax.grid(True)
fig.savefig('pca_elbow_curve.png', bbox_inches='tight', dpi=150)
plt.show()

**Interpretation (PCA)** - The first few components capture most of the important structure in the data, with diminishing returns as more components are added. The “elbow” around 10–12 components suggests a good balance, providing most of the information without unnecessary dimensions. By 15–20 components, over 80% of the variance is already explained, so each further component adds comparatively little. Note that the PCA above is configured with `n_components=0.95`, so the transformed data keeps every component needed to reach 95% of the variance, which is more than the elbow alone would suggest.

In [None]:
# PCA Implementation - Rwotolara Innocent
# Feature Importance Ranking After PCA
# NOTE(review): assumes X_encoded's columns are, in the same order, the
# columns PCA was fitted on - confirm against the scaling cell.
feature_names = X_encoded.columns.tolist()

# Score each original feature by summing the absolute values of its loadings
# over the retained components (axis 0 of pca.components_).
# NOTE(review): the loadings are not weighted by explained variance, so every
# retained component contributes equally to the score.
feature_importance = np.abs(pca.components_).sum(axis=0)

importance_df = (
    pd.DataFrame({"Feature": feature_names, "Importance": feature_importance})
    .sort_values(by="Importance")
)

# Horizontal bar chart; the ascending sort puts the largest score at the top.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(importance_df["Feature"], importance_df["Importance"], color="skyblue")
ax.set_xlabel("Feature Importance", fontsize=12)
ax.set_title("Original Features Ranked by Importance After PCA", fontsize=14, fontweight="bold")
ax.grid(axis="x", alpha=0.3, linestyle="--")
fig.savefig("pca_feature_importance.png", bbox_inches="tight", dpi=150)
plt.show()

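**Interpretation (PCA Feature Importance)** - This plot shows the original features ranked by their contribution to the principal components retained in PCA, measured as the sum of each feature's absolute loadings across those components. Features with higher importance scores have a stronger influence on the transformed PCA space used for modeling. Both academic and non-academic factors contribute, highlighting which variables drive the variance in the dataset. Because PCA does not use the target variable, these scores indicate which features are likely to be influential in predicting student outcomes rather than proving it; a supervised importance measure is needed to confirm their predictive value.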

In [None]:
# Data Preprocessing - Rwotolara Innocent
# Split data: stratified 80/20 train/test split of the PCA-transformed features.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit `scaler` on the training split only, then reuse that fit for the test split.
# NOTE(review): fit_transform refits `scaler`; if this is the same object that
# produced X_scaled, its earlier fit on the original features is overwritten
# here - confirm that no other cell relies on the earlier fit.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)