In [None]:
# Target distribution pie chart (overview of outcomes) - Abigaba
# NOTE(review): only two wedge colours are supplied; matplotlib cycles through the
# list, so a third Target category (if present) would reuse the first colour - confirm.
plt.figure(figsize=(6, 6))
target_counts = df['Target'].value_counts()
target_counts.plot.pie(
    startangle=90,
    autopct='%1.1f%%',  # label each wedge with a one-decimal percentage
    colors=['#1f77b4', '#ff7f0e'],
    textprops={'fontsize': 14},
)
plt.ylabel('')  # blank out the y-axis label
plt.title('Student Outcomes at a Glance', fontsize=18, fontweight='bold')
plt.savefig('target_pie.png', bbox_inches='tight', dpi=150)
plt.show()

Interpretation (Univariate) - This pie chart shows the distribution of student outcomes, with the majority being graduates, indicating a class imbalance where dropouts are less frequent but critical to predict.

In [None]:
# Histogram of age at enrollment (6 bins) - Abigaba
plt.figure(figsize=(8, 5))
age_ax = sns.histplot(
    df['Age at enrollment'],
    bins=6,
    color='skyblue',
    kde=False,
    edgecolor='black',
)
# Label through the Axes returned by seaborn instead of the pyplot state machine.
age_ax.set_title('Age Distribution', fontsize=18, fontweight='bold')
age_ax.set_xlabel('Age', fontsize=14)
age_ax.set_ylabel('Number of Students', fontsize=14)
plt.savefig('age_bar.png', bbox_inches='tight', dpi=150)
plt.show()

Interpretation (Univariate) - The age distribution is right-skewed, with most students in their 20s, suggesting that many are recent high school graduates. Older students are less common, which may reflect different enrollment or dropout patterns.

In [None]:
# Histogram of admission grades (15 bins) - Abigaba
plt.figure(figsize=(8, 5))
admission_grades = df['Admission grade']
sns.histplot(
    admission_grades,
    bins=15,
    kde=False,
    color='lightgreen',
    edgecolor='black',
)
plt.xlabel('Grade', fontsize=14)
plt.ylabel('Number of Students', fontsize=14)
plt.title('Admission Grades', fontsize=18, fontweight='bold')
plt.savefig('admission_bar.png', bbox_inches='tight', dpi=150)
plt.show()

Interpretation (Univariate) - Admission grades follow an approximately normal distribution centered around 120-140, with fewer low scores, implying the dataset captures relatively high-performing students overall.

In [None]:
# Bar plot of the mean Admission grade for each Target class
plt.figure(figsize=(8, 5))
sns.barplot(
    data=df,
    x='Target',
    y='Admission grade',
    palette='Set2',
    # errorbar=None draws no error bars; pass errorbar='sd' to show the
    # standard deviation instead.
    errorbar=None,
)
plt.xlabel('Graduation Outcome', fontsize=14)
plt.ylabel('Average Admission Grade', fontsize=14)
plt.title('Admission Grades vs. Graduation', fontsize=18, fontweight='bold')
plt.savefig('admission_by_target_bar.png', bbox_inches='tight', dpi=150)
plt.show()

Interpretation (Bivariate) - Both dropouts and graduates have relatively high admission grades, with only a small difference, suggesting that while academic performance matters, other non-academic factors also play a significant role in determining graduation outcomes.

In [None]:
# Stacked histogram of School_Distance_km, with bars split by Location_Type
# NOTE(review): the palette lists two colours, so Location_Type is presumably
# two-valued - confirm.
plt.figure(figsize=(8, 5))
distance_ax = sns.histplot(
    data=df,
    x='School_Distance_km',
    hue='Location_Type',
    multiple='stack',
    palette=['#1f77b4', '#ff7f0e'],
    bins=10,
    edgecolor='black',
    alpha=0.8,
)
distance_ax.set_title('School Travel Distance by Location', fontsize=18, fontweight='bold')
distance_ax.set_xlabel('Distance to School (km)', fontsize=14)
distance_ax.set_ylabel('Number of Students', fontsize=14)
plt.savefig('school_distance_stacked.png', bbox_inches='tight', dpi=150)
plt.show()

Interpretation (Multivariate) - Rural students generally travel farther to school compared to urban students, which may increase the risk of dropout due to accessibility challenges.

In [None]:
# Bar plot of the mean Internet_Access value per Target class,
# restricted to rows whose Target is 'Dropout' or 'Graduate'
outcome_mask = df['Target'].isin(['Dropout', 'Graduate'])
df_filtered = df[outcome_mask].copy()  # copy so the subset is independent of df

plt.figure(figsize=(8, 5))
sns.barplot(
    data=df_filtered,
    x='Target',
    y='Internet_Access',
    palette='Set1',
    errorbar=None,  # no error bars drawn
)
plt.xlabel('Graduation Outcome', fontsize=14)
plt.ylabel('Average Internet Access Score', fontsize=14)
plt.title('Internet Access by Graduation Outcome', fontsize=18, fontweight='bold')
plt.tight_layout()
plt.savefig('internet_by_target_bar.png', bbox_inches='tight', dpi=150)
plt.show()

Interpretation (Bivariate) - Graduation outcomes show little variation in internet access scores, indicating that factors beyond connectivity are likely more influential.

In [None]:
# Horizontal bar plot of the 10 largest absolute correlations with column 'y'
# NOTE(review): assumes df contains a numeric column named 'y' holding the
# encoded target - confirm it is created in an earlier cell.
num_df = df.select_dtypes(include=['float64', 'int64'])
corr_with_target = (
    num_df.corr()['y']
    .abs()                        # rank by strength, ignoring sign
    .drop('y')                    # remove the target's correlation with itself
    .sort_values(ascending=False)
)
top_corr = corr_with_target.head(10)

plt.figure(figsize=(10, 6))
bar_colors = sns.color_palette('bright', len(top_corr))
# Re-sort ascending before plotting so the values are drawn in increasing order.
top_corr.sort_values().plot(kind='barh', color=bar_colors)
plt.xlabel('Correlation Strength', fontsize=14)
plt.ylabel('Features', fontsize=14)
plt.title('Top 10 Factors Influencing Graduation', fontsize=18, fontweight='bold')
plt.tight_layout()
plt.savefig('top_correlations_barh.png', bbox_inches='tight', dpi=150)
plt.show()

Interpretation (Bivariate) - These are the 10 features with the strongest absolute correlation with the target, showing that academic factors dominate, while non-academic influences also contribute.

In [None]:
# Data Preprocessing - Rwotolara Innocent
# Drop duplicate rows
df = df.drop_duplicates()
print('Shape after removing duplicates:', df.shape)

# IQR rule applied column by column: keep rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# Each column's bounds are computed on the frame as already filtered by the
# previous columns, because df is reassigned inside the loop.
for col in ['Age at enrollment', 'Admission grade', 'School_Distance_km']:
    Q1, Q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    # between() is inclusive on both ends by default
    df = df[df[col].between(lower_bound, upper_bound)]
    print(f'Shape after removing outliers in {col}: {df.shape}')

In [None]:
# Data Preprocessing - Rwotolara Innocent
# NOTE(review): this cell repeats the preceding preprocessing cell verbatim.
# Running both re-computes the IQR bounds on the already-filtered frame, which
# can remove additional rows - confirm whether the second pass is intended.

# Remove duplicates
df = df.drop_duplicates()
print('Shape after removing duplicates:', df.shape)

# Handle outliers in key numerical features
for col in ('Age at enrollment', 'Admission grade', 'School_Distance_km'):
    column_values = df[col]
    Q1 = column_values.quantile(0.25)
    Q3 = column_values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    within_bounds = (column_values >= lower_bound) & (column_values <= upper_bound)
    df = df[within_bounds]
    print(f'Shape after removing outliers in {col}: {df.shape}')

In [None]:
# PCA Implementation - Rwotolara Innocent
# Apply PCA; a fractional n_components asks scikit-learn to keep as many
# components as are needed to explain that share of the variance (here 95%).
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)
print('Shape after PCA:', X_pca.shape)
# Round the summed ratio itself. The previous form chained .round(3) onto the
# print(...) call, and print() returns None, so the cell raised AttributeError.
print('Variance retained:', pca.explained_variance_ratio_.sum().round(3))

In [None]:
# Plot cumulative explained variance against the number of retained components
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = range(1, len(cumulative_variance) + 1)  # 1-based component index

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance, marker='o')
plt.title('PCA Elbow Curve')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.savefig('pca_elbow_curve.png', bbox_inches='tight', dpi=150)
plt.show()

**Interpretation (PCA)** - The first few components capture most of the important structure in the data, with diminishing returns as more components are added. The “elbow” around 10-12 components suggests an optimal balance, providing most of the information without unnecessary dimensions. By 15-20 components, over 80% of the variance is already explained, so including more offers little additional benefit.

In [None]:
# PCA Implementation - Rwotolara Innocent
# Rank the original features by their total absolute loading across the retained components
# NOTE(review): assumes X_encoded's columns are in the same order as the columns
# of the matrix PCA was fitted on - confirm against the scaling cell.
feature_names = list(X_encoded.columns)

# pca.components_ has one row per component; summing |loading| down axis 0
# gives one score per original feature (components are not weighted).
feature_importance = np.abs(pca.components_).sum(axis=0)

importance_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": feature_importance}
)
importance_df = importance_df.sort_values("Importance", ascending=True)

# Horizontal bars, drawn in ascending order of importance
plt.figure(figsize=(10, 8))
plt.barh(importance_df["Feature"], importance_df["Importance"], color="skyblue")
plt.title("Original Features Ranked by Importance After PCA", fontsize=14, fontweight="bold")
plt.xlabel("Feature Importance", fontsize=12)
plt.grid(axis="x", alpha=0.3, linestyle="--")
plt.savefig("pca_feature_importance.png", bbox_inches="tight", dpi=150)
plt.show()

**Interpretation (PCA Feature Importance)** - This plot shows the original features ranked by their contribution to the principal components retained in PCA. Features with higher importance scores have a stronger influence on the transformed PCA space used for modeling. Both academic and non-academic factors contribute, highlighting which variables drive the variance in the dataset. Because PCA is unsupervised, a high score reflects a feature's contribution to variance and only indirectly suggests how influential it may be in predicting student outcomes.

In [None]:
# Data Preprocessing - Rwotolara Innocent
# Stratified 80/20 train/test split of the PCA-transformed features
# NOTE(review): the PCA producing X_pca was fitted on all rows before this split,
# so test rows influenced the projection - consider splitting before fitting PCA.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
    stratify=y,       # preserve class proportions in both partitions
)

# Fit the scaler on the training partition only, then reuse it for the test partition.
# NOTE(review): `scaler` is created outside this cell; fit_transform re-fits that
# same object on the PCA outputs, replacing any earlier fit - confirm this is intended.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)