In [None]:
# pip package installation 

!pip install pandas numpy matplotlib seaborn scikit-learn imbalanced-learn xgboost

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import models 
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# Import metrics and preprocessing methods from sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score


In [None]:
# Load data
# The path uses a forward slash: it works on Windows, macOS and Linux, and avoids
# the invalid "\s" escape sequence that a backslash separator would introduce.

df = pd.read_csv("student_success_data/students_dropout_academic_success.csv")

In [None]:
# Data investigation

# Show the column labels, then the (rows, columns) shape of the dataframe
print(df.columns, df.shape, sep="\n")

# Summarise the unique values of a column
def col_unique(col, data=None):
    """Print how many distinct values a column has, or the values themselves.

    A column with more than 5 distinct values only has its count printed;
    a column with 5 or fewer has the distinct values printed.

    Parameters
    ----------
    col : label
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used.
    """
    frame = df if data is None else data
    n_unique = frame[col].nunique()  # computed once and reused below
    if n_unique > 5:
        print(f"There are {n_unique} unique values in {col}")
    else:
        print(f"Unique values in {col}: {frame[col].unique()}")

# Summarise every column in turn (iterating a DataFrame yields its column labels)
for col in df:
    col_unique(col)

In [None]:
# Check for NaN values and remove the affected rows if any are found
print(df.shape)               # shape before dropping
nan_counts = df.isna().sum()  # missing values per column
print(nan_counts)
df.dropna(inplace=True)
print(df.shape)               # shape after dropping

In [None]:
# Check column data types
# (left as a bare expression so the notebook displays it as the cell's result)
df.dtypes

In [None]:
# Rename columns
#
# Maps the original column headers to snake_case names used by the rest of the
# notebook. Keys must match the headers in the loaded dataframe exactly.
# NOTE(review): DataFrame.rename skips keys that are not present by default, so a
# header mismatch here would only surface later as a KeyError — confirm the keys
# against df.columns.

rename_dict = {
    'Marital Status': 'marital_status',
    'Application mode': 'application_mode',
    'Application order': 'application_order',
    'Course': 'major',  # renamed to a different word, not just re-cased
    'Daytime/evening attendance': 'daytime_evening_attendance',
    'Previous qualification': 'previous_qualification',
    'Previous qualification (grade)': 'previous_qualification_grade',
    'Nacionality': 'nationality',  # presumably matches the spelling used in the CSV header — verify
    "Mother's qualification": 'mothers_qualification',
    "Father's qualification": 'fathers_qualification',
    "Mother's occupation": 'mothers_occupation',
    "Father's occupation": 'fathers_occupation',
    'Admission grade': 'admission_grade',
    'Displaced': 'displaced',
    'Educational special needs': 'educational_special_needs',
    'Debtor': 'debtor',
    'Tuition fees up to date': 'tuition_fees_up_to_date',
    'Gender': 'gender',
    'Scholarship holder': 'scholarship_holder',
    'Age at enrollment': 'age_at_enrollment',
    'International': 'international',
    'Curricular units 1st sem (credited)': 'curricular_units_1st_sem_credited',
    'Curricular units 1st sem (enrolled)': 'curricular_units_1st_sem_enrolled',
    'Curricular units 1st sem (evaluations)': 'curricular_units_1st_sem_evaluations',
    'Curricular units 1st sem (approved)': 'curricular_units_1st_sem_approved',
    'Curricular units 1st sem (grade)': 'curricular_units_1st_sem_grade',
    'Curricular units 1st sem (without evaluations)': 'curricular_units_1st_sem_without_evaluations',
    'Curricular units 2nd sem (credited)': 'curricular_units_2nd_sem_credited',
    'Curricular units 2nd sem (enrolled)': 'curricular_units_2nd_sem_enrolled',
    'Curricular units 2nd sem (evaluations)': 'curricular_units_2nd_sem_evaluations',
    'Curricular units 2nd sem (approved)': 'curricular_units_2nd_sem_approved',
    'Curricular units 2nd sem (grade)': 'curricular_units_2nd_sem_grade',
    'Curricular units 2nd sem (without evaluations)': 'curricular_units_2nd_sem_without_evaluations',
    'Unemployment rate': 'unemployment_rate',
    'Inflation rate': 'inflation_rate',
    'GDP': 'gdp',
    'target': 'target'  # identity mapping: the name is left unchanged
}

# Apply the renaming (modifies df in place)
df.rename(columns=rename_dict, inplace=True)

In [None]:
# Charts to visualize the distribution of data for each feature of the data


def plot_column_grid(data, columns, kind, n_cols=3, tick_rotation=None):
    """Draw one small plot per column on a grid of subplots.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to plot.
    columns : list of str
        Column names, plotted left-to-right, top-to-bottom.
    kind : {'hist', 'count'}
        'hist' draws a 10-bin histogram of the column, 'count' draws one bar
        per distinct value.
    n_cols : int, default 3
        Number of subplot columns. The number of rows is derived from
        ``len(columns)``, so it never has to be maintained by hand.
    tick_rotation : float, optional
        Rotation of the x tick labels in degrees; left untouched when None.

    Raises
    ------
    ValueError
        If ``columns`` is empty or ``kind`` is not one of the supported values.
    """
    if not columns:
        raise ValueError("columns must contain at least one column name")
    if kind not in ('hist', 'count'):
        raise ValueError(f"unknown kind: {kind!r}")

    n_rows = -(-len(columns) // n_cols)  # ceiling division
    # squeeze=False always returns a 2-D array of axes, even for a 1x1 grid
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 5, n_rows * 4), squeeze=False)
    axes = axes.flatten()

    for ax, column in zip(axes, columns):
        if kind == 'hist':
            sns.histplot(data[column], bins=10, ax=ax, color='skyblue')
        else:
            sns.countplot(data=data, x=column, ax=ax)
        ax.set_title(column)
        if tick_rotation is not None:
            ax.tick_params(axis='x', rotation=tick_rotation)

    # Remove the axes left over when the grid is not completely filled
    for spare_ax in axes[len(columns):]:
        fig.delaxes(spare_ax)

    fig.tight_layout()


# Histograms for numeric columns
numeric_cols = ['previous_qualification_grade', 'admission_grade', 'age_at_enrollment', 'curricular_units_1st_sem_credited', 'curricular_units_1st_sem_enrolled',
           'curricular_units_1st_sem_evaluations', 'curricular_units_1st_sem_approved', 'curricular_units_1st_sem_grade', 'curricular_units_1st_sem_without_evaluations',
           'curricular_units_2nd_sem_credited', 'curricular_units_2nd_sem_enrolled', 'curricular_units_2nd_sem_evaluations', 'curricular_units_2nd_sem_approved',
           'curricular_units_2nd_sem_grade', 'curricular_units_2nd_sem_without_evaluations', 'unemployment_rate', 'inflation_rate', 'gdp']

plot_column_grid(df, numeric_cols, kind='hist')


# Bar charts for binary columns
binary_cols = ['daytime_evening_attendance', 'displaced', 'educational_special_needs', 'debtor', 'tuition_fees_up_to_date', 'gender', 'scholarship_holder', 'international']

plot_column_grid(df, binary_cols, kind='count', tick_rotation=45)


# Bar charts for categorical columns
cat_cols = ['marital_status', 'application_mode', 'application_order', 'major', 'previous_qualification', 'nationality', 'mothers_qualification', 'fathers_qualification',
               'mothers_occupation', 'fathers_occupation', 'target']

plot_column_grid(df, cat_cols, kind='count', tick_rotation=90)


In [None]:
# Pick out interesting graphs


# Distribution of the target column, drawn through the explicit Axes interface
outcome_fig, outcome_ax = plt.subplots(figsize=(6, 5))
sns.countplot(data=df, x='target', ax=outcome_ax)
outcome_ax.set_title('Distribution of Student Outcomes')
outcome_ax.set_xlabel('Outcome')
outcome_ax.set_ylabel('Number of Students')
outcome_ax.tick_params(axis='x', labelrotation=0)  # keep the class names horizontal
outcome_fig.tight_layout()
plt.show()

In [None]:
# Data encoding and scaling
#
# Builds df_encoded from df: one-hot encoded categorical columns, standardised
# numeric columns, the remaining columns unchanged, and one dummy column per
# target class appended at the end.

# Column groups. `categorical` is one-hot encoded and `numeric` is scaled below;
# `binary` is not referenced again in this cell (those columns pass through as-is).
categorical = ['marital_status', 'application_mode', 'application_order', 'major', 'previous_qualification', 'nationality', 'mothers_qualification', 'fathers_qualification',
               'mothers_occupation', 'fathers_occupation']
binary = ['daytime_evening_attendance', 'displaced', 'educational_special_needs', 'debtor', 'tuition_fees_up_to_date', 'gender', 'scholarship_holder', 'international']
numeric = ['previous_qualification_grade', 'admission_grade', 'age_at_enrollment', 'curricular_units_1st_sem_credited', 'curricular_units_1st_sem_enrolled', 
           'curricular_units_1st_sem_evaluations', 'curricular_units_1st_sem_approved', 'curricular_units_1st_sem_grade', 'curricular_units_1st_sem_without_evaluations', 
           'curricular_units_2nd_sem_credited', 'curricular_units_2nd_sem_enrolled', 'curricular_units_2nd_sem_evaluations', 'curricular_units_2nd_sem_approved', 
           'curricular_units_2nd_sem_grade', 'curricular_units_2nd_sem_without_evaluations', 'unemployment_rate', 'inflation_rate', 'gdp']
# Columns to discard before encoding; currently empty, so the drop below changes nothing
uneeded_cols = []

# One-vs-all target encoding: one 'target_<class>' dummy column per target value
target_dummies = pd.get_dummies(df['target'], prefix='target')

# Drop unneeded columns (modifies df in place)
df.drop(columns=uneeded_cols, inplace=True)

# Encode categorical variables as dense one-hot columns; drop='first' omits the
# first category of each feature
enc = OneHotEncoder(drop='first', sparse_output=False)
encoded_array = enc.fit_transform(df[categorical])
encoded_cols = enc.get_feature_names_out(categorical)
# Reuse df's index so the encoded rows line up in the concat below
encoded_df = pd.DataFrame(encoded_array, columns=encoded_cols, index=df.index)

# Replace the raw categorical and target columns with their encoded versions
df_encoded = pd.concat([df.drop(columns=categorical + ['target']), encoded_df, target_dummies], axis=1)

# Scale numeric columns
# NOTE(review): the scaler is fitted on every row of df_encoded, and the
# train/test split further down is taken from df_encoded afterwards, so test rows
# contribute to the scaling statistics — consider fitting on the training split only.
scaler = StandardScaler()
df_encoded[numeric] = scaler.fit_transform(df_encoded[numeric])



In [None]:
# Correlation matrix

def has_sig_cor(row, threshold=0.1):
    """Tell whether any value in ``row`` is larger than ``threshold`` in magnitude.

    Parameters
    ----------
    row : pandas.Series or numpy.ndarray
        Correlation coefficients to inspect.
    threshold : float, default 0.1
        Cut-off applied to the absolute values; the comparison is strict, so a
        value exactly equal to the threshold does not count.

    Returns
    -------
    bool
        True if at least one absolute value exceeds ``threshold``.
    """
    return (abs(row) > threshold).any()

corr_matrix = df_encoded.corr(numeric_only=True)

target_cols = ['target_Dropout', 'target_Enrolled', 'target_Graduate']
x_labels = ['Dropout', 'Enrolled', 'Graduate']

# Correlation of every feature with each target class. The target dummy rows are
# removed: they are not features, and their self-correlation of 1.0 (plus the
# correlations between the target classes) would otherwise top every ranking
# and appear in the heatmap.
feature_corr = corr_matrix.drop(index=target_cols)[target_cols]

# Keep only the features that correlate noticeably with at least one target class
filtered_features = feature_corr.apply(has_sig_cor, axis=1)
filtered_corr = feature_corr.loc[filtered_features]

# Full ranking of the features for each target class
for col in target_cols:
    print(f"\nCorrelation with {col}")
    print(feature_corr[col].sort_values(ascending=False))

plt.figure(figsize=(max(5, 2.5 * len(filtered_corr.columns)), 8))
ax = sns.heatmap(filtered_corr, cmap='coolwarm', annot=True, fmt=".2f", center=0)

ax.set_xticklabels(x_labels)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0)

# The filter is a strict comparison on the absolute correlation
plt.title('Correlation of Features with Target Classes (|r| > 0.1)')
plt.tight_layout()
plt.show()


In [None]:
# Keep only the students with a final outcome (Graduate or Dropout)
is_final_outcome = df['target'].isin(['Graduate', 'Dropout'])
df_binary = df[is_final_outcome].copy()

# Binary target: 1 for Dropout, 0 for Graduate
target = df_binary['target'].eq('Dropout').astype(int)

# Matching rows of the encoded frame, with the target dummy columns removed
features = df_encoded.loc[df_binary.index].drop(
    columns=['target_Dropout', 'target_Enrolled', 'target_Graduate']
)

# Stratified 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

In [None]:
# Random forest: build and fit in one step (fit returns the estimator itself)
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out test split
y_pred = rf.predict(X_test)

# Heading, per-class metrics, then the confusion matrix
print(
    "\nBinary Classifier Performance (Graduate vs Dropout)",
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

In [None]:
# Logistic regression: build and fit in one step (fit returns the estimator itself)
log = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Predict the held-out test split
y_pred_log = log.predict(X_test)

# Heading, per-class metrics, then the confusion matrix
print(
    "\nLogistic Regression Performance",
    classification_report(y_test, y_pred_log),
    confusion_matrix(y_test, y_pred_log),
    sep="\n",
)

In [None]:
# XGBoost
# `use_label_encoder` is intentionally not passed: the option is deprecated and
# has been removed from current xgboost releases, where it only produces an
# "unused parameter" warning. The labels here are already encoded as 0/1.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

# Prediction on the held-out test split
y_pred_xgb = xgb.predict(X_test)

# Print classification report and confusion matrix
print("\nXGBoost Performance")
print(classification_report(y_test, y_pred_xgb))
print(confusion_matrix(y_test, y_pred_xgb))

In [None]:
# AdaBoost: build and fit in one step (fit returns the estimator itself)
ada = AdaBoostClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out test split
y_pred_ada = ada.predict(X_test)

# Heading, per-class metrics, then the confusion matrix
print(
    "\nAdaBoost Performance",
    classification_report(y_test, y_pred_ada),
    confusion_matrix(y_test, y_pred_ada),
    sep="\n",
)

In [None]:
# Feature Importance Graphs

def plot_feature_importance(model, feature_names, model_name, use_coef=False, top_n=20):
    """Plot a horizontal bar chart of the most important features of one fitted model.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_``, or ``coef_`` when ``use_coef`` is True.
    feature_names : sequence of str
        Names of the features, in the same order as the model's importances.
    model_name : str
        Label used in the chart title.
    use_coef : bool, default False
        When True, the absolute values of the first row of ``model.coef_`` are
        used as importances instead of ``model.feature_importances_``.
    top_n : int, default 20
        Maximum number of features to show, most important first.
    """
    # Choose the importance measure the model provides
    if use_coef:
        importances = np.abs(model.coef_[0])
    else:
        importances = model.feature_importances_

    # Pair each feature with its importance and rank them, largest first
    feat_imp_df = (
        pd.DataFrame({'Feature': feature_names, 'Importance': importances})
        .sort_values(by='Importance', ascending=False)
    )
    top_features = feat_imp_df.head(top_n)

    # Plot the feature importance graph
    plt.figure(figsize=(10, 6))
    sns.barplot(data=top_features, x='Importance', y='Feature', hue=None, legend=False)
    # The title reports the number of bars actually drawn
    plt.title(f'Top {len(top_features)} Feature Importances - {model_name}')
    plt.tight_layout()
    plt.show()

# Feature names (shared by all models)
feature_names = X_train.columns

# The models are referenced by the names they were fitted under
# (rf, log, xgb, ada); no *_model variables exist in this notebook.

# Random Forest
plot_feature_importance(rf, feature_names, 'Random Forest')

# Logistic Regression (use coefficients)
plot_feature_importance(log, feature_names, 'Logistic Regression', use_coef=True)

# XGBoost
plot_feature_importance(xgb, feature_names, 'XGBoost')

# AdaBoost
plot_feature_importance(ada, feature_names, 'AdaBoost')

In [None]:
# ROC and Precision Recall curves

def plot_model_curves(model, X_test, y_test, model_name='Model'):
    """Draw a ROC curve and a precision-recall curve for one fitted classifier.

    Parameters
    ----------
    model : fitted classifier
        Must implement ``predict_proba``.
    X_test : array-like
        Features to score.
    y_test : array-like
        True binary labels for ``X_test``.
    model_name : str, default 'Model'
        Label used in both chart titles.

    Two separate figures are shown: the ROC curve (with its AUC in the legend)
    followed by the precision-recall curve (with the average precision).
    """
    # Scores taken from the second column of predict_proba
    positive_scores = model.predict_proba(X_test)[:, 1]

    # --- ROC curve ---
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)
    roc_area = auc(false_pos_rate, true_pos_rate)

    _, roc_ax = plt.subplots()
    roc_ax.plot(false_pos_rate, true_pos_rate, label=f'AUC ={roc_area:.2f}')
    roc_ax.plot([0, 1], [0, 1], linestyle='--', color='gray')  # diagonal reference line
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title(f'ROC Curve - {model_name}')
    roc_ax.legend()
    plt.show()

    # --- Precision-recall curve ---
    precision_vals, recall_vals, _ = precision_recall_curve(y_test, positive_scores)
    mean_precision = average_precision_score(y_test, positive_scores)

    _, pr_ax = plt.subplots()
    pr_ax.plot(recall_vals, precision_vals, label=f'AP ={mean_precision:.2f}')
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title(f'Precision-Recall Curve-{model_name}')
    pr_ax.legend()
    plt.show()

# Plot the curves for each fitted model, using the names the models were
# fitted under (rf, log, xgb, ada).
plot_model_curves(rf, X_test, y_test, model_name='Random Forest')
plot_model_curves(log, X_test, y_test, model_name='Logistic Regression')
plot_model_curves(xgb, X_test, y_test, model_name='XGBoost')
plot_model_curves(ada, X_test, y_test, model_name='AdaBoost')

In [None]:
# Predict outcomes for enrolled students

# Filter the original df to the students that are still enrolled
df_enrolled = df[df['target'] == 'Enrolled'].copy()


# Get the matching rows of the encoded dataframe, without the target dummy columns
X_enrolled = df_encoded.loc[df_enrolled.index].drop(columns=['target_Dropout', 'target_Enrolled', 'target_Graduate'])

# Make predictions with the logistic regression model
enrolled_probs = log.predict_proba(X_enrolled)
enrolled_ids = df_enrolled.index

# Dataframe of class probabilities. It is indexed by each student's row label in
# df, so a prediction can still be traced back to its student after sorting or
# filtering. The model's classes 0/1 are relabelled to the outcome names.
prob_df = (
    pd.DataFrame(enrolled_probs, columns=log.classes_, index=enrolled_ids)
    .rename(columns={0: 'Graduate', 1: 'Dropout'})
)
print(prob_df)

In [None]:
# Rank the enrolled students from highest to lowest predicted dropout probability
high_risk = prob_df.sort_values('Dropout', ascending=False)

# Show the ten students most at risk
print("\nTop 10 high-risk enrolled students:", high_risk.head(10), sep="\n")

In [None]:
# Risk categories

def risk_category(prob, high=0.7, low=0.3):
    """Map a dropout probability to a risk label.

    Parameters
    ----------
    prob : float
        Predicted probability of dropping out.
    high : float, default 0.7
        Probabilities strictly above this value are 'High Risk'.
    low : float, default 0.3
        Probabilities strictly below this value are 'Low Risk'.

    Returns
    -------
    str
        'High Risk', 'Low Risk' or 'Medium Risk'. Values equal to either
        threshold fall in 'Medium Risk'.
    """
    if prob > high:
        return 'High Risk'
    if prob < low:
        return 'Low Risk'
    return 'Medium Risk'

# Label every enrolled student with the risk category of their dropout probability
prob_df['risk_category'] = prob_df['Dropout'].map(risk_category)

In [None]:
# Plot risk categories
risk_order = ['Low Risk', 'Medium Risk', 'High Risk']

plt.figure(figsize=(6, 4))
# hue_order matches the bar order so each category always gets the same colour
# (low = cool/blue, high = warm/red), whatever order the categories first appear
# in prob_df. The hue only colours the bars, so no legend is drawn.
sns.countplot(
    data=prob_df,
    x='risk_category',
    order=risk_order,
    hue='risk_category',
    hue_order=risk_order,
    palette='coolwarm',
    legend=False,
)
plt.title('Enrolled Students by Dropout Risk Category')
plt.xlabel('Risk Category')
plt.ylabel('Number of Students')
plt.tight_layout()
plt.show()