/usr/bin/env python
coding: utf-8
# Student Dropout Prediction: Model Training and Evaluation

This notebook implements comprehensive model training and evaluation for student dropout prediction, including:
1. Proper stratified validation strategy
2. Multiple model types (Logistic Regression, Random Forest, Gradient Boosting, XGBoost)
3. Systematic hyperparameter tuning
4. Model evaluation with appropriate metrics
## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import lightgbm as lgb
import joblib
import os
import optuna
from optuna.visualization import plot_param_importances, plot_optimization_history
import warnings
warnings.filterwarnings('ignore')

# Single seed value, reused as `random_state` by the splits and estimators below.
RANDOM_STATE = 42
# Also seed NumPy's legacy global generator for anything drawing from np.random directly.
np.random.seed(seed=RANDOM_STATE)

print("Libraries imported successfully!")


Import necessary libraries
Set random seed for reproducibility
## 2. Data Loading and Preprocessing

In [None]:
# Read the raw CSV once; later cells derive features and target from `df`.
data_path = '../data/dataset.csv'  # Update this path if needed
print("Loading dataset...")
df = pd.read_csv(data_path)

# Quick look at the size of the frame and the column names that were parsed.
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

Load the dataset
Display basic information

Define target column
Split features and target

In [None]:
# Label column; every other column of `df` is treated as a feature.
target_col = 'Target'  # Update this based on the actual target column name

# Pull the label out first, then drop it from the feature frame.
y = df[target_col]
X = df.drop(columns=target_col)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target classes: {y.unique()}")

Identify categorical and numerical columns

In [None]:
# Text-like columns go through the categorical branch of the preprocessor.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
# 'number' matches every numeric width (int32, float32, uint8, ...), not only
# int64/float64, so downcast or narrow-typed numeric columns are not lost.
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# The ColumnTransformer built later only receives these two lists, so any column
# that is in neither (e.g. bool or datetime) would be dropped from the model
# without notice. Report it with print(): warnings are globally silenced above.
assigned_cols = set(categorical_cols) | set(numerical_cols)
unassigned_cols = [col for col in X.columns if col not in assigned_cols]
if unassigned_cols:
    print(f"WARNING: columns not used by the preprocessor ({len(unassigned_cols)}): {unassigned_cols}")

print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")
print(f"Numerical columns ({len(numerical_cols)}): {numerical_cols[:10]}...")  # Show first 10

Create preprocessing pipelines
Combine preprocessing steps
## 3. Train-Test Split with Stratification

In [None]:
# Numeric branch: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform() from failing on categories unseen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list to its branch; this object is the first step of every model pipeline below.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

print("Preprocessing pipelines created successfully!")

Split the data into training and test sets, stratifying on the target.
Stratification preserves the class distribution in both sets.

In [None]:
# Hold out 20% of the rows for final testing; stratifying on y keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

Check class distribution in train and test sets
Visualize class distributions
## 4. Model Evaluation Function

In [None]:
# Relative class frequencies in each partition (fractions that sum to 1).
print("Class distribution in training set:")
train_dist = y_train.value_counts(normalize=True)
print(train_dist)

print("\nClass distribution in test set:")
test_dist = y_test.value_counts(normalize=True)
print(test_dist)

# Side-by-side bar charts so the effect of stratification can be checked visually.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

train_dist.plot(kind='bar', ax=ax1, title='Training Set Class Distribution')
ax1.set_ylabel('Proportion')
ax1.set_xlabel('Class')

test_dist.plot(kind='bar', ax=ax2, title='Test Set Class Distribution')
ax2.set_ylabel('Proportion')
ax2.set_xlabel('Class')

plt.tight_layout()
# This is the first cell that writes into ../models; create the directory here
# so a fresh checkout does not fail in savefig.
os.makedirs('../models', exist_ok=True)
plt.savefig('../models/class_distribution.png')
plt.show()

Define the evaluation function.
For multi-class problems, F1 is computed with 'macro' averaging, so every class counts equally regardless of its size.
Print the classification report.
Plot the confusion matrix.
## 5. Baseline Model: Logistic Regression

In [None]:
def evaluate_model(model, X_test, y_test, model_name="Model", output_dir='../models'):
    """Evaluate a fitted classifier on held-out data and save its confusion matrix.

    Prints the classification report, accuracy and macro-averaged F1, shows the
    confusion-matrix heatmap, and writes it to
    ``<output_dir>/<model_name>_confusion_matrix.png``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``classes_``.
    X_test : features to predict on.
    y_test : true labels for ``X_test``.
    model_name : label used in printed output, the plot title and the file name.
    output_dir : directory for the saved figure; created if it does not exist.

    Returns
    -------
    dict with keys ``'accuracy'``, ``'f1_score'`` (macro) and ``'confusion_matrix'``.
    """
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')

    print(f"\nClassification Report for {model_name}:")
    print(classification_report(y_test, y_pred))

    # Fix the row/column order to model.classes_ so the matrix always lines up
    # with the tick labels, even if a class is absent from y_test and y_pred.
    class_labels = model.classes_
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(f'Confusion Matrix - {model_name}')
    fig.tight_layout()
    # The target directory may not exist yet when this is first called.
    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(os.path.join(output_dir, f'{model_name}_confusion_matrix.png'))
    plt.show()

    print(f"\n{model_name} Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score (macro): {f1:.4f}")

    return {
        'accuracy': accuracy,
        'f1_score': f1,
        'confusion_matrix': cm
    }

Create Logistic Regression pipeline
Use cross-validation to evaluate baseline model

In [None]:
print("===== Training Baseline Model: Logistic Regression =====")

# Baseline: shared preprocessing followed by a plain logistic regression.
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))
])

# 5-fold stratified, shuffled CV; this `cv` object is reused by every search below
# so all models are scored on the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
cv_scores = cross_val_score(lr_pipeline, X_train, y_train, cv=cv, scoring='accuracy')
# The plus-minus sign was stored as mis-decoded bytes; restored to the intended character.
print(f"Cross-validation accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

Fit the model on the full training set
Evaluate on test set
Save the baseline model
## 6. Random Forest with Hyperparameter Tuning

In [None]:
# Create the output directory first: evaluate_model() saves a figure into it,
# so creating it only before joblib.dump would be too late on a fresh checkout.
os.makedirs('../models', exist_ok=True)

# Refit on the whole training partition (cross_val_score only fitted clones).
lr_pipeline.fit(X_train, y_train)

print("Evaluating baseline model on test set:")
lr_metrics = evaluate_model(lr_pipeline, X_test, y_test, "Logistic_Regression")

joblib.dump(lr_pipeline, '../models/baseline_logistic_regression.joblib')
print("Baseline model saved.")

Define the pipeline
Define parameter grid for grid search

In [None]:
print("===== Training Random Forest with Hyperparameter Tuning =====")

# Same preprocessing as the baseline, with a random forest as the final step.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE)),
])

# Keys use the `<step>__<param>` form so the search can reach the classifier
# step inside the pipeline. 2 * 3 * 2 * 2 = 24 candidate settings.
param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5],
    classifier__min_samples_leaf=[1, 2],
)

print(f"Parameter grid: {param_grid}")

Set up grid search with cross-validation
Fit grid search
Print best parameters

In [None]:
# Exhaustive search over `param_grid`, scored by accuracy on the shared CV folds;
# n_jobs=-1 uses all available cores.
grid_search_rf = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

print("Performing grid search for Random Forest...")
grid_search_rf.fit(X_train, y_train)

# Report the winning setting and its mean cross-validated score.
print(f"Best parameters: {grid_search_rf.best_params_}")
print(f"Best cross-validation accuracy: {grid_search_rf.best_score_:.4f}")

Evaluate best model on test set
Save the best model
## 7. Gradient Boosting with Optuna Hyperparameter Tuning

In [None]:
# Score the best pipeline found by the grid search on the held-out test set.
print("Evaluating best Random Forest model on test set:")
rf_metrics = evaluate_model(
    grid_search_rf.best_estimator_,
    X_test,
    y_test,
    model_name="Random_Forest",
)

# Persist the whole fitted pipeline (preprocessing + classifier).
joblib.dump(grid_search_rf.best_estimator_, '../models/tuned_random_forest.joblib')
print("Tuned Random Forest model saved.")





Define an objective function for Optuna
Define hyperparameters to optimize
Create and preprocess the data
Create and train the model
Use cross-validation to evaluate
Return the mean accuracy

In [None]:
print("===== Training Gradient Boosting with Optuna Hyperparameter Tuning =====")

def objective(trial):
    """Optuna objective: mean cross-validated accuracy of a gradient boosting pipeline.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters listed below.

    Returns
    -------
    float, mean accuracy over the folds of the shared `cv` splitter on the
    training partition.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        # Fixed, not sampled: it therefore does not appear in study.best_params.
        'random_state': RANDOM_STATE
    }

    # Cross-validate the full pipeline rather than pre-transformed data, so the
    # imputers, scaler and encoder are fitted on each fold's training part only.
    # Fitting the preprocessor on all of X_train first would leak validation-fold
    # statistics into training and inflate the score.
    candidate = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', GradientBoostingClassifier(**params))
    ])

    cv_scores = cross_val_score(
        candidate, X_train, y_train,
        cv=cv, scoring='accuracy', n_jobs=-1
    )

    return cv_scores.mean()

print("Objective function defined.")

Create a study object and optimize the objective function
Print the best parameters

In [None]:
print("Running Optuna optimization for Gradient Boosting...")
# Seed the sampler so the sequence of suggested hyperparameters is reproducible
# across runs; without it each Restart & Run All explores different trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(objective, n_trials=20)  # Adjust n_trials as needed

print(f"Best parameters: {study.best_params}")
print(f"Best accuracy: {study.best_value:.4f}")

Create the final model with the best parameters
Train the model on the full training set
Evaluate on test set
Save the model
## 8. XGBoost Model

In [None]:
# study.best_params only contains the values suggested through `trial`, so the
# fixed random_state used during tuning has to be passed again here; otherwise
# the final model is unseeded and differs from the configuration that was scored.
gb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(**study.best_params, random_state=RANDOM_STATE))
])

# Fit on the whole training partition with the tuned hyperparameters.
gb_pipeline.fit(X_train, y_train)

print("Evaluating best Gradient Boosting model on test set:")
gb_metrics = evaluate_model(gb_pipeline, X_test, y_test, "Gradient_Boosting")

joblib.dump(gb_pipeline, '../models/tuned_gradient_boosting.joblib')
print("Tuned Gradient Boosting model saved.")

Define the pipeline
Define parameter grid for grid search

In [None]:
print("===== Training XGBoost Model =====")

# Shared preprocessing followed by an XGBoost classifier.
# NOTE(review): recent xgboost releases expect class labels encoded as
# 0..n_classes-1; confirm the target is numeric or encode it before fitting.
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(random_state=RANDOM_STATE, eval_metric='mlogloss')),
])

# This rebinds `param_grid` (previously the random forest grid).
# 2 * 3 * 3 * 2 * 2 = 72 candidate settings.
param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[3, 6, 9],
    classifier__learning_rate=[0.01, 0.1, 0.2],
    classifier__subsample=[0.8, 1.0],
    classifier__colsample_bytree=[0.8, 1.0],
)

print(f"XGBoost parameter grid: {param_grid}")

Set up grid search with cross-validation
Fit grid search
Print best parameters

In [None]:
# Exhaustive search over the XGBoost grid, scored by accuracy on the shared CV folds.
grid_search_xgb = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

print("Performing grid search for XGBoost...")
grid_search_xgb.fit(X_train, y_train)

# Report the winning setting and its mean cross-validated score.
print(f"Best parameters: {grid_search_xgb.best_params_}")
print(f"Best cross-validation accuracy: {grid_search_xgb.best_score_:.4f}")

Evaluate best model on test set
Save the best model
## 9. Model Comparison and Final Selection

In [None]:
# Score the best pipeline found by the grid search on the held-out test set.
print("Evaluating best XGBoost model on test set:")
xgb_metrics = evaluate_model(
    grid_search_xgb.best_estimator_,
    X_test,
    y_test,
    model_name="XGBoost",
)

# Persist the whole fitted pipeline (preprocessing + classifier).
joblib.dump(grid_search_xgb.best_estimator_, '../models/tuned_xgboost.joblib')
print("Tuned XGBoost model saved.")

Collect all model results
Create comparison dataframe

In [None]:
print("===== Model Comparison =====")

# Display name -> metrics dict returned by evaluate_model() for that model.
models = {
    'Logistic Regression': lr_metrics,
    'Random Forest': rf_metrics,
    'Gradient Boosting': gb_metrics,
    'XGBoost': xgb_metrics,
}

# One row per model, in insertion order, with the two headline test-set metrics.
comparison = pd.DataFrame(
    [
        {
            'Model': name,
            'Accuracy': metrics['accuracy'],
            'F1 Score (macro)': metrics['f1_score'],
        }
        for name, metrics in models.items()
    ]
)

print("Model Performance Comparison:")
# Rank by macro F1, best first; the row at position 0 is later treated as the winner.
comparison_sorted = comparison.sort_values(by='F1 Score (macro)', ascending=False)
print(comparison_sorted)

Plot model comparison
Create subplot with two charts
Accuracy comparison
F1 Score comparison

In [None]:
# One figure with two panels: accuracy on the left, macro F1 on the right.
# (A separate plt.figure() call before this only produced an extra, empty figure.)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

comparison.set_index('Model')['Accuracy'].plot(kind='bar', ax=ax1, color='skyblue')
ax1.set_title('Model Accuracy Comparison')
ax1.set_ylabel('Accuracy Score')
ax1.set_ylim(0, 1)  # both metrics live in [0, 1]; a fixed range keeps the panels comparable
ax1.tick_params(axis='x', rotation=45)

comparison.set_index('Model')['F1 Score (macro)'].plot(kind='bar', ax=ax2, color='lightcoral')
ax2.set_title('Model F1 Score Comparison')
ax2.set_ylabel('F1 Score (Macro)')
ax2.set_ylim(0, 1)
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('../models/model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

Identify and save the best model
Save the best model with a standardized name
Save best model for deployment
## 10. Model Performance Summary

In [None]:
# The comparison table is sorted by macro F1 (descending), so row 0 is the winner.
# NOTE(review): the winner is chosen on test-set scores, so its reported test
# metrics are an optimistic estimate of performance on new data.
best_row = comparison_sorted.iloc[0]
best_model_name = best_row['Model']
best_accuracy = best_row['Accuracy']
best_f1 = best_row['F1 Score (macro)']

# The emoji below were stored as mis-decoded bytes; restored to the intended characters.
print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"📊 Accuracy: {best_accuracy:.4f}")
print(f"📊 F1 Score (macro): {best_f1:.4f}")

# Explicit name -> fitted pipeline lookup. An unknown name raises KeyError
# instead of silently falling through to XGBoost and deploying the wrong model.
fitted_models = {
    'Logistic Regression': lr_pipeline,
    'Random Forest': grid_search_rf.best_estimator_,
    'Gradient Boosting': gb_pipeline,
    'XGBoost': grid_search_xgb.best_estimator_,
}
best_model = fitted_models[best_model_name]

joblib.dump(best_model, '../models/best_model_for_deployment.joblib')
print("✅ Best model saved for deployment!")

In [None]:
# Plain-text recap of the run: data sizes, per-model test metrics and output files.
print("\n" + "="*60)
print("           MODEL TRAINING SUMMARY")
print("="*60)

# df still contains the target column, hence the -1 for the feature count.
print(f"Dataset: {df.shape[0]} samples, {df.shape[1]-1} features")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Target classes: {list(y.unique())}")
print(f"Cross-validation: {cv.n_splits}-fold stratified")

print("\nModel Performance (Test Set):")
print("-" * 40)
# Rows come out best-first because comparison_sorted is ordered by macro F1.
for _, row in comparison_sorted.iterrows():
    print(f"{row['Model']:<20} | Acc: {row['Accuracy']:.4f} | F1: {row['F1 Score (macro)']:.4f}")

# The emoji below were stored as mis-decoded bytes; restored to the intended characters.
print(f"\n🎯 Best performing model: {best_model_name}")
print("📁 Models saved to: ../models/")
print("📊 Visualizations saved to: ../models/")

print("\nFiles generated:")
print("- baseline_logistic_regression.joblib")
print("- tuned_random_forest.joblib")
print("- tuned_gradient_boosting.joblib")
print("- tuned_xgboost.joblib")
print("- best_model_for_deployment.joblib")
# class_distribution.png is written by the class-distribution cell but was missing from this list.
print("- class_distribution.png")
print("- model_comparison.png")
print("- [model_name]_confusion_matrix.png (for each model)")

print("\n✨ Model training and evaluation complete! ✨")