# Notebook 03 – Modeling
Student Retention Capstone – Harshitha Koppala

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Read the cleaned student table from the project's data folder and
# show its first rows as the cell output.
df = pd.read_csv(filepath_or_buffer="../data/student_data_clean.csv")
df.head(n=5)

In [None]:
# The "Dropout" column is the label; every other column is a predictor.
y = df["Dropout"]

# Expand categorical predictors into indicator columns, dropping the first
# level of each so the indicators are not redundant.
X = pd.get_dummies(df.drop(columns="Dropout"), drop_first=True)

# Hold out 40% of the rows, then halve that pool: 60% train / 20% val / 20% test.
# Both cuts are stratified on the label and use the same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [None]:
# Standardize every feature column. The scaler's statistics are learned from
# the training split only and then reused unchanged for validation and test.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
def evaluate(model, X, y):
    """Score a fitted classifier on one data split.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    X
        Feature matrix handed to the model.
    y
        True labels aligned with the rows of ``X``.

    Returns
    -------
    dict
        Mapping with the keys ``"AUC"``, ``"Precision"`` and ``"Recall"``.
    """
    # Hard class predictions feed precision/recall.
    preds = model.predict(X)
    # Column 1 of predict_proba is used as the ranking score for AUC.
    prob = model.predict_proba(X)[:, 1]
    return {
        "AUC": roc_auc_score(y, prob),
        "Precision": precision_score(y, preds),
        # Recall needs recall_score; precision_score here would just repeat
        # the precision value under the wrong label.
        "Recall": recall_score(y, preds),
    }

In [None]:
# Baseline: logistic regression capped at 300 solver iterations, fitted on
# the training split and scored on the validation split.
logreg = LogisticRegression(max_iter=300).fit(X_train, y_train)
logreg_results = evaluate(model=logreg, X=X_val, y=y_val)
logreg_results

In [None]:
# Random forest with 300 trees and a fixed seed, fitted on the training
# split and scored on the validation split.
rf = RandomForestClassifier(random_state=42, n_estimators=300).fit(X_train, y_train)
rf_results = evaluate(model=rf, X=X_val, y=y_val)
rf_results

In [None]:
# Gradient boosting with library-default hyperparameters and a fixed seed,
# fitted on the training split and scored on the validation split.
gb = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
gb_results = evaluate(model=gb, X=X_val, y=y_val)
gb_results

In [None]:
# Side-by-side validation metrics: one column per model, one row per metric.
model_summary = pd.DataFrame(
    dict(
        zip(
            ("Logistic Regression", "Random Forest", "Gradient Boosting"),
            (logreg_results, rf_results, gb_results),
        )
    )
)
model_summary