# Notebook 02 – Data Cleaning
Student Retention Capstone – Harshitha Koppala

In [None]:
import pandas as pd
import numpy as np

# Read the raw student records into a DataFrame
# (adjust the file name/path below if the dataset lives elsewhere).
df = pd.read_csv(filepath_or_buffer="../data/student_data_raw.csv")
# Preview the first five rows.
df.head(n=5)

In [None]:
# Discard rows that exactly repeat an earlier row; the first occurrence is kept.
df = df[~df.duplicated()]

In [None]:
# Remove impossible outliers
# Each column is paired with the largest value accepted as plausible.
# Missing values are kept on purpose: a bare `<=` comparison evaluates to
# False for NaN, which would silently drop those rows before the
# missing-value handling gets a chance to impute them.
# NOTE(review): only upper bounds are enforced; negative ages/GPAs pass
# through — confirm whether a lower bound of 0 should be added.
upper_limits = {
    "Age_at_enrollment": 100,
    "First_sem_gpa": 4.0,
    "Prior_GPA": 4.0,
}
for col, limit in upper_limits.items():
    df = df[df[col].isna() | (df[col] <= limit)]

In [None]:
# Handle missing values
# The filled column is assigned back to df rather than calling
# fillna(inplace=True) on df[col]: that chained form acts on an intermediate
# object, is deprecated in pandas 2.x, and leaves df unchanged under
# Copy-on-Write.

# Numeric columns: impute with the column median.
num_cols = df.select_dtypes(include=np.number).columns
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Non-numeric columns: impute with the most frequent value.
cat_cols = df.select_dtypes(exclude=np.number).columns
for col in cat_cols:
    modes = df[col].mode()
    # mode() is empty when every value in the column is missing; there is
    # nothing to impute with in that case, so the column is left as-is.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

In [None]:
# Feature engineering
# gpa_delta: first-semester GPA minus prior GPA (positive = improvement).
df["gpa_delta"] = df["First_sem_gpa"].sub(df["Prior_GPA"])

# financial_stress: 1 when tuition is still owed AND there is no scholarship,
# otherwise 0.
# NOTE(review): assumes Scholarship_status is encoded numerically with
# 0 meaning "no scholarship" — confirm against the raw data.
owes_tuition = df["tuition_balance"] > 0
lacks_scholarship = df["Scholarship_status"] == 0
df["financial_stress"] = np.where(owes_tuition & lacks_scholarship, 1, 0)

# attendance_rate_category
# Buckets: Low = [0, 70], Medium = (70, 85], High = (85, 100].
# include_lowest=True closes the first interval on the left so that a rate
# of exactly 0 is labelled "Low" instead of becoming NaN.
# Assumes Attendance_rate is on a 0-100 percentage scale — TODO confirm;
# values outside the bin edges are left as NaN by pd.cut.
df["attendance_rate_category"] = pd.cut(
    df["Attendance_rate"],
    bins=[0, 70, 85, 100],
    labels=["Low", "Medium", "High"],
    include_lowest=True,
)

# credit_completion_ratio
# Fraction of attempted credits that were earned. A zero denominator is
# masked to NaN first so students with no attempted credits get an
# undefined (NaN) ratio rather than +/-inf being written to the cleaned data.
attempted = df["credits_attempted"].replace(0, np.nan)
df["credit_completion_ratio"] = df["credits_earned"] / attempted

In [None]:
# Save cleaned dataset
# index=False keeps the DataFrame index out of the CSV.
df.to_csv("../data/student_data_clean.csv", index=False)

# Summarise the cleaned frame. info() prints its own report, whereas head()
# is only rendered when it is the final expression of a notebook cell, so it
# must come last to be displayed.
df.info()
df.head()