# Notebook 01 – Data Exploration
Student Retention Capstone – Harshitha Koppala

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's white-grid theme to every plot drawn after this cell.
# `sns.set` is only an alias for `sns.set_theme` (the preferred interface,
# available since seaborn 0.11) and may be removed in a future release.
sns.set_theme(style="whitegrid")

# Load Dataset (update path/name if needed)
# The path is relative, so it resolves against the current working directory.
DATA_PATH = "../data/student_data_raw.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows; kept as the final expression so the notebook
# renders it as the cell's output.
df.head()

In [None]:
# Basic Structure
# Print the (rows, columns) tuple, then let `info()` print its per-column
# overview of the frame.
print(df.shape)
df.info()

# Descriptive statistics across all columns (numeric and non-numeric);
# kept as the final expression so the notebook renders the table.
summary_stats = df.describe(include="all")
summary_stats

In [None]:
# Missing Values Heatmap
# Boolean mask of missing cells, computed once and reused for both the
# heatmap and the per-column totals below.
missing_mask = df.isnull()

fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(missing_mask, cbar=False, ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

# Number of missing entries per column (rendered as the cell's output).
missing_mask.sum()

In [None]:
# Univariate – numeric
# Restrict the frame to its numeric columns. The column index is kept in
# `numeric_cols` because the correlation cell further down reads it.
numeric_frame = df.select_dtypes(include=np.number)
numeric_cols = numeric_frame.columns

# One histogram per numeric column, all on a single figure with a shared
# super-title.
numeric_frame.hist(figsize=(12, 10))
plt.suptitle("Numeric Feature Distributions")
plt.show()

In [None]:
# Univariate – categorical
# Every non-numeric column is treated as categorical here.
cat_cols = df.select_dtypes(exclude=np.number).columns

# Upper bound on the number of bars per chart. Non-numeric columns can
# include identifiers or free text with a distinct value on almost every
# row; drawing one bar per value is slow and produces an unreadable plot.
MAX_BARS = 20

for col in cat_cols:
    # value_counts() sorts by descending frequency (missing values are
    # excluded by default), so head() keeps the most common categories.
    counts = df[col].value_counts()
    n_categories = len(counts)

    plot_title = f"Distribution of {col}"
    if n_categories > MAX_BARS:
        counts = counts.head(MAX_BARS)
        # Make the truncation visible so the chart is not misread as complete.
        plot_title += f" (top {MAX_BARS} of {n_categories})"

    fig, ax = plt.subplots(figsize=(6, 4))
    counts.plot(kind="bar", ax=ax)
    ax.set_title(plot_title)
    plt.show()

In [None]:
# Bivariate – GPA vs Dropout
# Each entry pairs a GPA column with the title of its boxplot; every plot
# groups that column by the "Dropout" column on the x-axis.
gpa_boxplots = [
    ("First_sem_gpa", "First Semester GPA by Dropout Status"),
    ("Prior_GPA", "Prior GPA by Dropout Status"),
]

for gpa_col, plot_title in gpa_boxplots:
    sns.boxplot(data=df, x="Dropout", y=gpa_col)
    plt.title(plot_title)
    # Show inside the loop so each boxplot is emitted as its own figure.
    plt.show()

In [None]:
# Correlation Heatmap
# Pairwise correlations between the numeric columns; `numeric_cols` comes
# from the numeric univariate cell earlier in the notebook.
corr_matrix = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(12, 8))
# annot=True writes each coefficient inside its cell.
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()