In [1]:
# Synthetic Student Dropout Dataset (1,000 rows)
# - Demographic, academic, financial variables + binary target `dropout`
# - Built-in NULLS (e.g., 10% in first_sem_gpa) and OUTLIERS (age 80/100, hs_gpa >5, admission_score >100 or negative)
# - Dropout label from a logistic data-generating process with plausible signs

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# -----------------------------
# 1) Data generation function
# -----------------------------
def generate_dropout_dataset(N=1000, seed=7):
    rng = np.random.default_rng(seed)

    # Demographics
    age = rng.integers(16, 35, size=N).astype(float)                 # 16–34 (we'll add outliers later)
    gender = rng.choice(["Female","Male"], size=N, p=[0.48, 0.52])
    origin = rng.choice(["Urban","Rural"], size=N, p=[0.72, 0.28])

    # Academics
    hs_gpa = np.clip(rng.normal(3.6, 0.6, size=N), 1.0, 5.0)         # 0–5 scale in practice
    admission_score = np.clip(rng.normal(70, 12, size=N), 20, 100)   # 0–100 exam-like score
    first_sem_gpa = np.clip(rng.normal(3.2, 0.8, size=N), 0.0, 5.0)  # 0–5 (10% nulls later)

    # Financial
    socioeconomic_level = rng.choice([1,2,3,4,5,6], size=N, p=[0.10,0.18,0.28,0.22,0.16,0.06])
    scholarship = rng.choice(["Yes","No"], size=N, p=[0.35, 0.65])
    loan = rng.choice(["Yes","No"], size=N, p=[0.42, 0.58])
    financial_aid = rng.choice(["Yes","No"], size=N, p=[0.22, 0.78])

    # ---- LOGISTIC DGP for dropout (computed BEFORE injecting nulls/outliers) ----
    sex_n = (gender=="Female").astype(int)     # protective
    origin_n = (origin=="Rural").astype(int)   # slightly riskier
    ses_n = socioeconomic_level.astype(float)

    z = (
        -0.2
        - 0.7*(hs_gpa - 3.0)                   # lower HS GPA -> higher risk
        - 0.5*((admission_score - 60)/20.0)    # lower admission score -> higher risk
        - 1.2*((first_sem_gpa - 2.5)/1.0)      # low first-sem GPA is strong risk
        - 0.15*sex_n                            # female slightly lower risk
        + 0.25*origin_n                         # rural slightly higher risk
        - 0.35*(scholarship=="Yes")             # scholarship protective
        + 0.20*(loan=="Yes")                    # loan stress increases risk
        - 0.25*(financial_aid=="Yes")           # aid protective
        + 0.18*(3 - np.clip(ses_n, 1, 6))       # lower SES -> higher risk
        + 0.05*((age - 18)/5.0)                 # older slightly higher risk
    )
    p = 1/(1 + np.exp(-z))
    dropout = (rng.uniform(size=N) < p).astype(int)

    # ---- Build DataFrame ----
    df = pd.DataFrame({
        "student_id": np.arange(1, N+1),
        "age": age,
        "gender": gender,
        "origin": origin,
        "hs_gpa": hs_gpa,
        "admission_score": admission_score,
        "first_sem_gpa": first_sem_gpa,
        "socioeconomic_level": socioeconomic_level,
        "scholarship": scholarship,
        "loan": loan,
        "financial_aid": financial_aid,
        "dropout": dropout
    })

    # -----------------------------
    # 2) Inject OUTLIERS
    # -----------------------------
    # Ages: add unrealistic high and negative values
    oi = rng.choice(N, size=18, replace=False)
    df.loc[oi[:6], "age"] = [60, 15, 80, -3, 100, 2]  # includes 80, 100, negative, and too-low ages

    # hs_gpa: values > 5 and negative
    df.loc[oi[6:12], "hs_gpa"] = [7.0, -1.0, 6.5, 5.8, 0.0, 9.0]

    # admission_score: >100 and negative values
    df.loc[oi[12:], "admission_score"] = [130, -10, 125, 140, 200, -25]

    # -----------------------------
    # 3) Inject NULLS (MAR/MNAR mix)
    # -----------------------------
    def inject_nulls(series, frac, rng_local):
        s = series.copy()
        k = max(1, int(frac * len(s)))
        idx = rng_local.choice(s.index, size=k, replace=False)
        s.loc[idx] = np.nan
        return s

    rng_nulls = np.random.default_rng(seed + 123)

    df["age"] = inject_nulls(df["age"], 0.05, rng_nulls)
    df["hs_gpa"] = inject_nulls(df["hs_gpa"], 0.03, rng_nulls)
    df["admission_score"] = inject_nulls(df["admission_score"], 0.02, rng_nulls)
    df["first_sem_gpa"] = inject_nulls(df["first_sem_gpa"], 0.10, rng_nulls)  # 10% as requested
    df["socioeconomic_level"] = inject_nulls(df["socioeconomic_level"], 0.02, rng_nulls)
    df["scholarship"] = inject_nulls(df["scholarship"], 0.01, rng_nulls)
    df["loan"] = inject_nulls(df["loan"], 0.01, rng_nulls)
    df["financial_aid"] = inject_nulls(df["financial_aid"], 0.01, rng_nulls)

    return df

# -----------------------------
# Run + Save + Sanity checks
# -----------------------------
df = generate_dropout_dataset(N=1000, seed=7)
out_path = "/content/dropout_synthetic.csv"
df.to_csv(out_path, index=False)

print("Saved:", out_path)
print("Shape:", df.shape)
print("\nNull % by column:")
print((df.isnull().mean()*100).round(2).sort_values(ascending=False))

print("\nDropout rate (mean):", df["dropout"].mean().round(3))

# Simple outlier counts after injection (ignoring nulls)
age_out = df["age"].dropna().pipe(lambda s: ((s < 0) | (s > 50)).sum())
gpa_out = df["hs_gpa"].dropna().pipe(lambda s: ((s < 0) | (s > 5)).sum())
adm_out = df["admission_score"].dropna().pipe(lambda s: ((s < 0) | (s > 100)).sum())
print(f"\nOutlier counts → age: {age_out}, hs_gpa: {gpa_out}, admission_score: {adm_out}")

# Peek at the first 5 rows
df.head()


Saved: /content/dropout_synthetic.csv
Shape: (1000, 12)

Null % by column:
first_sem_gpa          10.0
age                     5.0
hs_gpa                  3.0
admission_score         2.0
socioeconomic_level     2.0
scholarship             1.0
financial_aid           1.0
loan                    1.0
origin                  0.0
gender                  0.0
student_id              0.0
dropout                 0.0
dtype: float64

Dropout rate (mean): 0.2

Outlier counts → age: 4, hs_gpa: 5, admission_score: 6


Unnamed: 0,student_id,age,gender,origin,hs_gpa,admission_score,first_sem_gpa,socioeconomic_level,scholarship,loan,financial_aid,dropout
0,1,,Female,Urban,2.337818,58.76494,3.146225,3.0,No,Yes,Yes,1
1,2,27.0,Male,Rural,3.196993,91.324946,4.05368,3.0,Yes,Yes,No,0
2,3,28.0,Male,Urban,2.384379,65.592479,2.935686,2.0,No,No,Yes,0
3,4,33.0,Female,Urban,3.640775,53.674319,2.861005,2.0,No,No,Yes,0
4,5,26.0,Male,Urban,3.602722,70.053589,4.943606,3.0,No,No,No,0
