In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# Use the current working directory as the project root; when the kernel was
# started inside a folder named "notebooks", step up to its parent instead.
ROOT_DIR = Path.cwd().resolve()
if ROOT_DIR.name == "notebooks":
    ROOT_DIR = ROOT_DIR.parent

# All input files live under <ROOT_DIR>/data (defined once).
DATA_PATH = ROOT_DIR / "data"

X_TRAIN_PATH = DATA_PATH / "processed" / "X_train_clean.csv"
X_TEST_PATH  = DATA_PATH / "processed" / "X_test_clean.csv"
Y_TRAIN_PATH = DATA_PATH / "raw" / "y_train.csv"

print("ROOT_DIR:", ROOT_DIR)
print("DATA_PATH:", DATA_PATH)

# Report (without failing) whether each expected input file is present.
for p in [X_TRAIN_PATH, X_TEST_PATH, Y_TRAIN_PATH]:
    print(f"{p} -> exists={p.exists()}")


In [None]:
# Number of rows per chunk; passed as `chunksize` to the streaming helpers
# below, which forward it to pd.read_csv.
CHUNKSIZE = 200_000

def read_header_columns(csv_path: Path):
    """Return the column names of a CSV file as a list.

    The file is parsed with ``nrows=0``, so only the header is turned into
    a (zero-row) frame and no data rows are loaded.
    """
    header_only = pd.read_csv(csv_path, nrows=0)
    return header_only.columns.tolist()

def detect_id_column(columns):
    """Pick the identifier column among ``columns``.

    The names "Unnamed: 0", "ID" and "id" are tried in that order; the first
    one present in ``columns`` is returned, or ``None`` when none is present.
    """
    preferred = ("Unnamed: 0", "ID", "id")
    return next((name for name in preferred if name in columns), None)

def detect_year_column(columns):
    """Find the column holding the year among ``columns``.

    An exact "YEAR" column wins. Otherwise the column names are compared
    case-insensitively against "year", "annee" and "année", and the original
    spelling of the first match is returned.

    Args:
        columns: iterable of column names (strings).

    Returns:
        The matching column name as it appears in ``columns``, or ``None``
        when no candidate matches.
    """
    if "YEAR" in columns:
        return "YEAR"
    by_lower = {c.lower(): c for c in columns}
    # "ann\u00e9e" ("année") is spelled with an escape so the literal survives
    # a mis-decoded save of this file; the previous literal was the mojibake
    # form of the word and could never equal a lower-cased column name.
    for cand in ("year", "annee", "ann\u00e9e"):
        if cand in by_lower:
            return by_lower[cand]
    return None

def detect_target_column(columns):
    """Pick the target column among ``columns``.

    The names "MathScore", "target" and "y" are tried in that order. When
    none is present, the identifier column (as found by ``detect_id_column``)
    is set aside and, if exactly one column remains, that column is returned.
    In every other case the result is ``None``.
    """
    known_names = ("MathScore", "target", "y")
    direct_hit = next((name for name in known_names if name in columns), None)
    if direct_hit is not None:
        return direct_hit

    # Fallback: the only column that is not the identifier must be the target.
    id_name = detect_id_column(columns)
    remaining = [name for name in columns if name != id_name]
    return remaining[0] if len(remaining) == 1 else None

# Discover the column roles from the CSV headers only (no data rows loaded).
x_cols = read_header_columns(X_TRAIN_PATH)
ID_COL_X = detect_id_column(x_cols)
YEAR_COL = detect_year_column(x_cols)

y_cols = read_header_columns(Y_TRAIN_PATH)
ID_COL_Y = detect_id_column(y_cols)
TARGET_COL = detect_target_column(y_cols)

# Fail early, and say which file/columns were inspected, when a role could
# not be resolved: every later cell depends on these four names.
assert YEAR_COL is not None, f"No year column detected in {X_TRAIN_PATH}; columns: {x_cols}"
assert TARGET_COL is not None, f"No target column detected in {Y_TRAIN_PATH}; columns: {y_cols}"
assert ID_COL_X is not None, f"No ID column detected in {X_TRAIN_PATH}; columns: {x_cols}"
assert ID_COL_Y is not None, f"No ID column detected in {Y_TRAIN_PATH}; columns: {y_cols}"


In [None]:
def count_rows_by_year(csv_path: Path, year_col: str, chunksize: int):
    """Count the rows of a CSV file per value of ``year_col``, chunk by chunk.

    Only ``year_col`` is read from the file. Rows whose year is missing are
    counted too, under a single NaN label.

    Args:
        csv_path: CSV file to scan.
        year_col: name of the column holding the year.
        chunksize: number of rows read per chunk.

    Returns:
        An integer ``pd.Series`` indexed by year value, sorted by index.
    """
    # Every chunk yields a fresh NaN float object, and NaN never compares
    # equal to itself, so each chunk's NaN would become a separate dict key.
    # Folding all missing values onto one shared object keeps a single entry
    # (dict lookup matches it by identity).
    nan_key = float("nan")
    counts = {}
    # The context manager closes the file even if a chunk fails to parse.
    with pd.read_csv(csv_path, usecols=[year_col], chunksize=chunksize) as reader:
        for chunk in reader:
            chunk_counts = chunk[year_col].value_counts(dropna=False)
            for year, n_rows in chunk_counts.items():
                key = nan_key if pd.isna(year) else year
                counts[key] = counts.get(key, 0) + int(n_rows)
    # Explicit dtype keeps an empty result integer-typed as well.
    return pd.Series(counts, dtype="int64").sort_index()

# Per-year row counts of each split, streamed from disk.
train_year_counts = count_rows_by_year(
    csv_path=X_TRAIN_PATH, year_col=YEAR_COL, chunksize=CHUNKSIZE
)
test_year_counts = count_rows_by_year(
    csv_path=X_TEST_PATH, year_col=YEAR_COL, chunksize=CHUNKSIZE
)

# One row per year seen in either split; a year missing from a split shows 0.
df_counts = pd.DataFrame(
    {"n_rows_train": train_year_counts, "n_rows_test": test_year_counts}
)
df_counts = df_counts.fillna(0).astype(int)

display(df_counts)
print("train years:", train_year_counts.index.tolist())
print("test years:", test_year_counts.index.tolist())


In [None]:
def _add_value_counts(totals: dict, values: pd.Series, nan_key: float) -> None:
    """Add the per-value counts of ``values`` into ``totals`` (in place), mapping every missing value to ``nan_key``."""
    for value, n_rows in values.value_counts(dropna=False).items():
        key = nan_key if pd.isna(value) else value
        totals[key] = totals.get(key, 0) + int(n_rows)


def zero_rate_by_year_train(
    x_path: Path,
    y_path: Path,
    year_col: str,
    id_col_x: str,
    id_col_y: str,
    target_col: str,
    chunksize: int,
    id_check_rows: int = 50_000
):
    """Compute, per year, the share of training rows whose target equals 0.

    The feature file and the target file are streamed in parallel with the
    same chunk size and paired row by row (by position, not by ID).

    Args:
        x_path: CSV with the features; only ``year_col`` and ``id_col_x`` are read.
        y_path: CSV with the targets; only ``id_col_y`` and ``target_col`` are read.
        year_col: year column in the feature file.
        id_col_x: identifier column in the feature file.
        id_col_y: identifier column in the target file.
        target_col: target column; values are coerced to numeric and
            non-numeric entries become NaN (hence never count as zero).
        chunksize: number of rows read per chunk from each file.
        id_check_rows: number of leading rows of the first chunk whose
            identifiers are compared between the two files.

    Returns:
        ``(total_s, zero_s, rate_s)``: three Series sharing the same sorted
        year index, holding the row count, the zero-target count and their
        ratio. Rows with a missing year are grouped under a single NaN label.

    Raises:
        ValueError: if the identifiers checked in the first chunk differ, or
            if the two files do not contain the same number of rows.
    """
    # One shared NaN object: each chunk produces a new NaN float, and since
    # NaN != NaN they would otherwise pile up as distinct dict keys.
    nan_key = float("nan")
    total_by_year = {}
    zero_by_year = {}

    # Context managers close both files even when an error is raised below.
    with pd.read_csv(x_path, usecols=[year_col, id_col_x], chunksize=chunksize) as x_iter:
        with pd.read_csv(y_path, usecols=[id_col_y, target_col], chunksize=chunksize) as y_iter:
            first = True
            while True:
                x_chunk = next(x_iter, None)
                y_chunk = next(y_iter, None)
                if x_chunk is None and y_chunk is None:
                    break
                # Positional pairing is only meaningful when both files have
                # the same number of rows; zip() used to truncate silently.
                if x_chunk is None or y_chunk is None or len(x_chunk) != len(y_chunk):
                    raise ValueError("Row counts differ between X_train_clean and y_train (chunked).")

                if first:
                    ncheck = min(len(x_chunk), len(y_chunk), id_check_rows)
                    if ncheck > 0:
                        if not np.array_equal(
                            x_chunk[id_col_x].to_numpy()[:ncheck],
                            y_chunk[id_col_y].to_numpy()[:ncheck],
                        ):
                            raise ValueError("IDs not aligned between X_train_clean and y_train (chunked).")
                    first = False

                years = x_chunk[year_col]
                _add_value_counts(total_by_year, years, nan_key)

                y_vals = pd.to_numeric(y_chunk[target_col], errors="coerce")
                # Plain boolean array: select by row position, independent of
                # the chunks' index labels.
                is_zero = y_vals.eq(0).to_numpy()
                _add_value_counts(zero_by_year, years[is_zero], nan_key)

    total_s = pd.Series(total_by_year, dtype="int64").sort_index().astype(int)
    # float64 so that years without any zero can be NaN before being filled.
    zero_s  = pd.Series(zero_by_year, dtype="float64").reindex(total_s.index).fillna(0).astype(int)
    rate_s  = (zero_s / total_s).astype(float)
    return total_s, zero_s, rate_s

# Stream the training features and targets once to get, per year, the row
# count, the number of zero targets and the resulting zero rate.
total_by_year, zero_by_year, zero_rate = zero_rate_by_year_train(
    X_TRAIN_PATH,
    Y_TRAIN_PATH,
    YEAR_COL,
    ID_COL_X,
    ID_COL_Y,
    TARGET_COL,
    chunksize=CHUNKSIZE,
)

df_zero = pd.DataFrame(
    dict(
        n_rows_train=total_by_year,
        n_zero_train=zero_by_year,
        zero_rate_train=zero_rate,
    )
)

display(df_zero)


In [None]:
# Recap per training year: train size, zero rate, and the test row count for
# the same year (0 when that year does not occur in the test counts).
summary = pd.DataFrame(
    {
        "n_rows_train": total_by_year,
        "zero_rate_train": zero_rate,
        "n_rows_test": (
            test_year_counts
            .reindex(total_by_year.index)
            .fillna(0)
            .astype(int)
        ),
    }
)

display(summary)

# Gap between the highest and the lowest yearly zero rate.
spread = float(
    summary["zero_rate_train"].max() - summary["zero_rate_train"].min()
)
print("zero_rate spread:", round(spread, 4))
